From 2ffb337183f8d970c8b6eca002963061f48afba6 Mon Sep 17 00:00:00 2001 From: Zuza Gawrysiak Date: Sun, 22 May 2022 16:35:19 +0200 Subject: [PATCH 001/109] Quantize elementwise sub (#42854) * Add elementwise_sub quantization * Remove unnecessary comments * Specify names for tests * Remove comments * Remove comments leftovers --- .../framework/ir/graph_pattern_detector.h | 2 +- .../framework/ir/mkldnn/cpu_quantize_pass.cc | 1 + .../ir/mkldnn/cpu_quantize_pass_tester.cc | 60 +++++++++---------- .../ir/mkldnn/cpu_quantize_placement_pass.cc | 7 ++- .../int8_scale_calculation_mkldnn_pass.h | 4 -- 5 files changed, 36 insertions(+), 38 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index cab8f82660d90..3c6b6ce94e23f 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1057,7 +1057,7 @@ struct Pool : public PatternBase { }; // Elementwise ops -// Forward pass for element-wise operators (add, mul) +// Forward pass for element-wise operators // elementwise_out is the result of the operator struct Elementwise : public PatternBase { Elementwise(PDPattern* pattern, const std::string& name_scope) diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 4aae60b853d4f..a61c043b58065 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -1188,6 +1188,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeMatmul(graph); QuantizeElementwise(graph, "elementwise_add"); QuantizeElementwise(graph, "elementwise_mul"); + QuantizeElementwise(graph, "elementwise_sub"); QuantizeFusionGru(graph); QuantizeMultiGru(graph); QuantizeFusionLSTM(graph); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 22000865948d6..912c16288c2b9 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -90,7 +90,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetAttr("Scale_x", 1.0f); op->SetAttr("Scale_y", 1.0f); op->SetAttr("Scale_out", 1.0f); - } else if (type == "elementwise_add" || type == "elementwise_mul") { + } else if (type == "elementwise_add" || type == "elementwise_mul" || + type == "elementwise_sub") { op->SetInput("X", {inputs[0]}); if (inputs.size() > 1) op->SetInput("Y", {inputs[1]}); op->SetOutput("Out", {outputs[0]}); @@ -168,7 +169,7 @@ void CheckScales(const OpDesc* op, float scale, float shift) { scale_names.push_back("Scale_in"); scale_names.push_back("Scale_out"); } else if (type == "matmul" || type == "elementwise_add" || - type == "elementwise_mul") { + type == "elementwise_mul" || type == "elementwise_sub") { scale_names.push_back("Scale_x"); scale_names.push_back("Scale_y"); scale_names.push_back("Scale_out"); @@ -565,60 +566,59 @@ ProgramDesc BuildProgramDescElementwise(const std::string elementwise_type, return prog; } -void TestElementwise(const std::string elementwise_type, - const std::string elementwise_name) { +void TestElementwise(std::vector elementwise) { // 2 Quant + 2 IN + 1 DeQuant + 1 OUT int added_nodes = 6; std::unordered_map expected_operators = { - {elementwise_type, 1}, {"quantize", 2}, {"dequantize", 3}}; - MainTest(BuildProgramDescElementwise(elementwise_type, 
elementwise_name), + {elementwise[0], 1}, {"quantize", 2}, {"dequantize", 3}}; + MainTest(BuildProgramDescElementwise(elementwise[0], elementwise[1]), variable_names_elementwise, expected_operators, added_nodes, SCALE * S8_MAX); } -void TestElementwiseOutputScaleMissing(const std::string elementwise_type, - const std::string elementwise_name) { +void TestElementwiseOutputScaleMissing(std::vector elementwise) { int added_nodes = 0; std::unordered_map expected_operators = { - {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + {elementwise[0], 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise[0], elementwise[1]), variable_names_elementwise, expected_operators, added_nodes, 1.f, 1.f, "e"); } -void TestElementwiseUnsignedAndSignedInput(const std::string elementwise_type, - const std::string elementwise_name) { +void TestElementwiseUnsignedAndSignedInput( + std::vector elementwise) { int added_nodes = 0; std::unordered_map expected_operators = { - {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + {elementwise[0], 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise[0], elementwise[1]), variable_names_elementwise, expected_operators, added_nodes, 1.f, 1.f, "", "b"); } -TEST(CpuQuantizePass, elementwise_add) { - TestElementwise("elementwise_add", "ElementwiseAdd"); -} +const std::vector> elementwises = { + {"elementwise_add", "ElementwiseAdd"}, + {"elementwise_mul", "ElementwiseMul"}, + {"elementwise_sub", "ElementwiseSub"}}; -TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { - TestElementwiseOutputScaleMissing("elementwise_add", "ElementwiseAdd"); -} +class TestElementwises + : public testing::TestWithParam> {}; -TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { - TestElementwiseUnsignedAndSignedInput("elementwise_add", "ElementwiseAdd"); -} +TEST_P(TestElementwises, elementwise_basic) { TestElementwise(GetParam()); } -TEST(CpuQuantizePass, elementwise_mul) { - TestElementwise("elementwise_mul", "ElementwiseMul"); +TEST_P(TestElementwises, elementwise_output_scale_missing) { + TestElementwiseOutputScaleMissing(GetParam()); } -TEST(CpuQuantizePass, elementwise_mul_output_scale_missing) { - TestElementwiseOutputScaleMissing("elementwise_mul", "ElementwiseMul"); +TEST_P(TestElementwises, elementwise_unsigned_and_signed_input) { + TestElementwiseUnsignedAndSignedInput(GetParam()); } -TEST(CpuQuantizePass, elementwise_mul_unsigned_and_signed_input) { - TestElementwiseUnsignedAndSignedInput("elementwise_mul", "ElementwiseMul"); -} +INSTANTIATE_TEST_CASE_P( + Elementwises, TestElementwises, testing::ValuesIn(elementwises), + [](const ::testing::TestParamInfo& info) { + std::string name = info.param[0]; + return name; + }); const std::vector churn_out_vars(ProgramDesc* prog, const std::string& prefix, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index 3b883dac9782a..5b606a89ac90a 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -27,9 +27,10 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { std::unordered_set supported_op_types = std::unordered_set( {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", - 
"elementwise_mul", "fc", "matmul", "nearest_interp", - "nearest_interp_v2", "pool2d", "prior_box", "reshape2", "transpose2", - "fusion_gru", "fusion_lstm", "multi_gru", "slice"}); + "elementwise_mul", "elementwise_sub", "fc", "matmul", + "nearest_interp", "nearest_interp_v2", "pool2d", "prior_box", + "reshape2", "transpose2", "fusion_gru", "fusion_lstm", "multi_gru", + "slice"}); const auto& excluded_ids_list = Get>("quantize_excluded_op_ids"); const auto& op_types_list = diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h index 9233650a2db3c..383c4f40fc03d 100644 --- a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h @@ -14,10 +14,6 @@ #pragma once -// #include -// #include -// #include - #include "paddle/fluid/framework/ir/fuse_pass_base.h" namespace paddle { From 106083aa5f9641af029f7a678533cfb494a1c236 Mon Sep 17 00:00:00 2001 From: shixingbo <90814748+bmb0537@users.noreply.github.com> Date: Mon, 23 May 2022 10:42:02 +0800 Subject: [PATCH 002/109] Fix a bug in BroadcastConfig for KP XPU2 rec model (#42866) --- .../phi/kernels/primitive/datamover_primitives_xpu2.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h index 3799b9d4892f8..1e5dfe2a542b0 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h @@ -70,6 +70,7 @@ struct BroadcastConfig { int strides_out[phi::DDim::kMaxRank]; int in_dim[phi::DDim::kMaxRank]; int dim_after_cmp[phi::DDim::kMaxRank]; + int y_dim_after_cmp[phi::DDim::kMaxRank]; int dim_size_after_cmp = 0; int cmp_res = 0; OptType cmp_type = OptType::CanNotOptimize; @@ -82,7 +83,7 @@ struct BroadcastConfig { HOSTDEVICE BroadcastConfig(const std::vector& out_dims, const std::vector& in_dims, - const std::vector& another_in_dims, + const std::vector& y_in_dims, int dim_size) { std::vector strides_in_tmp; std::vector strides_out_tmp; @@ -103,8 +104,8 @@ struct BroadcastConfig { memcpy(strides_out, strides_out_tmp.data(), kDims * sizeof(int)); memcpy(in_dim, dim_tmp.data(), kDims * sizeof(int)); - cmp_res = get_mnk_for_broadcast_ops(in_dims, another_in_dims); - get_opt_type(another_in_dims); + cmp_res = get_mnk_for_broadcast_ops(in_dims, y_in_dims); + get_opt_type(); buf_len = get_buf_len(); } @@ -154,7 +155,7 @@ struct BroadcastConfig { return index_src; } - void get_opt_type(const std::vector& y_dim_after_cmp) { + void get_opt_type() { if (dim_size_after_cmp == 1) { if (dim_after_cmp[0] == 1 && y_dim_after_cmp[0] != 1) { // {1} op {n} n = y_dim_after_cmp[0]; @@ -241,6 +242,7 @@ struct BroadcastConfig { int cmp_x = 0; int cmp_y = 0; bool is_same = false; + std::vector xshape_after_remove_ones = xshape; std::vector yshape_after_remove_ones = yshape; // first step: remove excess ones @@ -275,6 +277,7 @@ struct BroadcastConfig { } idx = idx + 1; dim_after_cmp[after_cmp_idx] = cmp_x; + y_dim_after_cmp[after_cmp_idx] = cmp_y; after_cmp_idx++; if (idx == xshape_after_remove_ones.size()) { dim_size_after_cmp = after_cmp_idx; From 9827c8b58b8cac88ae0db47aa193891f221ce5cb Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Mon, 23 May 2022 10:46:33 +0800 Subject: [PATCH 003/109] improve error info when no sample code found 
(#42742) * test=document_fix * exit 1 if no sample code found since api must have sample code;test=document_fix * test normal input;test=document_fix * delete test code;test=document_fix --- tools/sampcd_processor.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index 2d8692c5bc7e5..13005350d7bd5 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -339,8 +339,10 @@ def sampcd_extract_to_file(srccom, name, htype="def", hname=""): Please use '.. code-block:: python' to format the sample code.""") return [] else: - logger.warning("Error: No sample code!") - return [] + logger.error( + "Error: No sample code found! Please check if the API comment contais string 'Examples:' correctly" + ) + exit(1) sample_code_filenames = [] for y, cb in enumerate(codeblocks): From 2b4977f20cbe962599c55ab57c99f0c2043bf478 Mon Sep 17 00:00:00 2001 From: pangyoki Date: Mon, 23 May 2022 11:19:54 +0800 Subject: [PATCH 004/109] fix final_state_linear (#42820) --- paddle/fluid/pybind/eager_custom_python_api.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/pybind/eager_custom_python_api.h b/paddle/fluid/pybind/eager_custom_python_api.h index c509ab5674930..99ec4212918de 100644 --- a/paddle/fluid/pybind/eager_custom_python_api.h +++ b/paddle/fluid/pybind/eager_custom_python_api.h @@ -65,7 +65,7 @@ static PyObject *eager_api_final_state_linear(PyObject *self, PyObject *args, if (bias.initialized()) { auto mm_out = matmul_final_state_dygraph_function(x, weight, false, false); - auto out = add_final_state_dygraph_function(bias, mm_out); + auto out = add_final_state_dygraph_function(mm_out, bias); PyEval_RestoreThread(tstate); tstate = nullptr; return ToPyObject(out); From 3b488baea74edbffe895be7b42801edab57513ec Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Mon, 23 May 2022 11:48:42 +0800 Subject: [PATCH 005/109] remove is_init_py of RandomGenerator, and use Global RandomGenerator by default (#42876) * remove is_init_py of RandomGenerator, and use Global Generator if not OP seed * fix comment --- paddle/fluid/framework/generator.cc | 34 ++++-------------- paddle/fluid/framework/generator.h | 15 +------- .../fluid/operators/class_center_sample_op.cu | 7 ++-- paddle/fluid/operators/cudnn_lstm_op.cu.cc | 16 ++++----- paddle/fluid/operators/dirichlet_op.cu | 2 +- paddle/fluid/operators/dropout_impl_util.h | 7 ++-- paddle/fluid/operators/gaussian_random_op.cu | 17 ++++----- paddle/fluid/operators/uniform_random_op.h | 13 +++---- paddle/fluid/platform/device_context.cc | 2 +- paddle/fluid/pybind/generator_py.cc | 8 ++--- paddle/phi/core/generator.h | 6 ---- .../phi/kernels/gpu/gaussian_random_kernel.cu | 26 ++++---------- .../phi/kernels/gpu/gumbel_softmax_kernel.cu | 35 +++++++------------ paddle/phi/kernels/gpu/rnn_kernel.cu.cc | 16 ++++----- .../gpu/truncated_gaussian_random_kernel.cu | 35 +++++++------------ .../phi/kernels/gpu/uniform_random_kernel.cu | 13 ++----- .../tests/unittests/test_cuda_random_seed.py | 20 +++++------ .../tests/unittests/test_rnn_decode_api.py | 1 - .../unittests/test_uniform_random_bf16_op.py | 1 - .../tests/unittests/test_uniform_random_op.py | 1 - python/paddle/framework/random.py | 2 -- 21 files changed, 85 insertions(+), 192 deletions(-) diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc index 2bd8ed900f102..b621eca35b893 100644 --- a/paddle/fluid/framework/generator.cc +++ 
b/paddle/fluid/framework/generator.cc @@ -24,7 +24,7 @@ limitations under the License. */ namespace paddle { namespace framework { -const std::shared_ptr& GetDefaultCUDAGenerator(int64_t device_id) { +const std::shared_ptr& DefaultCUDAGenerator(int64_t device_id) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) static int64_t num_cuda_devices = -1; @@ -58,8 +58,6 @@ const std::shared_ptr& GetDefaultCUDAGenerator(int64_t device_id) { const std::shared_ptr& DefaultCPUGenerator() { static auto default_cpu_generator = std::make_shared(GetRandomSeed()); - VLOG(4) << "initial seed: " << default_cpu_generator->GetCurrentSeed() - << ", cpu engine: " << default_cpu_generator->GetCPUEngine().get(); return default_cpu_generator; } @@ -100,19 +98,13 @@ const std::shared_ptr& GetRandomSeedGenerator( return iter->second; } -std::shared_ptr OpDefaultCPUEngine() { - static auto op_default_cpu_engine = std::make_shared(); - return op_default_cpu_engine; -} - -// NOTE(zhiqiu): there are 3 conditions: -// (1) op seed is not set and DefaultCPUGenerator is inited, use -// DefaultCPUGenerator -// (2) op seed is not set and DefaultCPUGenerator is not inited, use se -// OpDefaultCPUEngine() and set a radnom seed -// (3) op seed is set, use OpDefaultCPUEngine() and set the seed +// There are 3 conditions: +// (1) op seed is set, use op seed. +// (2) op seed is not set, global seed is set, use global seed. +// (3) op seed is not set, global seed is not set too, use random seed from +// RandomGenerator. std::shared_ptr GetCPURandomEngine(uint64_t seed) { - if (DefaultCPUGenerator()->GetIsInitPy() && seed == 0) { + if (seed == 0) { VLOG(4) << "Use random engine from generator"; return DefaultCPUGenerator()->GetCPUEngine(); } else { @@ -123,12 +115,6 @@ std::shared_ptr GetCPURandomEngine(uint64_t seed) { // // And we need to measure the determinacy of Generator in PE. 
auto engine = std::make_shared(); - if (seed == 0) { - seed = GetRandomSeed(); - VLOG(4) << "Use default random engine with random seed = " << seed; - } else { - VLOG(4) << "Use default random engine with fixed random seed = " << seed; - } static std::mutex mu_; { std::lock_guard lock(mu_); @@ -204,11 +190,5 @@ std::pair Generator::IncrementOffset( #endif } -void Generator::SetIsInitPy(bool is_init_py) { - this->is_init_py_ = is_init_py; - VLOG(4) << "SetIsInitPy:" << this->is_init_py_; -} -bool Generator::GetIsInitPy() const { return this->is_init_py_; } - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/generator.h b/paddle/fluid/framework/generator.h index 1c19234bf7d80..35efc1bee33d5 100644 --- a/paddle/fluid/framework/generator.h +++ b/paddle/fluid/framework/generator.h @@ -59,7 +59,6 @@ struct Generator : public phi::Generator { this->engine_ = engine; VLOG(4) << "initial seed: " << this->state_.current_seed << ", cpu engine: " << &this->state_.cpu_engine; - this->is_init_py_ = true; // TODO(zhiqiu): remove it in future } Generator(uint64_t seed, uint64_t device_id) { std::seed_seq seq({seed}); @@ -71,7 +70,6 @@ struct Generator : public phi::Generator { this->engine_ = engine; VLOG(4) << "initial seed: " << this->state_.current_seed << ", cpu engine: " << &this->state_.cpu_engine; - this->is_init_py_ = false; // TODO(zhiqiu): remove it in future } Generator(const Generator& other) = delete; @@ -95,32 +93,21 @@ struct Generator : public phi::Generator { std::pair IncrementOffset(uint64_t increament_offset); - void SetIsInitPy(bool); - bool GetIsInitPy() const; uint64_t get_device_id() { return this->state_.device; } private: phi::Generator::GeneratorState state_; std::shared_ptr engine_; mutable std::mutex mu_; - - // NOTE(zhiqiu): is_init_py_ is used to make generator be compatible with - // old seed, and it should be removed after all random-related operators - // and unittests upgrades to use generator. - bool is_init_py_ = false; }; // The DefaultCPUGenerator is used in manual_seed() const std::shared_ptr& DefaultCPUGenerator(); -// If op seed is set or global is not set, the OpDefaultCPUEngine is used. -std::shared_ptr OpDefaultCPUEngine(); +const std::shared_ptr& DefaultCUDAGenerator(int64_t device_id = -1); std::shared_ptr GetCPURandomEngine(uint64_t); -const std::shared_ptr& GetDefaultCUDAGenerator( - int64_t device_id = -1); - const std::shared_ptr& SetRandomSeedGenerator( const std::string& name, uint64_t seed); diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index 1064c77cc0041..a23cf2815d8fe 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -416,14 +416,13 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { 1) * vec_size; int device_id = ctx.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy() && (!fix_seed)) { + auto gen_cuda = framework::DefaultCUDAGenerator(device_id); + if (!fix_seed) { auto seed_offset = gen_cuda->IncrementOffset(offset); seed_data = seed_offset.first; increment = seed_offset.second; } else { - std::random_device rnd; - seed_data = fix_seed ? 
seed + rank : rnd(); + seed_data = seed + rank; increment = offset; } RandomSampleClassCenter<< { int seed = ctx.Attr("seed"); if (!is_test) { - int device_id = ctx.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy() && seed == 0) { - // If perform `manual_seed` in python and inner seed is not specified - // (equals 0), use global generator generated seed. + if (seed == 0) { + // If not specify seed, use global Generator to generate seed. + int device_id = ctx.GetPlace().GetDeviceId(); + auto gen_cuda = paddle::framework::DefaultCUDAGenerator(device_id); seed = static_cast(gen_cuda->Random64()); - } else if (seed == 0) { - // use random generated seed - std::random_device rd; - seed = rd(); - } // else use `ctx.Attr("seed")` specified seed + } + // else use `ctx.Attr("seed")` specified seed } bool has_seq_length = ctx.HasInput("SequenceLength"); diff --git a/paddle/fluid/operators/dirichlet_op.cu b/paddle/fluid/operators/dirichlet_op.cu index 63f9c7339bfc5..ac6480a8fa1c6 100644 --- a/paddle/fluid/operators/dirichlet_op.cu +++ b/paddle/fluid/operators/dirichlet_op.cu @@ -77,7 +77,7 @@ struct DirichletSampler { // init state, seed & offset for all threads int device_id = ctx.GetPlace().GetDeviceId(); - auto p_gen = framework::GetDefaultCUDAGenerator(device_id); + auto p_gen = framework::DefaultCUDAGenerator(device_id); auto seed_and_offset = p_gen->IncrementOffset(10); // hard-coded offset auto seed = seed_and_offset.first; auto offset = seed_and_offset.second; diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h index c62d45570ba29..571a1c97c52e8 100644 --- a/paddle/fluid/operators/dropout_impl_util.h +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -26,7 +26,7 @@ inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx, const int offset, uint64_t* seed_data, uint64_t* increment) { int device_id = dev_ctx.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + auto gen_cuda = framework::DefaultCUDAGenerator(device_id); if (seed) { framework::Tensor seed_cpu_tensor; @@ -34,13 +34,12 @@ inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx, &seed_cpu_tensor); *seed_data = static_cast(seed_cpu_tensor.data()[0]); *increment = offset; - } else if (gen_cuda->GetIsInitPy() && (!is_fix_seed)) { + } else if (!is_fix_seed) { auto seed_offset = gen_cuda->IncrementOffset(offset); *seed_data = seed_offset.first; *increment = seed_offset.second; } else { - std::random_device rnd; - *seed_data = is_fix_seed ? 
seed_val : rnd(); + *seed_data = seed_val; *increment = offset; } } diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 552649279e911..deac932d59b80 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -54,26 +54,21 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } T mean = static_cast(context.Attr("mean")); T std = static_cast(context.Attr("std")); int64_t size = tensor->numel(); int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + auto gen_cuda = framework::DefaultCUDAGenerator(device_id); auto& dev_cxt = context.template device_context(); - if (gen_cuda->GetIsInitPy() && seed_flag) { + if (seed == 0) { + // use global Generator seed auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - auto func = GaussianGenerator(mean, std, seed_offset.first, - seed_offset.second); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; + auto func = GaussianGenerator(mean, std, seed, size * offset); phi::IndexKernel>(dev_cxt, tensor, func); } else { auto func = GaussianGenerator(mean, std, seed); diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index ae846f4cae6fb..3e27402c86947 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -151,12 +151,6 @@ void UniformRandom(const framework::ExecutionContext& context, T* data = tensor->mutable_data(dev_cxt.GetPlace()); if (size <= 0) return; unsigned int seed = static_cast(context.Attr("seed")); - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } T min = static_cast(context.Attr("min")); T max = static_cast(context.Attr("max")); @@ -165,14 +159,15 @@ void UniformRandom(const framework::ExecutionContext& context, unsigned int diag_step = static_cast(context.Attr("diag_step")); T diag_val = static_cast(context.Attr("diag_val")); - int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy() && seed_flag) { + + if (seed == 0) { + // Use global Generator seed using MT = typename details::MPTypeTrait::Type; phi::funcs::uniform_distribution dist; phi::funcs::uniform_real_transform trans(min, max); phi::funcs::distribution_and_transform(dev_cxt, tensor, dist, trans); } else { + // Use OP seed auto func = UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); phi::IndexKernel>(dev_cxt, tensor, func); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 904e4854ba6b4..0bf5ca7f8f525 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -169,7 +169,7 @@ inline void EmplaceDeviceContext( cuda_ctx->PartialInitWithAllocator(); dev_ctx->SetGenerator( - framework::GetDefaultCUDAGenerator(p.GetDeviceId()).get()); + framework::DefaultCUDAGenerator(p.GetDeviceId()).get()); #endif } else { dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance() diff --git a/paddle/fluid/pybind/generator_py.cc 
b/paddle/fluid/pybind/generator_py.cc index 53379373d2518..6bb85da8c466f 100644 --- a/paddle/fluid/pybind/generator_py.cc +++ b/paddle/fluid/pybind/generator_py.cc @@ -55,13 +55,9 @@ void BindGenerator(py::module* m_ptr) { }) .def("seed", &framework::Generator::Seed) .def("initial_seed", &framework::Generator::GetCurrentSeed) - .def("random", &framework::Generator::Random64) - // .def("get_cpu_engine", &framework::Generator::GetCPUEngine) - // .def("set_cpu_engine", &framework::Generator::SetCPUEngine) - .def_property("_is_init_py", &framework::Generator::GetIsInitPy, - &framework::Generator::SetIsInitPy); + .def("random", &framework::Generator::Random64); m.def("default_cpu_generator", &framework::DefaultCPUGenerator); - m.def("default_cuda_generator", &framework::GetDefaultCUDAGenerator); + m.def("default_cuda_generator", &framework::DefaultCUDAGenerator); m.def("set_random_seed_generator", &framework::SetRandomSeedGenerator); m.def("get_random_seed_generator", &framework::GetRandomSeedGenerator); } diff --git a/paddle/phi/core/generator.h b/paddle/phi/core/generator.h index 29ea92cbe6d94..3263b2a525732 100644 --- a/paddle/phi/core/generator.h +++ b/paddle/phi/core/generator.h @@ -49,12 +49,6 @@ class Generator { virtual std::pair IncrementOffset( uint64_t increament_offset) = 0; - // NOTE(zhiqiu): is_init_py_ is used to make generator be compatible with - // old seed, and it should be removed after all random-related operators - // and unittests upgrades to use generator. - virtual void SetIsInitPy(bool) = 0; - virtual bool GetIsInitPy() const = 0; - virtual uint64_t get_device_id() = 0; }; diff --git a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu index 96ebc0353ef24..b80634357d62f 100644 --- a/paddle/phi/kernels/gpu/gaussian_random_kernel.cu +++ b/paddle/phi/kernels/gpu/gaussian_random_kernel.cu @@ -59,34 +59,20 @@ void GaussianRandomKernel(const Context& dev_ctx, int seed, DataType dtype, DenseTensor* out) { - auto tensor = out; - - bool seed_flag = false; + out->Resize(phi::make_ddim(shape.GetData())); + dev_ctx.template Alloc(out); if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - - tensor->Resize(phi::make_ddim(shape.GetData())); - - T* data = dev_ctx.template Alloc(tensor); - - int64_t size = tensor->numel(); - - int device_id = dev_ctx.GetPlace().GetDeviceId(); - auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id); - - if (gen_cuda->GetIsInitPy() && seed_flag) { + // use global Generator seed using MT = typename phi::dtype::MPTypeTrait::Type; funcs::normal_distribution dist; funcs::normal_transform trans(static_cast(mean), static_cast(std)); - funcs::distribution_and_transform(dev_ctx, tensor, dist, trans); + funcs::distribution_and_transform(dev_ctx, out, dist, trans); } else { + // use OP seed auto func = GaussianGenerator(static_cast(mean), static_cast(std), seed); - IndexKernel>(dev_ctx, tensor, func); + IndexKernel>(dev_ctx, out, func); } } diff --git a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu index 6b1e58981baa0..c0e557f09bcc9 100644 --- a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu @@ -27,12 +27,9 @@ namespace cub = hipcub; #endif -#include -#include -#include -#include #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" #include 
"paddle/phi/kernels/funcs/math_function.h" namespace phi { @@ -144,27 +141,21 @@ struct GumbleNoiseGenerator { DenseTensor random_tensor; int64_t size = size_to_axis * size_from_axis; random_tensor.Resize(make_ddim({size})); - auto* random_data = ctx.template Alloc(&random_tensor); - thrust::counting_iterator index_sequence_begin(0); + T* random_data = ctx.template Alloc(&random_tensor); // generate gumbel noise int device_id = ctx.GetPlace().GetDeviceId(); - auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy()) { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - thrust::transform( - index_sequence_begin, - index_sequence_begin + size, - thrust::device_ptr(random_data), - UniformCUDAGenerator(0.00001, 1, seed_offset.first, gen_offset)); - } else { - const unsigned int seed = std::random_device()(); - thrust::transform(index_sequence_begin, - index_sequence_begin + size, - thrust::device_ptr(random_data), - UniformCUDAGenerator(0.00001, 1, seed)); - } + auto gen_cuda = ctx.GetGenerator(); + + auto seed_offset = gen_cuda->IncrementOffset(1); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; + + thrust::counting_iterator index_sequence_begin(0); + thrust::transform(index_sequence_begin, + index_sequence_begin + size, + thrust::device_ptr(random_data), + UniformCUDAGenerator(0.00001, 1, seed, size * offset)); // add gumbel noise to X const int thread_size = 512; diff --git a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc index d30b7ec34d43c..f2ffe3c9d4fba 100644 --- a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc @@ -175,17 +175,13 @@ void RnnKernel(const Context &dev_ctx, mode)); if (!is_test) { - int device_id = dev_ctx.GetPlace().GetDeviceId(); - auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy() && seed == 0) { - // If perform `manual_seed` in python and inner seed is not specified - // (equals 0), use global generator generated seed. + if (seed == 0) { + // If not specify seed, use global Generator to generate seed. 
+ int device_id = dev_ctx.GetPlace().GetDeviceId(); + auto gen_cuda = paddle::framework::DefaultCUDAGenerator(device_id); seed = static_cast(gen_cuda->Random64()); - } else if (seed == 0) { - // use random generated seed - std::random_device rd; - seed = rd(); - } // else use `ctx.Attr("seed")` specified seed + } + // else use `ctx.Attr("seed")` specified seed } const T *x_data = x.data(); diff --git a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu index 5b6ae9d09bff2..33ecb4d6eb544 100644 --- a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu +++ b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu @@ -90,34 +90,25 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx, int seed, DataType dtype, DenseTensor* out) { - auto tensor = out; - - T* data = dev_ctx.template Alloc(tensor); - - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } + T* data = dev_ctx.template Alloc(out); thrust::counting_iterator index_sequence_begin(0); - int64_t size = tensor->numel(); + int64_t size = out->numel(); auto gen_cuda = dev_ctx.GetGenerator(); - - if (gen_cuda->GetIsInitPy() && seed_flag) { + if (seed == 0) { + // use global Generator seed auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - thrust::transform(index_sequence_begin, - index_sequence_begin + size, - thrust::device_ptr(data), - TruncatedNormalOffset(mean, - std, - std::numeric_limits::min(), - seed_offset.first, - gen_offset)); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; + thrust::transform( + index_sequence_begin, + index_sequence_begin + size, + thrust::device_ptr(data), + TruncatedNormalOffset( + mean, std, std::numeric_limits::min(), seed, size * offset)); } else { + // use OP seed thrust::transform( index_sequence_begin, index_sequence_begin + size, diff --git a/paddle/phi/kernels/gpu/uniform_random_kernel.cu b/paddle/phi/kernels/gpu/uniform_random_kernel.cu index a4aea10cfe762..68e61b7328971 100644 --- a/paddle/phi/kernels/gpu/uniform_random_kernel.cu +++ b/paddle/phi/kernels/gpu/uniform_random_kernel.cu @@ -65,22 +65,15 @@ void UniformRandomRawKernel(const Context& dev_ctx, float diag_val, DenseTensor* out) { out->Resize(phi::make_ddim(shape.GetData())); - T* data = dev_ctx.template Alloc(out); - auto size = out->numel(); - bool seed_flag = false; + dev_ctx.template Alloc(out); if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - - auto generator = dev_ctx.GetGenerator(); - if (generator->GetIsInitPy() && seed_flag) { + // Use global Generator seed using MT = typename kps::details::MPTypeTrait::Type; funcs::uniform_distribution dist; funcs::uniform_real_transform trans(min, max); funcs::distribution_and_transform(dev_ctx, out, dist, trans); } else { + // Use OP seed auto func = UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); IndexKernel>(dev_ctx, out, func); diff --git a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py index 6033b809f218d..14a91b0c2c5fe 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py @@ -25,6 +25,8 @@ import paddle.fluid.core as core +@unittest.skipIf(not core.is_compiled_with_cuda(), + "Only test cuda Random Generator") class TestGeneratorSeed(unittest.TestCase): """ Test 
cases for cpu generator seed. @@ -70,15 +72,13 @@ def test_generator_gaussian_random_dygraph(self): """Test Generator seed.""" fluid.enable_dygraph() - paddle.seed(12312321111) - x = fluid.layers.gaussian_random([120], dtype="float32") - st1 = paddle.get_cuda_rng_state() - x1 = fluid.layers.gaussian_random([120], dtype="float32") - paddle.set_cuda_rng_state(st1) - x2 = fluid.layers.gaussian_random([120], dtype="float32") - paddle.seed(12312321111) - x3 = fluid.layers.gaussian_random([120], dtype="float32") - x_np = x.numpy() + st = paddle.get_cuda_rng_state() + x1 = paddle.randn([120], dtype="float32") + paddle.set_cuda_rng_state(st) + x2 = paddle.randn([120], dtype="float32") + paddle.set_cuda_rng_state(st) + x3 = paddle.randn([120], dtype="float32") + x1_np = x1.numpy() x2_np = x2.numpy() x3_np = x3.numpy() @@ -86,7 +86,7 @@ def test_generator_gaussian_random_dygraph(self): if core.is_compiled_with_cuda(): print(">>>>>>> gaussian random dygraph >>>>>>>") self.assertTrue(np.allclose(x1_np, x2_np)) - self.assertTrue(np.allclose(x_np, x3_np)) + self.assertTrue(np.allclose(x2_np, x3_np)) def test_generator_randint_dygraph(self): """Test Generator seed.""" diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py index dacb7a5b59957..3621fd1b9d445 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py @@ -629,7 +629,6 @@ def _calc_output(self, place, mode="test", dygraph=True): else: fluid.disable_dygraph() gen = paddle.seed(self._random_seed) - gen._is_init_py = False paddle.framework.random._manual_program_seed(self._random_seed) scope = fluid.core.Scope() with fluid.scope_guard(scope): diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py index 2ba808a341e5e..5f4989f6c5dbd 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_bf16_op.py @@ -178,7 +178,6 @@ class TestUniformRandomOpAPISeed(unittest.TestCase): def test_attr_tensor_API(self): _seed = 10 gen = paddle.seed(_seed) - gen._is_init_py = False startup_program = fluid.Program() train_program = fluid.Program() with fluid.program_guard(train_program, startup_program): diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 0b27c61623089..0bca3c08f3d78 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -370,7 +370,6 @@ class TestUniformRandomOp_API_seed(unittest.TestCase): def test_attr_tensor_API(self): _seed = 10 gen = paddle.seed(_seed) - gen._is_init_py = False startup_program = fluid.Program() train_program = fluid.Program() with fluid.program_guard(train_program, startup_program): diff --git a/python/paddle/framework/random.py b/python/paddle/framework/random.py index 147f6be39c5e0..b58d36b8e7d50 100644 --- a/python/paddle/framework/random.py +++ b/python/paddle/framework/random.py @@ -44,10 +44,8 @@ def seed(seed): if core.is_compiled_with_cuda(): for i in range(core.get_cuda_device_count()): - core.default_cuda_generator(i)._is_init_py = True core.default_cuda_generator(i).manual_seed(seed) - core.default_cpu_generator()._is_init_py = True return core.default_cpu_generator().manual_seed(seed) From 
c0001a2433c1058ebfd21df22fe0f86146f16610 Mon Sep 17 00:00:00 2001 From: yaoxuefeng Date: Mon, 23 May 2022 11:49:19 +0800 Subject: [PATCH 006/109] Acc name (#42906) add dymf support of gpups --- paddle/fluid/framework/fleet/heter_context.h | 18 ---- .../framework/fleet/heter_ps/feature_value.h | 14 +++ .../fleet/heter_ps/hashtable_kernel.cu | 15 ++- .../framework/fleet/heter_ps/heter_comm_inl.h | 43 ++++++++- .../fleet/heter_ps/heter_comm_kernel.cu | 1 + .../fluid/framework/fleet/ps_gpu_wrapper.cc | 94 +++++++------------ paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 11 +++ 7 files changed, 114 insertions(+), 82 deletions(-) diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index 11217b6c485fc..823b60c5ef1f2 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -95,24 +95,6 @@ class HeterContext { } void SetShardNum(uint32_t shard_num) { shard_num_ = shard_num; } uint32_t ShardNum() { return shard_num_; } - void init(int shard_num, int device_num) { - shard_num_ = shard_num; - feature_keys_.resize(shard_num_); - value_ptr_.resize(shard_num_); - device_task_ptr_.resize(shard_num_); - device_task_keys_.resize(shard_num_); - for (size_t i = 0; i < device_task_ptr_.size(); i++) { - device_task_ptr_[i].resize(device_num); - device_task_keys_[i].resize(device_num); - } - - device_values_.resize(device_num); - device_keys_.resize(device_num); - mutex_.resize(device_num); - for (size_t i = 0; i < mutex_.size(); ++i) { - mutex_[i] = new std::mutex(); - } - } void init(int shard_num, int device_num, int dim_num) { shard_num_ = shard_num; diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index 682c4568cb7e1..cb7f3a40d6720 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -69,6 +69,20 @@ struct FeaturePushValue { int mf_dim; float mf_g[0]; + __device__ __forceinline__ FeaturePushValue + operator+(const FeaturePushValue& a) const { + FeaturePushValue out; + out.slot = a.slot; + out.mf_dim = a.mf_dim; + out.show = a.show + show; + out.clk = a.clk + clk; + out.lr_g = a.lr_g + lr_g; + // out.mf_g = a.mf_g; + for (int i = 0; i < out.mf_dim; ++i) { + out.mf_g[i] = a.mf_g[i] + mf_g[i]; + } + return out; + } __device__ __forceinline__ void operator=(const FeaturePushValue& in) { show = in.show; clk = in.clk; diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 32dbd98992b5d..f5807d2fd7eb7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -86,13 +86,26 @@ __global__ void dy_mf_search_kernel(Table* table, char* vals, size_t len, size_t pull_feature_value_size) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + // return; if (i < len) { auto it = table->find(keys[i]); if (it != table->end()) { uint64_t offset = i * pull_feature_value_size; - FeatureValue& cur = *(FeatureValue*)(vals + offset); + FeatureValue* cur = (FeatureValue*)(vals + offset); FeatureValue& input = *(FeatureValue*)(it->second); + cur->slot = input.slot; + cur->show = input.show; + cur->clk = input.clk; + cur->mf_dim = input.mf_dim; + cur->lr = input.lr; + cur->mf_size = input.mf_size; + cur->cpu_ptr = input.cpu_ptr; + cur->delta_score = input.delta_score; + cur->lr_g2sum = input.lr_g2sum; + for (int j = 
0; j < cur->mf_dim + 1; ++j) { + cur->mf[j] = input.mf[j]; + } } } } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 506a0c0b1863f..64b177abb8638 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -26,6 +26,7 @@ namespace framework { template HeterComm::HeterComm( size_t capacity, std::shared_ptr resource) { + VLOG(1) << "Construct new HeterComm"; resource_ = resource; storage_.resize(resource_->total_device()); multi_mf_dim_ = resource->multi_mf(); @@ -364,6 +365,10 @@ HeterComm::~HeterComm() { delete table; table = nullptr; } + for (auto& table : tables_) { + delete table; + table = nullptr; + } } } @@ -473,17 +478,23 @@ void HeterComm::build_ps(int num, KeyType* h_keys, return; } int dev_id = resource_->dev_id(num); + DevPlace place = DevPlace(dev_id); AnyDeviceGuard guard(dev_id); + + // use hbm pool std::vector d_key_bufs; + ppStream streams[stream_num]; // NOLINT for (int i = 0; i < stream_num; ++i) { create_stream(&(streams[i])); auto d_k_buf = memory::Alloc(place, chunk_size * sizeof(KeyType)); d_key_bufs.push_back(std::move(d_k_buf)); } + int cur_len = 0; int cur_stream = 0; + while (cur_len < len) { cur_stream = cur_stream % stream_num; auto cur_use_stream = streams[cur_stream]; @@ -491,8 +502,10 @@ void HeterComm::build_ps(int num, KeyType* h_keys, cur_use_stream = 0; #endif int tmp_len = cur_len + chunk_size > len ? len - cur_len : chunk_size; + auto dst_place = place; auto src_place = platform::CPUPlace(); + memory_copy( dst_place, reinterpret_cast(d_key_bufs[cur_stream]->ptr()), src_place, h_keys + cur_len, sizeof(KeyType) * tmp_len, cur_use_stream); @@ -557,14 +570,20 @@ void HeterComm::dynamic_merge_grad( platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDADeviceGuard guard(dev_id); auto stream = resource_->local_stream(gpu_num, 0); + size_t temp_storage_bytes; + + // VLOG(1) << "hetercomm merge_grad: max_mf_dim: " << max_mf_dim_; size_t grad_value_size = TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); + auto d_merge_grads = memory::Alloc(place, len * grad_value_size); GradType* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); + auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 3 + 1)); uint32_t* d_fea_num_info_ptr = reinterpret_cast(d_fea_num_info->ptr()); @@ -836,9 +855,16 @@ void HeterComm::push_sparse(int dev_num, auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); + GradType* d_shard_grads_ptr; - auto d_shard_grads = memory::Alloc(place, len * grad_value_size); - d_shard_grads_ptr = reinterpret_cast(d_shard_grads->ptr()); + if (!multi_mf_dim_) { + auto d_shard_grads = memory::Alloc(place, len * sizeof(GradType)); + d_shard_grads_ptr = reinterpret_cast(d_shard_grads->ptr()); + } else { + auto d_shard_grads = memory::Alloc(place, len * grad_value_size); + d_shard_grads_ptr = reinterpret_cast(d_shard_grads->ptr()); + } + int uniq_len = len; dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len); @@ -846,9 +872,16 @@ void HeterComm::push_sparse(int dev_num, split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num); - heter_comm_kernel_->dy_mf_fill_shard_grads( - d_shard_keys_ptr, d_keys, 
d_shard_grads_ptr, d_grads, d_idx_ptr, uniq_len, - grad_value_size, stream); + + if (!multi_mf_dim_) { + heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, d_keys, + d_shard_grads_ptr, d_grads, d_idx_ptr, + uniq_len, stream); + } else { + heter_comm_kernel_->dy_mf_fill_shard_grads( + d_shard_keys_ptr, d_keys, d_shard_grads_ptr, d_grads, d_idx_ptr, + uniq_len, grad_value_size, stream); + } sync_stream(stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index f44803982a55a..94d7929b2947d 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -136,6 +136,7 @@ __global__ void merge_gradients_kernel(const uint32_t* offset, size_t grad_value_size, DynamicGradMerger& merger_) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { uint32_t start = offset[i]; uint32_t num = fea_num[i]; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index a22704bd1ed03..18eec174fe9ce 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -106,25 +106,17 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { platform::Timer timeline; timeline.Start(); int device_num = heter_devices_.size(); - if (!multi_mf_dim_) { - gpu_task->init(thread_keys_shard_num_, device_num); - } else { - gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); - } + gpu_task->init(thread_keys_shard_num_, device_num, multi_mf_dim_); std::vector threads; - if (!multi_mf_dim_) { - thread_keys_.resize(thread_keys_thread_num_); - for (int i = 0; i < thread_keys_thread_num_; i++) { - thread_keys_[i].resize(thread_keys_shard_num_); - } - } else { - thread_dim_keys_.resize(thread_keys_thread_num_); - for (int i = 0; i < thread_keys_thread_num_; i++) { - thread_dim_keys_[i].resize(thread_keys_shard_num_); - for (int j = 0; j < thread_keys_shard_num_; j++) { - thread_dim_keys_[i][j].resize(multi_mf_dim_); - } + + // data should be in input channel + + thread_dim_keys_.resize(thread_keys_thread_num_); + for (int i = 0; i < thread_keys_thread_num_; i++) { + thread_dim_keys_[i].resize(thread_keys_shard_num_); + for (int j = 0; j < thread_keys_shard_num_; j++) { + thread_dim_keys_[i][j].resize(multi_mf_dim_); } } @@ -144,18 +136,6 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { len_per_thread = total_len / thread_keys_thread_num_; remain = total_len % thread_keys_thread_num_; VLOG(0) << "total len: " << total_len; - auto gen_func = [this](const std::deque& total_data, - int begin_index, int end_index, int i) { - for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { - const auto& ins = *iter; - const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; - for (const auto feasign : feasign_v) { - int shard_id = feasign % thread_keys_shard_num_; - this->thread_keys_[i][shard_id].insert(feasign); - } - } - }; auto gen_dynamic_mf_func = [this](const std::deque& total_data, int begin_index, int end_index, int i) { for (auto iter = total_data.begin() + begin_index; @@ -177,17 +157,10 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { } }; for (int i = 0; i < thread_keys_thread_num_; i++) { - if (!multi_mf_dim_) { - VLOG(0) << "yxf::psgpu wrapper genfunc"; - threads.push_back( - std::thread(gen_func, std::ref(vec_data), begin, - begin + len_per_thread + 
(i < remain ? 1 : 0), i)); - } else { - VLOG(0) << "yxf::psgpu wrapper genfunc with dynamic mf"; - threads.push_back( - std::thread(gen_dynamic_mf_func, std::ref(vec_data), begin, - begin + len_per_thread + (i < remain ? 1 : 0), i)); - } + threads.push_back( + std::thread(gen_dynamic_mf_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + begin += len_per_thread + (i < remain ? 1 : 0); } for (std::thread& t : threads) { @@ -235,12 +208,6 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { threads.clear(); // merge thread_keys to shard_keys - auto merge_ins_func = [this, gpu_task](int shard_num) { - for (int i = 0; i < thread_keys_thread_num_; ++i) { - gpu_task->batch_add_keys(shard_num, thread_keys_[i][shard_num]); - thread_keys_[i][shard_num].clear(); - } - }; auto merge_ins_dynamic_mf_func = [this, gpu_task](int shard_num, int dim_id) { for (int i = 0; i < thread_keys_thread_num_; ++i) { gpu_task->batch_add_keys(shard_num, dim_id, @@ -249,12 +216,8 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { } }; for (int i = 0; i < thread_keys_shard_num_; ++i) { - if (!multi_mf_dim_) { - threads.push_back(std::thread(merge_ins_func, i)); - } else { - for (int j = 0; j < multi_mf_dim_; j++) { - threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); - } + for (int j = 0; j < multi_mf_dim_; j++) { + threads.push_back(std::thread(merge_ins_dynamic_mf_func, i, j)); } } for (auto& t : threads) { @@ -297,12 +260,12 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { auto& device_dim_keys = gpu_task->device_dim_keys_; auto& device_dim_ptr = gpu_task->device_dim_ptr_; auto& device_dim_mutex = gpu_task->dim_mutex_; - if (multi_mf_dim_) { - for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { - device_dim_keys[dev].resize(multi_mf_dim_); - device_dim_ptr[dev].resize(multi_mf_dim_); - } + + for (size_t dev = 0; dev < device_dim_keys.size(); dev++) { + device_dim_keys[dev].resize(multi_mf_dim_); + device_dim_ptr[dev].resize(multi_mf_dim_); } + // auto& device_mutex = gpu_task->mutex_; std::vector threads(thread_keys_shard_num_); @@ -415,6 +378,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { task_keys[shard].push_back(local_dim_keys[i][j][k]); task_ptrs[shard].push_back(local_dim_ptr[i][j][k]); } + // allocate local keys to devices for (int dev = 0; dev < device_num; dev++) { device_dim_mutex[dev][j]->lock(); int len = task_keys[dev].size(); @@ -619,6 +583,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { << feature_keys_count[i]; size_max = std::max(size_max, feature_keys_count[i]); } + if (HeterPs_) { delete HeterPs_; HeterPs_ = nullptr; @@ -665,6 +630,8 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { ptr_val[paddle::ps::DownpourCtrDymfAccessor:: DownpourCtrDymfFeatureValue::embed_g2sum_index()]; val->cpu_ptr = (uint64_t)(device_dim_ptrs[k]); + + // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor ptr_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: mf_dim_index()] = float(mf_dim); val->mf_dim = mf_dim; @@ -681,11 +648,15 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { } } } + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + this->hbm_pools_[i * this->multi_mf_dim_ + j] = new HBMMemoryPool(mem_pool); auto& cur_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; + this->HeterPs_->build_ps(i, device_dim_keys.data(), cur_pool->mem(), len, feature_value_size, 500000, 2); + if (device_dim_keys.size() > 0) { VLOG(0) << "show ptr 
table: " << i << " table kv size: " << device_dim_keys.size() @@ -700,6 +671,7 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { threads[i + j * device_num] = std::thread(build_dynamic_mf_func, i, j); } } + for (std::thread& t : threads) { t.join(); } @@ -723,7 +695,9 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { InitSlotInfo(); std::shared_ptr gpu_task = gpu_task_pool_.Get(); gpu_task->Reset(); + data_ready_channel_->Put(gpu_task); + VLOG(3) << "End LoadIntoMemory(), dataset[" << dataset_ << "]"; } @@ -805,6 +779,7 @@ void PSGPUWrapper::EndPass() { timer.Start(); size_t keysize_max = 0; // in case of feasign_num = 0, skip dump_to_cpu + for (size_t i = 0; i < heter_devices_.size(); i++) { for (int j = 0; j < multi_mf_dim_; j++) { keysize_max = @@ -821,9 +796,11 @@ void PSGPUWrapper::EndPass() { VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim; size_t feature_value_size = TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + char* test_build_values = (char*)malloc(feature_value_size * len); cudaMemcpy(test_build_values, hbm_pool->mem(), feature_value_size * len, cudaMemcpyDeviceToHost); + CHECK(len == hbm_pool->capacity()); #ifdef PADDLE_WITH_PSLIB uint64_t unuse_key = std::numeric_limits::max(); @@ -972,7 +949,6 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, feature_value_size = TYPEALIGN( 8, sizeof(FeatureValue) + sizeof(float) * (index_dim_vec_.back() + 1)); - VLOG(0) << "yxf pull sparse feature_value_size: " << feature_value_size; #ifdef PADDLE_WITH_CUDA VLOG(3) << "Begine Gpu Ps PullSparse"; @@ -1159,6 +1135,8 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, "GPUPS: PushSparseGrad Only Support CUDAPlace Now.")); } all_timer.Pause(); + time_3 += all_timer.ElapsedSec(); + time_4 += push_gpups_timer.ElapsedSec(); VLOG(3) << "PushSparseGrad total cost: " << all_timer.ElapsedSec() << " s, of which GPUPS cost: " << push_gpups_timer.ElapsedSec() << " s"; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 9b55626645942..0efec57e59db6 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -333,6 +333,11 @@ class PSGPUWrapper { void SetSlotOffsetVector(const std::vector& slot_offset_vector) { slot_offset_vector_ = slot_offset_vector; + std::cout << "yxf set: "; + for (auto s : slot_offset_vector_) { + std::cout << s << " | "; + } + std::cout << " end " << std::endl; } #ifdef PADDLE_WITH_CUDA @@ -431,6 +436,12 @@ class PSGPUWrapper { int max_mf_dim_{0}; size_t val_type_size_{0}; size_t grad_type_size_{0}; + + double time_1 = 0.0; + double time_2 = 0.0; + double time_3 = 0.0; + double time_4 = 0.0; + int multi_node_{0}; int node_size_; uint64_t table_id_; From 65f705e1011f63c349813d7368d55b35df03ad82 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Mon, 23 May 2022 12:06:39 +0800 Subject: [PATCH 007/109] [Eager] Support sharding_parallel under eager (#42910) --- .../fleet/utils/hybrid_parallel_util.py | 53 +++++++++++-------- ...test_parallel_dygraph_sharding_parallel.py | 3 +- 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py index 1285e1f3323ff..d0b5c915e11cd 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py @@ -162,29 +162,36 @@ def 
sharding_reduce_gradients(parameter_list, hcg): sharding_nrank = hcg.get_sharding_parallel_group().nranks for param in parameter_list: if param.trainable and (param._grad_ivar() is not None): - - g_var = param._grad_ivar() - - # need use trace_op to allreduce - # paddle.distributed.all_reduce( - # g_var, group=hcg.get_sharding_parallel_group(), use_calc_stream=True) - paddle.fluid.framework._dygraph_tracer().trace_op( - type="c_allreduce_sum", - inputs={'X': g_var}, - outputs={'Out': g_var}, - attrs={ - 'ring_id': hcg.get_sharding_parallel_group().id, - 'use_calc_stream': True - }) - - # grad / sharding_rank - div_factor = paddle.to_tensor(sharding_nrank, dtype=g_var.dtype) - paddle.fluid.framework._dygraph_tracer().trace_op( - type="elementwise_div", - inputs={'X': g_var, - 'Y': div_factor}, - outputs={'Out': g_var}, - attrs={'axis': -1}) + if in_dygraph_mode(): + param.grad.scale_(1.0 / sharding_nrank) + paddle.distributed.all_reduce( + param.grad, + group=hcg.get_sharding_parallel_group(), + use_calc_stream=True) + + elif _in_legacy_dygraph(): + g_var = param._grad_ivar() + # need use trace_op to allreduce + # paddle.distributed.all_reduce( + # g_var, group=hcg.get_sharding_parallel_group(), use_calc_stream=True) + paddle.fluid.framework._dygraph_tracer().trace_op( + type="c_allreduce_sum", + inputs={'X': g_var}, + outputs={'Out': g_var}, + attrs={ + 'ring_id': hcg.get_sharding_parallel_group().id, + 'use_calc_stream': True + }) + + # grad / sharding_rank + div_factor = paddle.to_tensor( + sharding_nrank, dtype=g_var.dtype) + paddle.fluid.framework._dygraph_tracer().trace_op( + type="elementwise_div", + inputs={'X': g_var, + 'Y': div_factor}, + outputs={'Out': g_var}, + attrs={'axis': -1}) def broadcast_sharding_parameters(model, hcg): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py index e12d1826f286c..503bd9d0f9797 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py @@ -25,8 +25,7 @@ class TestHybridParallel(TestMultipleGpus): # check sharding logic as well as the accuracy with single mode def test_hybrid_parallel_sharding_logic(self): - # self.run_mnist_2gpu( - # 'hybrid_parallel_sharding_model.py') + self.run_mnist_2gpu('hybrid_parallel_sharding_model.py') self.run_mnist_2gpu( 'hybrid_parallel_sharding_model.py', eager_mode=False) From 9aed83272c369fb77a24606693cbb8a17d2baaeb Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Mon, 23 May 2022 12:56:25 +0800 Subject: [PATCH 008/109] Reduce test case for test_tensordot (#42885) * Reduce test case for test_tensordot * Fix CI errors --- .../fluid/tests/unittests/CMakeLists.txt | 4 +- .../fluid/tests/unittests/test_tensordot.py | 99 +++++++------------ 2 files changed, 38 insertions(+), 65 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6e80e142c4b85..d6d76e0437061 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -185,6 +185,8 @@ endif() # Temporally disable test_deprecated_decorator LIST(REMOVE_ITEM TEST_OPS test_deprecated_decorator) +LIST(REMOVE_ITEM TEST_OPS test_tensordot) + if(WIN32) LIST(REMOVE_ITEM TEST_OPS test_multiprocess_reader_exception) LIST(REMOVE_ITEM TEST_OPS test_trainer_desc) @@ -1036,7 +1038,7 @@ 
set_tests_properties(test_imperative_selected_rows_to_lod_tensor PROPERTIES TIME set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_ssa_graph_inference_feed_partial_data PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_crf PROPERTIES TIMEOUT 120) -set_tests_properties(test_tensordot PROPERTIES TIMEOUT 200) +#set_tests_properties(test_tensordot PROPERTIES TIMEOUT 200) set_tests_properties(test_imperative_save_load PROPERTIES TIMEOUT 120) set_tests_properties(test_partial_eager_deletion_transformer PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_seresnext_with_reduce_gpu PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/test_tensordot.py b/python/paddle/fluid/tests/unittests/test_tensordot.py index 9ac016511c20d..04b140cba4c0e 100644 --- a/python/paddle/fluid/tests/unittests/test_tensordot.py +++ b/python/paddle/fluid/tests/unittests/test_tensordot.py @@ -89,65 +89,6 @@ def set_input_data(self): self.x = np.random.random(self.x_shape).astype(self.dtype) self.y = np.random.random(self.y_shape).astype(self.dtype) - def set_test_axes(self): - self.all_axes = [] - axial_index = range(4) - all_permutations = list(it.permutations(axial_index, 0)) + list( - it.permutations(axial_index, 1)) + list( - it.permutations(axial_index, 2)) + list( - it.permutations(axial_index, 3)) + list( - it.permutations(axial_index, 4)) - self.all_axes.extend(list(i) for i in all_permutations) - - for axes_x in all_permutations: - for axes_y in all_permutations: - if len(axes_x) < len(axes_y): - supplementary_axes_x = axes_x + axes_y[len(axes_x):] - if any( - supplementary_axes_x.count(i) > 1 - for i in supplementary_axes_x): - continue - elif len(axes_y) < len(axes_x): - supplementary_axes_y = axes_y + axes_x[len(axes_y):] - if any( - supplementary_axes_y.count(i) > 1 - for i in supplementary_axes_y): - continue - self.all_axes.append([list(axes_x), list(axes_y)]) - - self.all_axes.extend(range(5)) - - def test_dygraph(self): - paddle.disable_static() - for axes in self.all_axes: - for place in self.places: - x = paddle.to_tensor(self.x, place=place) - y = paddle.to_tensor(self.y, place=place) - paddle_res = paddle.tensordot(x, y, axes) - np_res = tensordot_np(self.x, self.y, axes) - np.testing.assert_allclose(paddle_res, np_res, rtol=1e-6) - - def test_static(self): - paddle.enable_static() - for axes in self.all_axes: - for place in self.places: - with paddle.static.program_guard(paddle.static.Program(), - paddle.static.Program()): - x = paddle.static.data( - name='x', shape=self.x_shape, dtype=self.dtype) - y = paddle.static.data( - name='y', shape=self.y_shape, dtype=self.dtype) - z = paddle.tensordot(x, y, axes) - exe = paddle.static.Executor(place) - paddle_res = exe.run(feed={'x': self.x, - 'y': self.y}, - fetch_list=[z]) - np_res = tensordot_np(self.x, self.y, axes) - np.testing.assert_allclose(paddle_res[0], np_res, rtol=1e-6) - - -class TestTensordotAPIFloat64(TestTensordotAPI): - # Only test a small part of axes case for Float64 type def set_test_axes(self): self.all_axes = [ [[3, 2], [3]], [[2, 1, 0], [2, 1]], [[1, 2, 0], [1, 3, 2]], [3, 0], @@ -194,35 +135,65 @@ def set_test_axes(self): [[2, 0, 1], [0, 1, 3]], [[2, 1], [0, 1, 3]] ] + def test_dygraph(self): + paddle.disable_static() + for axes in self.all_axes: + for place in self.places: + x = paddle.to_tensor(self.x, place=place) + y = paddle.to_tensor(self.y, place=place) + paddle_res = paddle.tensordot(x, y, axes) + np_res = 
tensordot_np(self.x, self.y, axes) + np.testing.assert_allclose(paddle_res, np_res, rtol=1e-6) + + def test_static(self): + paddle.enable_static() + for axes in self.all_axes: + for place in self.places: + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + x = paddle.static.data( + name='x', shape=self.x_shape, dtype=self.dtype) + y = paddle.static.data( + name='y', shape=self.y_shape, dtype=self.dtype) + z = paddle.tensordot(x, y, axes) + exe = paddle.static.Executor(place) + paddle_res = exe.run(feed={'x': self.x, + 'y': self.y}, + fetch_list=[z]) + np_res = tensordot_np(self.x, self.y, axes) + np.testing.assert_allclose(paddle_res[0], np_res, rtol=1e-6) + + +class TestTensordotAPIFloat64(TestTensordotAPI): def set_dtype(self): self.dtype = np.float64 -class TestTensordotAPIBroadcastCase1(TestTensordotAPIFloat64): +class TestTensordotAPIBroadcastCase1(TestTensordotAPI): def set_input_shape(self): self.x_shape = [1, 1, 1, 5] self.y_shape = [1, 5, 1, 1] -class TestTensordotAPIBroadcastCase2(TestTensordotAPIFloat64): +class TestTensordotAPIBroadcastCase2(TestTensordotAPI): def set_input_shape(self): self.x_shape = [1, 5, 5, 5] self.y_shape = [1, 1, 1, 5] -class TestTensordotAPIBroadcastCase3(TestTensordotAPIFloat64): +class TestTensordotAPIBroadcastCase3(TestTensordotAPI): def set_input_shape(self): self.x_shape = [5, 5, 5, 1] self.y_shape = [5, 5, 1, 5] -class TestTensordotAPIBroadcastCase4(TestTensordotAPIFloat64): +class TestTensordotAPIBroadcastCase4(TestTensordotAPI): def set_input_shape(self): self.x_shape = [5, 5, 5, 1] self.y_shape = [1, 1, 1, 1] -class TestTensordotAPIBroadcastCase5(TestTensordotAPIFloat64): +class TestTensordotAPIBroadcastCase5(TestTensordotAPI): def set_input_shape(self): self.x_shape = [1, 1, 5, 5] self.y_shape = [5, 5, 1, 5] From fa6b3c9a47c55b6bff5923c3e956e0b1cf3ab732 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Mon, 23 May 2022 14:19:20 +0800 Subject: [PATCH 009/109] [Phi] Remove Storage (#42872) * remove storage * add glog include * add glog include * add glog include --- paddle/fluid/pybind/eager.cc | 1 - paddle/fluid/pybind/eager_functions.cc | 1 - paddle/infrt/tensor/phi/tensor_map.cc | 2 + paddle/phi/api/lib/api_custom_impl.cc | 1 - paddle/phi/api/lib/api_gen_utils.h | 1 - paddle/phi/api/lib/op_meta_info.cc | 1 + paddle/phi/api/lib/sparse_api_custom_impl.cc | 1 - paddle/phi/api/lib/tensor_copy.cc | 1 - paddle/phi/api/lib/utils/CMakeLists.txt | 2 +- paddle/phi/api/lib/utils/allocator.h | 1 - paddle/phi/api/lib/utils/storage.cc | 40 ------ paddle/phi/api/lib/utils/storage.h | 80 ----------- paddle/phi/api/lib/utils/tensor_utils.h | 1 - paddle/phi/core/dense_tensor.h | 1 - paddle/phi/core/dense_tensor.inl | 12 -- paddle/phi/core/dense_tensor_impl.cc | 10 +- paddle/phi/core/meta_tensor.h | 2 + paddle/phi/core/storage.cc | 25 ---- paddle/phi/core/storage.h | 132 ------------------ paddle/phi/core/string_tensor.cc | 2 +- paddle/phi/core/string_tensor.h | 1 - paddle/phi/kernels/cpu/allclose_kernel.cc | 2 +- paddle/phi/kernels/cpu/reduce.h | 1 - .../phi/kernels/cpu/uniform_random_kernel.cc | 1 - .../phi/kernels/sparse/convolution_kernel.h | 1 - paddle/phi/kernels/sparse/copy_kernel.h | 1 - .../kernels/sparse/gpu/sparse_utils_kernel.cu | 2 + .../phi/kernels/sparse/sparse_utils_kernel.h | 1 - .../strings/cpu/strings_copy_kernel.cc | 2 + .../kernels/strings/strings_empty_kernel.h | 1 - .../strings/strings_lower_upper_kernel.h | 1 - paddle/phi/tests/api/scale_api.h | 1 - paddle/phi/tests/core/test_custom_kernel.cc | 1 - 
paddle/phi/tests/core/test_dense_tensor.cc | 1 + .../phi/tests/core/test_sparse_coo_tensor.cc | 1 + python/paddle/utils/code_gen/api_gen.py | 1 - .../paddle/utils/code_gen/backward_api_gen.py | 1 - .../utils/code_gen/intermediate_api_gen.py | 1 - 38 files changed, 16 insertions(+), 322 deletions(-) delete mode 100644 paddle/phi/api/lib/utils/storage.cc delete mode 100644 paddle/phi/api/lib/utils/storage.h delete mode 100644 paddle/phi/core/storage.cc delete mode 100644 paddle/phi/core/storage.h diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 530cc6992d391..c1b26ee0b792d 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -34,7 +34,6 @@ limitations under the License. */ #include "paddle/fluid/framework/python_headers.h" #include "paddle/fluid/pybind/eager_op_function_impl.h" #include "paddle/fluid/pybind/tensor_py.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/core/string_tensor.h" namespace paddle { diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 5395b4f31c83b..628e808ef99ac 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -45,7 +45,6 @@ typedef SSIZE_T ssize_t; #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" diff --git a/paddle/infrt/tensor/phi/tensor_map.cc b/paddle/infrt/tensor/phi/tensor_map.cc index 7690322aed4a3..afac7175caf4f 100644 --- a/paddle/infrt/tensor/phi/tensor_map.cc +++ b/paddle/infrt/tensor/phi/tensor_map.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "paddle/infrt/tensor/phi/tensor_map.h" + +#include "glog/logging.h" #include "llvm/Support/ErrorHandling.h" namespace infrt { diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index d80444e7f710c..8a845c331cc60 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/tensor_copy.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/api/lib/api_gen_utils.h b/paddle/phi/api/lib/api_gen_utils.h index 7303e6b46114d..097178ae0d928 100644 --- a/paddle/phi/api/lib/api_gen_utils.h +++ b/paddle/phi/api/lib/api_gen_utils.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/meta_tensor.h" diff --git a/paddle/phi/api/lib/op_meta_info.cc b/paddle/phi/api/lib/op_meta_info.cc index 048e4f2b428f2..8d64246bdb69f 100644 --- a/paddle/phi/api/lib/op_meta_info.cc +++ b/paddle/phi/api/lib/op_meta_info.cc @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include #include +#include "glog/logging.h" #include "paddle/fluid/framework/custom_operator.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" diff --git a/paddle/phi/api/lib/sparse_api_custom_impl.cc b/paddle/phi/api/lib/sparse_api_custom_impl.cc index c88e2e367feed..71ba8eaae2d36 100644 --- a/paddle/phi/api/lib/sparse_api_custom_impl.cc +++ b/paddle/phi/api/lib/sparse_api_custom_impl.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include #include "glog/logging.h" #include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/kernel_registry.h" namespace paddle { diff --git a/paddle/phi/api/lib/tensor_copy.cc b/paddle/phi/api/lib/tensor_copy.cc index 57e3c28d8cb1f..85de3601fd96a 100644 --- a/paddle/phi/api/lib/tensor_copy.cc +++ b/paddle/phi/api/lib/tensor_copy.cc @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/phi/api/lib/tensor_copy.h" #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/meta_tensor.h" diff --git a/paddle/phi/api/lib/utils/CMakeLists.txt b/paddle/phi/api/lib/utils/CMakeLists.txt index 5689b2d43a4f2..0e1cd0cb83fd4 100644 --- a/paddle/phi/api/lib/utils/CMakeLists.txt +++ b/paddle/phi/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(phi_api_utils SRCS storage.cc tensor_utils.cc DEPS +cc_library(phi_api_utils SRCS tensor_utils.cc DEPS tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits string_tensor int_array scalar) diff --git a/paddle/phi/api/lib/utils/allocator.h b/paddle/phi/api/lib/utils/allocator.h index 84a089e5899ec..96f1294102ae1 100644 --- a/paddle/phi/api/lib/utils/allocator.h +++ b/paddle/phi/api/lib/utils/allocator.h @@ -17,7 +17,6 @@ limitations under the License. */ #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/phi/core/allocator.h" -#include "paddle/phi/core/storage.h" namespace paddle { namespace experimental { diff --git a/paddle/phi/api/lib/utils/storage.cc b/paddle/phi/api/lib/utils/storage.cc deleted file mode 100644 index 09ff18d10e312..0000000000000 --- a/paddle/phi/api/lib/utils/storage.cc +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/phi/api/lib/utils/storage.h" - -namespace paddle { -namespace experimental { - -ExternalStorage::ExternalStorage(void* ptr, - size_t size, - const phi::Place& place) - : phi::Storage(std::make_shared(ptr, size, place)), - size_(size) {} - -ExternalStorage::ExternalStorage(const phi::intrusive_ptr& root, - size_t delta, - size_t size) - : Storage(std::make_shared( - static_cast(root->data()) + delta, size, root->place())), - size_(size) { - PADDLE_ENFORCE_LE( - static_cast(delta + size), - root->size(), - phi::errors::InvalidArgument("The size of the external storage does " - "not meet the metadata requirements.")); -} - -} // namespace experimental -} // namespace paddle diff --git a/paddle/phi/api/lib/utils/storage.h b/paddle/phi/api/lib/utils/storage.h deleted file mode 100644 index 5fe17bc51b68a..0000000000000 --- a/paddle/phi/api/lib/utils/storage.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/memory/malloc.h" -#include "paddle/phi/core/storage.h" - -namespace paddle { -namespace experimental { - -class ExternalStorage : public phi::Storage { - public: - ExternalStorage(void* ptr, size_t size, const phi::Place& place); - ExternalStorage(const phi::intrusive_ptr& root, - size_t delta, - size_t size); - - static const char* name() { return "ExternalStorage"; } - - void Realloc(size_t n) override { - PADDLE_THROW(phi::errors::Unavailable( - "The external shared storage cannot be reallocated.")); - } - - void Clear() override { - data_ = nullptr; - size_ = 0; - } - - void set_data_shared( - const std::shared_ptr& holder) override { - CHECK(holder); - data_ = holder; - size_ = holder->size(); - } - - std::shared_ptr&& move_data_shared() override { - size_ = 0; - return std::move(data_); - } - - size_t size() const noexcept override { return size_; } - const phi::Place& place() const override { - PADDLE_ENFORCE_NOT_NULL( - data_, - phi::errors::Unavailable( - "Unable to visit place as data_ has not been initialized yet.")); - return data_->place(); - } - bool OwnsMemory() const noexcept override { return false; } - - private: - int64_t size_{0}; -}; - -class TensorStorage : public paddle::memory::allocation::Allocation { - public: - explicit TensorStorage(phi::intrusive_ptr storage) - : paddle::memory::allocation::Allocation( - storage->data(), storage->size(), storage->place()), - storage_(std::move(storage)) {} - - private: - phi::intrusive_ptr storage_; -}; - -} // namespace experimental -} // namespace paddle diff --git a/paddle/phi/api/lib/utils/tensor_utils.h b/paddle/phi/api/lib/utils/tensor_utils.h index 00199da1280e8..36a0901bbe980 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.h +++ b/paddle/phi/api/lib/utils/tensor_utils.h @@ -20,7 +20,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/variable.h" #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/compat/convert_utils.h" diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h index 9861bd68e4a9e..06d3e435bc110 100644 --- a/paddle/phi/core/dense_tensor.h +++ b/paddle/phi/core/dense_tensor.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include "paddle/phi/core/allocator.h" -#include "paddle/phi/core/storage.h" #include "paddle/phi/core/stream.h" #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" diff --git a/paddle/phi/core/dense_tensor.inl b/paddle/phi/core/dense_tensor.inl index 93513067a268b..01c19e8a55fdf 100644 --- a/paddle/phi/core/dense_tensor.inl +++ b/paddle/phi/core/dense_tensor.inl @@ -26,18 +26,6 @@ public: */ explicit DenseTensor(paddle::experimental::DataType dtype); -/// \brief Use existing storage space to create dense tensor. This interface -/// can be used to deliberately create an uninitialized dense tensor. -/// \param storage The existing storage. -/// \param meta The meta data of dense tensor. -DenseTensor(intrusive_ptr storage, const DenseTensorMeta& meta); - -/// \brief Use existing storage space to create dense tensor. This interface -/// can be used to deliberately create an uninitialized dense tensor. -/// \param storage The existing storage. -/// \param meta The meta data of dense tensor. -DenseTensor(intrusive_ptr storage, DenseTensorMeta&& meta); - inline bool IsInitialized() const { return holder_ != nullptr; } template diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc index 3c030cac2e7c9..8c97b6bf223fb 100644 --- a/paddle/phi/core/dense_tensor_impl.cc +++ b/paddle/phi/core/dense_tensor_impl.cc @@ -18,9 +18,10 @@ limitations under the License. */ #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/fluid/memory/malloc.h" + #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_utils.h" #endif @@ -211,13 +212,6 @@ LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::phi::dtype::complex) /* From framework::LoDTensor */ /* ------------------------------ */ -DenseTensor::DenseTensor(intrusive_ptr storage, - const DenseTensorMeta& meta) - : meta_(meta), holder_(storage->move_data_shared()) {} - -DenseTensor::DenseTensor(intrusive_ptr storage, DenseTensorMeta&& meta) - : meta_(std::move(meta)), holder_(storage->move_data_shared()) {} - DenseTensor::DenseTensor(const LoD& lod) : DenseTensor() { meta_.lod = lod; } void DenseTensor::set_lod(const LoD& lod) { meta_.lod = lod; } diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h index 3cdbfda61d69c..29afe0d0292d1 100644 --- a/paddle/phi/core/meta_tensor.h +++ b/paddle/phi/core/meta_tensor.h @@ -21,6 +21,8 @@ limitations under the License. */ #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" +#include "glog/logging.h" + namespace phi { // TODO(chenweihang): add other flags if needed diff --git a/paddle/phi/core/storage.cc b/paddle/phi/core/storage.cc deleted file mode 100644 index 0ddf5084464cc..0000000000000 --- a/paddle/phi/core/storage.cc +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/core/storage.h" - -namespace phi { - -void TensorStorage::Realloc(size_t size) { - this->Clear(); - data_ = alloc_->Allocate(size); - size_ = size; -} - -} // namespace phi diff --git a/paddle/phi/core/storage.h b/paddle/phi/core/storage.h deleted file mode 100644 index 24dc2c4a4f90b..0000000000000 --- a/paddle/phi/core/storage.h +++ /dev/null @@ -1,132 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "boost/intrusive_ptr.hpp" -#include "paddle/phi/common/place.h" -#include "paddle/phi/core/allocator.h" -#include "paddle/phi/core/utils/intrusive_ptr.h" -#include "paddle/phi/core/utils/intrusive_ref_counter.h" -#include "paddle/phi/core/utils/type_info.h" - -namespace phi { - -/// \brief The interface of contiguous storage used for the dense tensor. -/// It should be used in conjunction with the intrusive pointer. We prohibit -/// all default copy operations to ensure the integrity of the package. -class Storage : public intrusive_ref_counter { - public: - Storage() = default; - Storage(const Storage&) = delete; - - /* @jim19930609: Following interfaces will be modified/replaced/removed - as soon as the new Allocation - Allocator design get - finalized. - */ - - /* --------- shared_ptr -------- */ - // Initialize a Storage with unique Allocation - explicit Storage(std::shared_ptr&& data) - : data_(std::move(data)) {} - - // Initialize a Storage shareing Allocation with another storage - explicit Storage(const std::shared_ptr& data) - : data_(data) {} - - void* data() const { - return data_ ? 
reinterpret_cast( - reinterpret_cast(data_->ptr())) - : nullptr; - } - - const std::shared_ptr& data_shared() const { return data_; } - - virtual void set_data_shared( - const std::shared_ptr& holder) = 0; - - virtual std::shared_ptr&& move_data_shared() = 0; - - virtual void ReallocShared(size_t n) { - PADDLE_THROW(phi::errors::Unimplemented( - "ReallocShared has not been overrided by the current Storage")); - } - /* --------- shared_ptr -------- */ - - virtual ~Storage() = default; - - virtual void Clear() = 0; - - virtual size_t size() const = 0; - virtual const Place& place() const = 0; - virtual bool OwnsMemory() const = 0; - virtual void Realloc(size_t n) = 0; - - protected: - std::shared_ptr data_; -}; - -class TensorStorage : public Storage { - public: - explicit TensorStorage(Allocator* a) : alloc_(a) {} - - TensorStorage(Allocator* a, size_t size) - : Storage(a->Allocate(size)), alloc_(a) { - size_ = data_->size(); - } - - void Clear() override { - data_ = nullptr; - size_ = 0; - } - - void Realloc(size_t size) override; - - ~TensorStorage() = default; - - static const char* name() { return "TensorStorage"; } - - size_t size() const noexcept override { return size_; } - - const Place& place() const override { - if (!data_) { - PADDLE_THROW(phi::errors::Unimplemented( - "Unable to visit place: either data_ or alloc_ has to be initialized " - "first.")); - } - return data_->place(); - } - - bool OwnsMemory() const noexcept override { return true; } - - void set_data_shared( - const std::shared_ptr& holder) override { - CHECK(holder); - data_ = holder; - size_ = holder->size(); - } - - std::shared_ptr&& move_data_shared() override { - size_ = 0; - return std::move(data_); - } - - private: - Allocator* alloc_; - int64_t size_{0}; -}; - -} // namespace phi diff --git a/paddle/phi/core/string_tensor.cc b/paddle/phi/core/string_tensor.cc index 35444dc33fe78..0a4e0d6191510 100644 --- a/paddle/phi/core/string_tensor.cc +++ b/paddle/phi/core/string_tensor.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/core/string_tensor.h" -#include "paddle/phi/api/lib/utils/storage.h" +#include "paddle/fluid/memory/malloc.h" namespace phi { diff --git a/paddle/phi/core/string_tensor.h b/paddle/phi/core/string_tensor.h index 916c2a2bd4a4e..94c9974f4ad74 100644 --- a/paddle/phi/core/string_tensor.h +++ b/paddle/phi/core/string_tensor.h @@ -16,7 +16,6 @@ limitations under the License. 
*/ #include "paddle/phi/common/pstring.h" #include "paddle/phi/core/allocator.h" -#include "paddle/phi/core/storage.h" #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" diff --git a/paddle/phi/kernels/cpu/allclose_kernel.cc b/paddle/phi/kernels/cpu/allclose_kernel.cc index 80dea561956cf..f95ddc5621e9a 100644 --- a/paddle/phi/kernels/cpu/allclose_kernel.cc +++ b/paddle/phi/kernels/cpu/allclose_kernel.cc @@ -15,7 +15,7 @@ #include "paddle/phi/kernels/allclose_kernel.h" #include - +#include "glog/logging.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/reduce.h b/paddle/phi/kernels/cpu/reduce.h index b0e43b6526cdd..35395dccca1af 100644 --- a/paddle/phi/kernels/cpu/reduce.h +++ b/paddle/phi/kernels/cpu/reduce.h @@ -20,7 +20,6 @@ #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/cast_kernel.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/cpu/uniform_random_kernel.cc b/paddle/phi/kernels/cpu/uniform_random_kernel.cc index 91a6903418230..c95a8f4ded6dc 100644 --- a/paddle/phi/kernels/cpu/uniform_random_kernel.cc +++ b/paddle/phi/kernels/cpu/uniform_random_kernel.cc @@ -54,7 +54,6 @@ void UniformRandomRawKernel(const Context &dev_ctx, float diag_val, DenseTensor *out) { out->Resize(phi::make_ddim(shape.GetData())); - VLOG(4) << out->dims(); T *data = dev_ctx.template Alloc(out); auto size = out->numel(); std::shared_ptr engine; diff --git a/paddle/phi/kernels/sparse/convolution_kernel.h b/paddle/phi/kernels/sparse/convolution_kernel.h index 6120d6339a7eb..62a72a9dd4115 100644 --- a/paddle/phi/kernels/sparse/convolution_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_kernel.h @@ -14,7 +14,6 @@ limitations under the License. */ #pragma once -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" diff --git a/paddle/phi/kernels/sparse/copy_kernel.h b/paddle/phi/kernels/sparse/copy_kernel.h index a43621a4dfeed..70e2aaef8a888 100644 --- a/paddle/phi/kernels/sparse/copy_kernel.h +++ b/paddle/phi/kernels/sparse/copy_kernel.h @@ -14,7 +14,6 @@ limitations under the License. */ #pragma once -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index ff2647de731d7..b208e70e04046 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -22,6 +22,8 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/sparse/common_shape.h" #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" +#include "paddle/fluid/platform/enforce.h" + namespace phi { namespace sparse { diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index d39790fcea5e3..93abf70b24412 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -14,7 +14,6 @@ limitations under the License. 
*/ #pragma once -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" diff --git a/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc b/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc index 156cea63f171c..41889f9cc5ed7 100644 --- a/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc +++ b/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include "paddle/phi/kernels/strings/strings_copy_kernel.h" #include "paddle/phi/core/kernel_registry.h" +#include "glog/logging.h" + namespace phi { namespace strings { diff --git a/paddle/phi/kernels/strings/strings_empty_kernel.h b/paddle/phi/kernels/strings/strings_empty_kernel.h index 1add1963614d8..8a014f2a78c2c 100644 --- a/paddle/phi/kernels/strings/strings_empty_kernel.h +++ b/paddle/phi/kernels/strings/strings_empty_kernel.h @@ -15,7 +15,6 @@ #pragma once #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/string_tensor.h" #include "paddle/phi/infermeta/strings/nullary.h" diff --git a/paddle/phi/kernels/strings/strings_lower_upper_kernel.h b/paddle/phi/kernels/strings/strings_lower_upper_kernel.h index 97f530164528a..db6c267a8586d 100644 --- a/paddle/phi/kernels/strings/strings_lower_upper_kernel.h +++ b/paddle/phi/kernels/strings/strings_lower_upper_kernel.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/string_tensor.h" #include "paddle/phi/infermeta/strings/unary.h" #include "paddle/phi/kernels/strings/case_utils.h" diff --git a/paddle/phi/tests/api/scale_api.h b/paddle/phi/tests/api/scale_api.h index 0217ba23b2274..16143fb11e0ff 100644 --- a/paddle/phi/tests/api/scale_api.h +++ b/paddle/phi/tests/api/scale_api.h @@ -19,7 +19,6 @@ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/tests/core/test_custom_kernel.cc b/paddle/phi/tests/core/test_custom_kernel.cc index 634edaec96d29..abd77e2862410 100644 --- a/paddle/phi/tests/core/test_custom_kernel.cc +++ b/paddle/phi/tests/core/test_custom_kernel.cc @@ -23,7 +23,6 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/kernel_context.h" diff --git a/paddle/phi/tests/core/test_dense_tensor.cc b/paddle/phi/tests/core/test_dense_tensor.cc index ddfa184df2c1e..42814317b9c83 100644 --- a/paddle/phi/tests/core/test_dense_tensor.cc +++ b/paddle/phi/tests/core/test_dense_tensor.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "gtest/gtest.h" +#include "glog/logging.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/tests/core/allocator.h" diff --git a/paddle/phi/tests/core/test_sparse_coo_tensor.cc b/paddle/phi/tests/core/test_sparse_coo_tensor.cc index 5d0e16b0528e7..5e7642bbfdcb0 100644 --- a/paddle/phi/tests/core/test_sparse_coo_tensor.cc +++ b/paddle/phi/tests/core/test_sparse_coo_tensor.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "gtest/gtest.h" +#include "glog/logging.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/tests/core/allocator.h" diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index 0de60c14d3a42..4e98985c9b111 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -195,7 +195,6 @@ def source_include(header_file_path): #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/infermeta/binary.h" #include "paddle/phi/infermeta/multiary.h" diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py index 502c221952fb4..886748eeb290e 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/python/paddle/utils/code_gen/backward_api_gen.py @@ -209,7 +209,6 @@ def source_include(header_file_path): #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/api/include/api.h" #include "paddle/phi/infermeta/backward.h" diff --git a/python/paddle/utils/code_gen/intermediate_api_gen.py b/python/paddle/utils/code_gen/intermediate_api_gen.py index 2df3ac643614e..4e4875b596192 100644 --- a/python/paddle/utils/code_gen/intermediate_api_gen.py +++ b/python/paddle/utils/code_gen/intermediate_api_gen.py @@ -44,7 +44,6 @@ def source_include(header_file_path): #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/sparse_api_custom_impl.h" -#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/infermeta/binary.h" #include "paddle/phi/infermeta/multiary.h" From 2cb61405abcab502c07be750151ed0773175094e Mon Sep 17 00:00:00 2001 From: xiongkun Date: Mon, 23 May 2022 14:23:25 +0800 Subject: [PATCH 010/109] add is_train into the cache key (#42889) * add is_train into the cache key * fix unittest error * add unittest * remove import --- .../dygraph_to_static/program_translator.py | 27 ++++++--- .../dygraph_to_static/test_drop_path.py | 55 +++++++++++++++++++ .../dygraph_to_static/test_partial_program.py | 15 ++--- 3 files changed, 83 insertions(+), 14 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/dygraph_to_static/test_drop_path.py diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index b860740f71b25..2efb6965085de 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -197,10 +197,12 @@ def from_func_and_args(cls, 
function_spec, args, kwargs, class_instance): def __hash__(self): error_msg = "Arguments to a `@paddle.jit.to_static` must be a hashable Python objects (or nested structures of these types)." with_hook = self.kwargs.get("with_hook", False) - return hash((id(self.function_spec), - make_hashable(self.input_args_with_spec, error_msg), - make_hashable(self.input_kwargs_with_spec, error_msg), - self._spec_names_id, self.class_instance, with_hook)) + is_train = self.kwargs.get("is_train", False) + return hash( + (id(self.function_spec), + make_hashable(self.input_args_with_spec, error_msg), + make_hashable(self.input_kwargs_with_spec, error_msg), + self._spec_names_id, self.class_instance, with_hook, is_train)) def __eq__(self, other): return (type(self) is type(other)) and hash(self) == hash(other) @@ -357,7 +359,7 @@ def __call__(self, *args, **kwargs): try: concrete_program, partial_program_layer = self.get_concrete_program( - *args, **kwargs) + *args, **kwargs, is_train=self._is_train_mode()) # 3. synchronize self.training attribute. if isinstance(self._class_instance, layers.Layer): @@ -383,6 +385,12 @@ def __call__(self, *args, **kwargs): " if you can't handle this {} yourself.".format(type(e))) raise e + def _is_train_mode(self): + if self._class_instance is not None: + return self._class_instance.training + else: + return self._training + def _call_dygraph_function(self, *args, **kwargs): """ Calls dygraph function directly and returns the outputs. @@ -415,6 +423,8 @@ def get_concrete_program(self, *args, **kwargs): """ with_hook = kwargs.get("with_hook", False) + is_train = kwargs.get("is_train", True) + if "is_train" in kwargs: kwargs.pop("is_train") if "with_hook" in kwargs: kwargs.pop("with_hook") # 1. unify args/kwargs and replace Tensor with InputSpec if len(args) != len(self._function_spec.args_name): @@ -430,7 +440,8 @@ def get_concrete_program(self, *args, **kwargs): input_kwargs_with_spec, self._class_instance, **self._kwargs, - with_hook=with_hook) + with_hook=with_hook, + is_train=is_train) # 3. check whether hit the cache or build a new program for the input arguments concrete_program, partial_program_layer = self._program_cache[cache_key] @@ -525,7 +536,9 @@ def concrete_program_specify_input_spec(self, has_input_spec = (desired_input_spec is not None) if has_input_spec: concrete_program, _ = self.get_concrete_program( - *desired_input_spec, with_hook=with_hook) + *desired_input_spec, + with_hook=with_hook, + is_train=self._is_train_mode()) return concrete_program else: raise ValueError( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_drop_path.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_drop_path.py new file mode 100644 index 0000000000000..7383c834ba9a4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_drop_path.py @@ -0,0 +1,55 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np + +import paddle + + +def drop_path(x, training=False): + if not training: + return x + else: + return 2 * x + + +class DropPath(paddle.nn.Layer): + def __init__(self): + super(DropPath, self).__init__() + + @paddle.jit.to_static + def forward(self, x): + return drop_path(x, self.training) + + +class TestTrainEval(unittest.TestCase): + def setUp(self): + self.model = DropPath() + + def tearDown(self): + pass + + def test_train_and_eval(self): + x = paddle.to_tensor([1, 2, 3]).astype("int64") + eval_out = x.numpy() + train_out = x.numpy() * 2 + self.model.train() + self.assertTrue(np.allclose(self.model(x).numpy(), train_out)) + self.model.eval() + self.assertTrue(np.allclose(self.model(x).numpy(), eval_out)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py index 427e4c2252451..4f55dbd324c21 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py @@ -135,22 +135,23 @@ def test_switch_eval_and_train(self): x = fluid.dygraph.to_variable(x_data) linear_net(x) - _, partial_layer = linear_net.forward.program_cache.last()[-1] + _, train_partial_layer = linear_net.forward.program_cache.last()[-1] # check default mode is for training - self.assertEqual(partial_layer.program, - partial_layer._train_program) + self.assertEqual(train_partial_layer.program, + train_partial_layer._train_program) # switch to run test program after `eval()` linear_net.eval() linear_net(x) - self.assertEqual(partial_layer.program, - partial_layer._infer_program) + _, eval_partial_layer = linear_net.forward.program_cache.last()[-1] + self.assertEqual(eval_partial_layer.program, + eval_partial_layer._infer_program) # switch back into training linear_net.train() linear_net(x) - self.assertEqual(partial_layer.program, - partial_layer._train_program) + self.assertEqual(train_partial_layer.program, + train_partial_layer._train_program) class TestWithNoGrad(unittest.TestCase): From e5ebd347af93c698fead20d1f09aa577f89263e5 Mon Sep 17 00:00:00 2001 From: pangyoki Date: Mon, 23 May 2022 15:29:38 +0800 Subject: [PATCH 011/109] support backward inplace for eager dygraph mode (#42795) * support inplace in backward * fix final_state_linear * fix format of backward_inplace_map * little change * add subtract in yaml * fix hook mem leak * fix hook use_count * little format change * fix Co-authored-by: JiabinYang <360788950@qq.com> --- .../final_state_generator/codegen_utils.py | 34 +++++---- .../final_state_generator/eager_gen.py | 69 ++++++++++++++++--- .../final_state_generator/python_c_gen.py | 16 ++--- paddle/fluid/eager/tensor_wrapper.h | 4 ++ paddle/fluid/eager/utils.cc | 27 ++++++++ paddle/fluid/eager/utils.h | 3 + python/paddle/tensor/manipulation.py | 2 +- python/paddle/utils/code_gen/backward.yaml | 8 +++ 8 files changed, 131 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 3f4fcc4608eeb..bca6577ffd64e 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -307,6 +307,23 @@ def ParseYamlBackward(args_str, returns_str): return 
inputs_list, attrs_list, returns_list +def ParseYamlInplaceInfo(string): + # inplace_map_str: "(x -> out0), (y -> out2)" + inplace_map = {} + for pair in string.split(","): + pair = pair.strip() + if pair.startswith("("): + pair = pair[1:] + + if pair.endswith(")"): + pair = pair[:-1] + + key = pair.split("->")[0].strip() + val = pair.split("->")[1].strip() + inplace_map[key] = val + return inplace_map + + ######################## ### Generator Base ### ######################## @@ -334,25 +351,14 @@ def __init__(self, forward_api_contents, namespace): self.optional_inputs = [] #[name, ...] self.no_need_buffers = [] #[name, ...] self.intermediate_outputs = [] #[name, ...] - self.inplace_map = {} #{name : name, ...} + self.forward_inplace_map = {} #{name : name, ...} - def ParseInplaceInfo(self): + def ParseForwardInplaceInfo(self): forward_api_contents = self.forward_api_contents if 'inplace' not in forward_api_contents.keys(): return - # inplace_map_str: "(x -> out0), (y -> out2)" inplace_map_str = forward_api_contents['inplace'] - for pair in inplace_map_str.split(","): - pair = pair.strip() - if pair.startswith("("): - pair = pair[1:] - - if pair.endswith(")"): - pair = pair[:-1] - - key = pair.split("->")[0].strip() - val = pair.split("->")[1].strip() - self.inplace_map[key] = val + self.forward_inplace_map = ParseYamlInplaceInfo(inplace_map_str) def ParseNoNeedBuffer(self): grad_api_contents = self.grad_api_contents diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 1ce5216ddce9d..9bee8f5f29753 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -29,6 +29,7 @@ from codegen_utils import GetInplacedFunctionName from codegen_utils import ParseYamlArgs, ParseYamlReturns, ParseYamlForwardFromBackward from codegen_utils import ParseYamlForward, ParseYamlBackward +from codegen_utils import ParseYamlInplaceInfo from codegen_utils import FunctionGeneratorBase, GeneratorBase from codegen_utils import ops_to_fill_zero_for_empty_grads from codegen_utils import AssertMessage, GetIndent @@ -347,6 +348,16 @@ class {} : public egr::GradNodeBase {{ if( {}.impl() ) {}_optional = paddle::make_optional({}); """ +CHECK_BACKWARD_INPLACE_TEMPLATE = \ +""" + bool can_be_inplaced = false; + if ({}.initialized()) {{ + VLOG(10) << {}.name() << "({}) use_count: " << {}.impl().use_count(); + if ({}.impl().use_count() == 1 || ({}.impl().use_count() == 2 && {}.impl().get() == {}.impl().get())) {{ + can_be_inplaced = true; + }} + }}""" + CHECK_NAN_AND_INF_TEMPLATE = \ """ if (FLAGS_check_nan_inf) {{ egr::CheckTensorHasNanOrInf("{}", {}); }} """ @@ -407,7 +418,7 @@ def __init__(self, forward_api_contents, grad_api_contents, namespace): #self.optional_inputs #self.no_need_buffers #self.intermediate_outputs - #self.inplace_map + #self.forward_inplace_map FunctionGeneratorBase.__init__(self, forward_api_contents, namespace) self.grad_api_contents = grad_api_contents @@ -438,6 +449,15 @@ def __init__(self, forward_api_contents, grad_api_contents, namespace): self.backward_grad_outputs_map = { } #{ "name" : [type, fwd_position, orig_position] ...} + self.backward_inplace_map = {} #{name : name, ...} + + def ParseBackwardInplaceInfo(self): + grad_api_contents = self.grad_api_contents + if 'inplace' not in grad_api_contents.keys(): return + + inplace_map_str = grad_api_contents['inplace'] + 
self.backward_inplace_map = ParseYamlInplaceInfo(inplace_map_str) + def DygraphYamlValidationCheck(self): forward_api_contents = self.forward_api_contents grad_api_contents = self.grad_api_contents @@ -777,8 +797,9 @@ def run(self): ########################## ## Parsing Raw Contents ## ########################## - # Parse inplace_map - self.ParseInplaceInfo() + # Parse forward and backward inplace_map + self.ParseForwardInplaceInfo() + self.ParseBackwardInplaceInfo() # Parse no_need_buffer self.ParseNoNeedBuffer() @@ -837,7 +858,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): optional_inputs = self.optional_inputs intermediate_outputs = self.intermediate_outputs - inplace_map = self.inplace_map if is_inplaced else {} + forward_inplace_map = self.forward_inplace_map if is_inplaced else {} indent = GetIndent(1) # Get Function Args @@ -869,7 +890,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): f"auto NEW_{name} = ({name}.get_ptr() != nullptr) ? paddle::make_optional(NEW_{name}_temp_tensor) : {name};\n" ) else: - if is_inplaced and inplace_map and name in inplace_map.keys( + if is_inplaced and forward_inplace_map and name in forward_inplace_map.keys( ): arg_str = f"paddle::experimental::Tensor& {name}" amp_tensors_vector_list.append(f"{{{name}}}") @@ -944,13 +965,15 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): returns_list[pos] = f"{name}" if IsPlainTensorType(rtype): - if is_inplaced and inplace_map and name in inplace_map.values(): + if is_inplaced and forward_inplace_map and name in forward_inplace_map.values( + ): returns_type_list[pos] = "paddle::experimental::Tensor&" else: returns_type_list[pos] = "paddle::experimental::Tensor" else: assert IsVectorTensorType(rtype) - if is_inplaced and inplace_map and name in inplace_map.values(): + if is_inplaced and forward_inplace_map and name in forward_inplace_map.values( + ): returns_type_list[ pos] = "std::vector&" else: @@ -1014,7 +1037,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): check_inplace_str = "" bump_inplace_version_str = "" if is_inplaced: - for inplace_name in inplace_map.keys(): + for inplace_name in forward_inplace_map.keys(): inplace_autograd_meta_name = GetAutoGradMetaName(inplace_name) check_inplace_str += CHECK_INPLACE_TEMPLATE.format( inplace_name, inplace_autograd_meta_name) @@ -1258,6 +1281,7 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, backward_grad_inputs_map = self.backward_grad_inputs_map backward_grad_outputs_map = self.backward_grad_outputs_map backward_attrs_list = self.backward_attrs_list + backward_inplace_map = self.backward_inplace_map indent = GetIndent(1) # Construct grad_api function args @@ -1282,6 +1306,7 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, else: fill_zero_str += f"{indent}egr::EagerUtils::FillZeroForEmptyGradInput(&grads[{fwd_position}], input_metas[{fwd_position}]);\n" + inplace_grad_input_str = "" # Grad Ins from TensorWrappers for name, (_, is_fwd_input, grad_api_position), in backward_forward_inputs_map.items(): @@ -1290,6 +1315,14 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, is_optional = (name in self.optional_inputs) tensor_wrapper_recover_str = f"{indent}auto {transformed_tensor_name} = egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name});" + if backward_inplace_map and name in backward_inplace_map.keys(): + tensor_wrapper_intermidiate_tensor_str = f"(&this->{tensor_wrapper_name})->get_intermidiate_tensor()" + 
tensor_wrapper_recover_str += CHECK_BACKWARD_INPLACE_TEMPLATE.format( + transformed_tensor_name, transformed_tensor_name, name, + transformed_tensor_name, transformed_tensor_name, + transformed_tensor_name, transformed_tensor_name, + tensor_wrapper_intermidiate_tensor_str) + inplace_grad_input_str = transformed_tensor_name if is_optional: tensor_wrapper_recover_str += "\n" + CREATE_RECOVER_OPTIONAL_TENSOR_TEMPLATE.format( transformed_tensor_name, transformed_tensor_name, @@ -1312,6 +1345,16 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, if IsPlainTensorType(ttype): get_tensor_str = f"{indent}auto& {transformed_tensor_name} = hooked_grads[{fwd_position}][0];" + # Inplace in backward op + if backward_inplace_map and name in backward_inplace_map.keys(): + grads_tensor_str = f"grads[{fwd_position}][0]" + get_tensor_str += CHECK_BACKWARD_INPLACE_TEMPLATE.format( + transformed_tensor_name, transformed_tensor_name, name, + transformed_tensor_name, transformed_tensor_name, + transformed_tensor_name, transformed_tensor_name, + grads_tensor_str) + inplace_grad_input_str = transformed_tensor_name + if is_optional: get_tensor_str += "\n" + CREATE_PLAIN_OPTIONAL_TENSOR_TEMPLATE.format( transformed_tensor_name, transformed_tensor_name, @@ -1357,8 +1400,16 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, grad_api_args.append(f"api_output_{out_index}") if IsPlainTensorType(ttype): + inplace_for_grad_outs_str = "" + if backward_inplace_map and name in backward_inplace_map.values( + ): + inplace_for_grad_outs_str = f""" +{indent}if (api_output_{out_index} != nullptr && can_be_inplaced) {{ +{indent} egr::EagerUtils::HandleViewBetweenInputAndOutput({inplace_grad_input_str}, api_output_{out_index}); +{indent}}}""" + grad_function_call_str += f""" - auto* api_output_{out_index} = (out_metas[{fwd_position}].empty() || out_metas[{fwd_position}][0].IsStopGradient()) ? nullptr : &returns[{fwd_position}][0];""" + auto* api_output_{out_index} = (out_metas[{fwd_position}].empty() || out_metas[{fwd_position}][0].IsStopGradient()) ? 
nullptr : &returns[{fwd_position}][0];{inplace_for_grad_outs_str}""" else: assert IsVectorTensorType(ttype) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 7672e49f368ce..602d38510c04f 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -259,7 +259,7 @@ def __init__(self, forward_api_contents, namespace): #self.optional_inputs #self.no_need_buffers #self.intermediate_outputs - #self.inplace_map + #self.forward_inplace_map FunctionGeneratorBase.__init__(self, forward_api_contents, namespace) self.is_forward_only = True @@ -275,7 +275,7 @@ def CollectIsForwardOnly(self): def GeneratePythonCFunction(self): namespace = self.namespace - inplace_map = self.inplace_map + forward_inplace_map = self.forward_inplace_map forward_api_name = self.forward_api_name orig_forward_attrs_list = self.orig_forward_attrs_list forward_inputs_position_map = self.forward_inputs_position_map @@ -359,7 +359,7 @@ def GeneratePythonCFunction(self): forward_api_name_prefix, forward_api_name, namespace, forward_api_name, forward_api_name) - if inplace_map: + if forward_inplace_map: inplaced_forward_api_name = GetInplacedFunctionName( self.forward_api_name) if is_forward_only: @@ -372,9 +372,9 @@ def GeneratePythonCFunction(self): GetForwardFunctionName(inplaced_forward_api_name)) assert len( - inplace_map - ) == 1, f"size of inplace_map must be 1, but inplace_map of \"{forward_api_name}\" op got {len(inplace_map)}" - for inplace_input, inplace_output in inplace_map.items(): + forward_inplace_map + ) == 1, f"size of inplace_map must be 1, but inplace_map of \"{forward_api_name}\" op got {len(forward_inplace_map)}" + for inplace_input, inplace_output in forward_inplace_map.items(): return_str = RETURN_INPLACE_PYOBJECT_TEMPLATE.format( inplaced_forward_api_name, inplace_input, inplaced_forward_api_name, inplace_output) @@ -401,8 +401,8 @@ def run(self): # Initialized optional_inputs self.ParseDispensable() - # Initialized inplace_map - self.ParseInplaceInfo() + # Initialized forward_inplace_map + self.ParseForwardInplaceInfo() # Initialized orig_forward_inputs_list, orig_forward_returns_list, orig_forward_attrs_list self.CollectOriginalForwardInfo() diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 8893e0ed7ee0a..495f7f2e42c59 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -116,6 +116,10 @@ class TensorWrapper { return recovered_tensor; } + paddle::experimental::Tensor get_intermidiate_tensor() { + return intermidiate_tensor_; + } + void clear() { intermidiate_tensor_.reset(); } private: diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index f253c4cb51380..d22f4316d5604 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -271,6 +271,33 @@ void EagerUtils::HandleViewBetweenInputAndOutput( } } +void EagerUtils::HandleViewBetweenInputAndOutput( + const paddle::experimental::Tensor& input_tensor, + paddle::experimental::Tensor* view_output_tensor) { + PADDLE_ENFORCE_EQ( + input_tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "Tensor %s has not been initialized!", input_tensor.name())); + + if (input_tensor.is_dense_tensor()) { + auto input_dense_tensor = + std::dynamic_pointer_cast(input_tensor.impl()); + if 
(view_output_tensor->impl() == nullptr) {
+      view_output_tensor->set_impl(std::make_shared<phi::DenseTensor>());
+    }
+    auto view_output_dense_tensor =
+        std::dynamic_pointer_cast<phi::DenseTensor>(view_output_tensor->impl());
+    view_output_dense_tensor->ShareBufferWith(*input_dense_tensor);
+    view_output_dense_tensor->ShareInplaceVersionCounterWith(
+        *input_dense_tensor);
+
+    VLOG(3) << "Perform View between Output Tensor("
+            << view_output_tensor->name() << ") and Input Tensor("
+            << input_tensor.name()
+            << "), share allocation and inplace version.";
+  }
+}
+
 std::vector<paddle::experimental::Tensor> EagerUtils::GetOutputs(
     const std::vector<std::shared_ptr<EagerVariable>>& outs) {
   std::vector<paddle::experimental::Tensor> res;
diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h
index b96244f0d138b..7f5864ec887ca 100644
--- a/paddle/fluid/eager/utils.h
+++ b/paddle/fluid/eager/utils.h
@@ -172,6 +172,9 @@ class EagerUtils {
   static void HandleViewBetweenInputAndOutput(
       const std::shared_ptr<EagerVariable>& input_var,
       const std::shared_ptr<EagerVariable>& view_output_var);
+  static void HandleViewBetweenInputAndOutput(
+      const paddle::experimental::Tensor& input_tensor,
+      paddle::experimental::Tensor* view_output_tensor);
 
   // TensorWrapper Utils
   static paddle::experimental::Tensor RecoverTensorWrapper(TensorWrapper* tw);
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 973f870d581cd..57785c16e60bb 100755
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -3169,7 +3169,7 @@ def reshape(x, shape, name=None):
                 item.numpy().item(0) if isinstance(item, Variable) else item
                 for item in shape
             ]
-            out, _ = _C_ops.reshape2(x, None, 'shape', shape)
+            out = _C_ops.final_state_reshape(x, shape)
         elif isinstance(shape, tmp_tensor_type):
             shape.stop_gradient = True
             out, _ = _C_ops.reshape2(x, shape)
diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml
index a720c27543c06..ae2f6fbc1881d 100644
--- a/python/paddle/utils/code_gen/backward.yaml
+++ b/python/paddle/utils/code_gen/backward.yaml
@@ -66,6 +66,7 @@
     func : add_grad
     no_need_buffer : x, y
   backward : add_double_grad
+  inplace : (out_grad -> x_grad)
 
 - backward_api : add_n_grad
   forward : add_n (Tensor[] x) -> Tensor(out)
@@ -383,6 +384,7 @@
   kernel :
     func : cross_entropy_with_softmax_grad
     data_type : softmax
+  inplace : (softmax -> input_grad)
 
 - backward_api : cross_grad
   forward : cross (Tensor x, Tensor y, int axis = 9) -> Tensor(out)
@@ -646,6 +648,7 @@
     data_type: out_grad
     backend: out_grad
     layout: out_grad
+  inplace : (out_grad -> x_grad)
 
 - backward_api : flip_grad
   forward : flip (Tensor x, int[] axis) -> Tensor(out)
@@ -1492,6 +1495,7 @@
     backend: out_grad
     layout: out_grad
   backward : reshape_double_grad
+  inplace : (out_grad -> x_grad)
 
 - backward_api : roi_align_grad
   forward : roi_align (Tensor x, Tensor boxes, Tensor boxes_num, int pooled_height, int pooled_width, float spatial_scale, int sampling_ratio, bool aligned) -> Tensor(out)
@@ -1563,7 +1567,7 @@
   output : Tensor(x_grad)
   invoke : scale(out_grad, scale, 0.0, bias_after_scale)
   backward : scale_double_grad
+  inplace : (out_grad -> x_grad)
 
 - backward_api : scale_triple_grad
   forward : scale_double_grad (Tensor grad_grad_x, Scalar scale, float bias, bool bias_after_scale) -> Tensor(grad_grad_out)
@@ -1755,6 +1760,7 @@
     param: [xshape]
   kernel :
     func : squeeze_grad
+  inplace : (out_grad -> x_grad)
 
 - backward_api : stack_grad
   forward : stack (Tensor[] x, int axis) -> Tensor(out)
@@ -1802,6 +1808,7 @@
     func : subtract_grad
     no_need_buffer : x, y
   backward : subtract_double_grad
+  inplace : (out_grad -> x_grad)
 
 - 
backward_api : sum_double_grad forward : sum_grad (Tensor x, Tensor grad_out, int64_t[] dims, bool keep_dim, bool reduce_all=false) -> Tensor(grad_x) @@ -2025,6 +2032,7 @@ param: [xshape] kernel : func : unsqueeze_grad + inplace : (out_grad -> x_grad) - backward_api : where_grad forward : where (Tensor condition, Tensor x, Tensor y) -> Tensor(out) From 0211a833a42cb7a2e378a1f172798b65632d276d Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Mon, 23 May 2022 15:32:19 +0800 Subject: [PATCH 012/109] Add double grad yaml for celu/sqrt/rsqrt/square op (#42895) * add double grad yaml * fix bugs when compile infrt --- .../final_state_generator/codegen_utils.py | 3 +- paddle/phi/kernels/activation_kernel.h | 2 +- .../unittests/test_activation_nn_grad.py | 20 +++++++ .../tests/unittests/test_activation_op.py | 8 ++- python/paddle/nn/functional/activation.py | 4 +- .../paddle/tensor/layer_function_generator.py | 10 +++- python/paddle/utils/code_gen/api.yaml | 10 ++++ python/paddle/utils/code_gen/backward.yaml | 54 +++++++++++++++++++ 8 files changed, 105 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index bca6577ffd64e..5b48fb74f5383 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -28,7 +28,8 @@ "multiply_triple_grad", "conv2d_grad_grad", "batch_norm_double_grad", "tanh_double_grad", "tanh_triple_grad", "subtract_double_grad", "divide_double_grad", "log_double_grad", "elu_double_grad", - "leaky_relu_double_grad" + "leaky_relu_double_grad", "sqrt_double_grad", "rsqrt_double_grad", + "square_double_grad", "celu_double_grad" ]) # For API dispatch used at python-level diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h index a14f732b6c8b6..b719ceddc5563 100644 --- a/paddle/phi/kernels/activation_kernel.h +++ b/paddle/phi/kernels/activation_kernel.h @@ -78,7 +78,7 @@ DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Swish, beta) -DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(celu, alpha) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Celu, alpha) DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max) DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(STanh, scale_a, scale_b) diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index 955f2117778f0..919ae52447128 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -253,6 +253,9 @@ def test_grad(self): class TestCELUDoubleGradCheck(unittest.TestCase): + def celu_wrapper(self, x): + return paddle.nn.functional.celu(x[0], alpha=0.2) + @prog_scope() def func(self, place): shape = [2, 4, 4, 4] @@ -269,6 +272,8 @@ def func(self, place): x_arr = np.random.uniform(-1, 1, shape).astype(dtype) gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.celu_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() @@ -280,6 +285,9 @@ def test_grad(self): class TestSqrtDoubleGradCheck(unittest.TestCase): + 
def sqrt_wrapper(self, x): + return paddle.sqrt(x[0]) + @prog_scope() def func(self, place): shape = [2, 3, 7, 9] @@ -294,6 +302,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.sqrt_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() @@ -305,6 +315,9 @@ def test_grad(self): class TestRsqrtDoubleGradCheck(unittest.TestCase): + def rsqrt_wrapper(self, x): + return paddle.rsqrt(x[0]) + @prog_scope() def func(self, place): shape = [2, 3, 7, 9] @@ -319,6 +332,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.rsqrt_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() @@ -330,6 +345,9 @@ def test_grad(self): class TestSquareDoubleGradCheck(unittest.TestCase): + def square_wrapper(self, x): + return paddle.square(x[0]) + @prog_scope() def func(self, place): # the shape of input variable should be clearly specified, not inlcude -1. @@ -344,6 +362,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.square_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 58d8610ee352d..7be3b300d55a1 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -2003,6 +2003,7 @@ def setUp(self): self.op_type = "celu" self.init_dtype() + self.python_api = paddle.nn.functional.celu np.random.seed(1024) x = np.random.uniform(-3, 3, [10, 12]).astype(self.dtype) alpha = 1.5 @@ -2014,7 +2015,7 @@ def setUp(self): def test_check_grad(self): if self.dtype == np.float16: return - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class TestCELUAPI(unittest.TestCase): @@ -2080,6 +2081,11 @@ def test_errors(self): name='x_fp16', shape=[10, 12], dtype='float16') self.celu(x_fp16) + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_dygraph_api() + self.test_errors() + class TestReciprocal(TestActivation): def setUp(self): diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index e64efda7b33bf..6970cf4962909 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -63,8 +63,10 @@ def celu(x, alpha=1.0, name=None): if alpha == 0: raise ZeroDivisionError("alpha cannot be 0 for celu") - if in_dynamic_mode(): + if _in_legacy_dygraph(): return _C_ops.celu(x, 'alpha', alpha) + if in_dygraph_mode(): + return _C_ops.final_state_celu(x, alpha) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'celu') helper = LayerHelper("celu", **locals()) diff --git a/python/paddle/tensor/layer_function_generator.py b/python/paddle/tensor/layer_function_generator.py index ecb13613a125e..7f95dd60eda8a 100644 --- a/python/paddle/tensor/layer_function_generator.py +++ b/python/paddle/tensor/layer_function_generator.py @@ -21,7 +21,7 @@ from six.moves import cStringIO from ..static import Variable from ..fluid.proto import framework_pb2 -from ..framework import OpProtoHolder, core, convert_np_dtype_to_dtype_ +from ..framework import 
OpProtoHolder, core, convert_np_dtype_to_dtype_, _non_static_mode, in_dygraph_mode from ..framework import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype import paddle @@ -256,7 +256,13 @@ def generate_activation_fn(op_type): op_proto = OpProtoHolder.instance().get_op_proto(op_type) def func(x, name=None): - if paddle.in_dynamic_mode(): + final_state_op_type = "final_state_%s" % op_type + if in_dygraph_mode() and hasattr(_C_ops, final_state_op_type): + op = getattr(_C_ops, final_state_op_type) + return op(x) + # TODO(dev): Because some ops' yaml has not been migrated. + # Replace it with _in_legacy_dygraph while all yaml work is done. + if _non_static_mode(): op = getattr(_C_ops, op_type) return op(x) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 54a5100c892fc..6c15b4a012833 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -319,6 +319,16 @@ func : ceil backward : ceil_grad +- api : celu + args : (Tensor x, float alpha) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param: [x] + kernel : + func : celu + backward : celu_grad + # cholesky - api : cholesky args : (Tensor x, bool upper) diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index ae2f6fbc1881d..9b3d2d94c9341 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -232,6 +232,27 @@ kernel : func : ceil_grad +- backward_api : celu_double_grad + forward : celu_grad(Tensor x, Tensor grad_out, float alpha) -> Tensor(grad_x) + args : (Tensor x, Tensor grad_out, Tensor grad_x_grad, float alpha) + output : Tensor(x_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, x] + kernel : + func : celu_double_grad + +- backward_api : celu_grad + forward : celu(Tensor x, float alpha) -> Tensor(out) + args : (Tensor x, Tensor out_grad, float alpha) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [x] + kernel : + func : celu_grad + backward : celu_double_grad + - backward_api : cholesky_grad forward : cholesky (Tensor x, bool upper) -> Tensor(out) args : (Tensor out, Tensor out_grad, bool upper) @@ -1544,6 +1565,16 @@ kernel : func : round_grad +- backward_api : rsqrt_double_grad + forward : rsqrt_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) + args : (Tensor out, Tensor grad_x, Tensor grad_x_grad) + output : Tensor(out_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [out, out] + kernel : + func : rsqrt_double_grad + - backward_api : rsqrt_grad forward : rsqrt (Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) @@ -1553,6 +1584,7 @@ param : [out] kernel : func : rsqrt_grad + backward : rsqrt_double_grad - backward_api : scale_double_grad forward : scale_grad (Tensor grad_out, Scalar scale, float bias, bool bias_after_scale) -> Tensor(grad_x) @@ -1731,6 +1763,16 @@ invoke : concat( out_grad, axis) # TODO(zhangyunfei) The config of double grad and triple grad will be supported in the future. 
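For reference (illustration only, not part of the patch): each *_double_grad entry added in this commit is chained to its first-order grad api through the trailing `backward :` field, and the unit-test changes above exercise that chain by handing a small wrapper to the dygraph double-grad checker. A condensed sketch of the sqrt case, assuming the test-suite helper gradient_checker is importable and using a positive float64 input:

    import numpy as np
    import paddle
    import gradient_checker  # unittest helper shipped with the Paddle test suite

    def sqrt_wrapper(x):
        # the checker passes inputs as a list, so unwrap the single tensor
        return paddle.sqrt(x[0])

    paddle.enable_static()
    x = paddle.static.data('x', shape=[2, 3, 7, 9], dtype='float64')
    x.stop_gradient = False
    x.persistable = True
    y = paddle.sqrt(x)
    x_arr = np.random.uniform(0.1, 1.0, [2, 3, 7, 9]).astype('float64')
    # compares the eager-mode double grad (sqrt_double_grad) against the
    # static-graph reference; the real test wraps this in @prog_scope()
    gradient_checker.double_grad_check_for_dygraph(
        sqrt_wrapper, [x], y, x_init=x_arr, place=paddle.CPUPlace())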
+- backward_api : sqrt_double_grad + forward : sqrt_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) + args : (Tensor out, Tensor grad_x, Tensor grad_x_grad) + output : Tensor(out_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [out, out] + kernel : + func : sqrt_double_grad + - backward_api : sqrt_grad forward : sqrt (Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) @@ -1740,6 +1782,17 @@ param : [out] kernel : func : sqrt_grad + backward : sqrt_double_grad + +- backward_api : square_double_grad + forward : square_grad (Tensor x, Tensor grad_out) -> Tensor(grad_x) + args : (Tensor x, Tensor grad_out, Tensor grad_x_grad) + output : Tensor(x_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, x] + kernel : + func : square_double_grad - backward_api : square_grad forward : square (Tensor x) -> Tensor(out) @@ -1750,6 +1803,7 @@ param : [x] kernel : func : square_grad + backward : square_double_grad - backward_api : squeeze_grad forward : squeeze(Tensor x, int[] axes) -> Tensor(out), Tensor(xshape) From d414af940a956b51c0586b14f5b65265284bfe1a Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Mon, 23 May 2022 09:38:36 +0200 Subject: [PATCH 013/109] [Internal reviewing] NHWC fix to am_vocoder model for oneDNN 2.6 (#42729) * - prototype of reimplemented fixes * - compilation fixes * - compilation fix * - cosmetic info * - hopefully fix * - compilation fix * - supported for nested blocking of cache clearing * - fix * - Unit test to changes * - Compilation fix to windows (hopefully) * - Moved resetting layout to ResetBlob * - fixes after review --- paddle/fluid/framework/operator.cc | 3 +- .../controlflow/conditional_block_op.cc | 10 +++ .../fluid/operators/controlflow/while_op.cc | 9 +++ paddle/fluid/operators/crop_op.cc | 2 +- .../operators/mkldnn/nhwc_op_tests.cmake | 2 +- .../operators/mkldnn/test_mkldnn_op_nhwc.cc | 65 +++++++++++++++++++ paddle/fluid/platform/device_context.cc | 23 +++++-- paddle/fluid/platform/device_context.h | 3 +- paddle/fluid/platform/mkldnn_helper.h | 2 - 9 files changed, 108 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 18287f0c7a4ee..d8eab0e9a7297 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1908,7 +1908,8 @@ Scope* OperatorWithKernel::PrepareData( (var->IsType() == true) && (expected_kernel_key.data_layout_ != DataLayout::kMKLDNN) && (paddle::platform::MKLDNNDeviceContext::tls() - .get_cur_paddle_data_layout() == DataLayout::kNHWC)) { + .get_cur_paddle_data_layout() == DataLayout::kNHWC) && + (tensor_in->dims().size() >= 3)) { // Mixed execution : MKL-DNN and GPU is not supported! if (!new_scope) { new_scope = &scope.NewScope(); diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index 6bf419c47a566..fd06e33a6bb6e 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -17,6 +17,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/assign_op.h" #include "paddle/phi/kernels/funcs/math_function.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace operators { @@ -65,6 +69,12 @@ class ConditionalBlockOp : public ConditionalOp { scopes->resize(1); scopes->front() = &scope.NewScope(); auto &cur_scope = *scopes->front(); +#ifdef PADDLE_WITH_MKLDNN + // (jczaja) Executor on being destroyed clears oneDNN cache and + // reset registered model data layout. This is unwanted for nested + // Executors (executors declared inside control ops) + platform::DontClearMKLDNNCache(dev_place); +#endif framework::Executor exec(dev_place); auto *block = Attr("sub_block"); VLOG(3) << "Conditional block.idx = " << block->ID() diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index eb44655c88f18..d8daa25f31be8 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -17,6 +17,9 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/controlflow/while_op_helper.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace framework { class InferShapeContext; @@ -66,6 +69,12 @@ class WhileOp : public framework::OperatorBase { "the Condition's shape is ", cond.dims().to_str(), ".\n")); +#ifdef PADDLE_WITH_MKLDNN + // (jczaja) Executor on being destroyed clears oneDNN cache and + // resets registered model data layout. This is unwanted for nested + // Executors (executors declared inside control ops) + platform::DontClearMKLDNNCache(dev_place); +#endif framework::Executor executor(dev_place); auto *block = Attr(kStepBlock); diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index f2beb4cec212e..9de5bc6ea3636 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -97,7 +97,7 @@ Crop Operator. Crop input into output, as specified by offsets and shape. There are two ways to set the offsets: -1. In runtime: Using the input 'Offsets', which is a Vairbale and can be +1. In runtime: Using the input 'Offsets', which is a Variable and can be output of other operators. This way is suitable for dynamic offsets. 2. 
In network configuration: Using the attribute 'offsets', which will be diff --git a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake index 3ebfbdc50caab..8bad3e86b2934 100644 --- a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake +++ b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake @@ -1 +1 @@ -cc_test(test_mkldnn_op_nhwc SRCS mkldnn/test_mkldnn_op_nhwc.cc DEPS op_registry pool_op shape_op activation_op pooling transpose_op scope device_context enforce executor) +cc_test(test_mkldnn_op_nhwc SRCS mkldnn/test_mkldnn_op_nhwc.cc DEPS op_registry pool_op shape_op crop_op activation_op pooling transpose_op scope device_context enforce executor) diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 4ff93ee3cd624..b9866ba8c3647 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -34,6 +34,8 @@ USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); USE_OP_ITSELF(shape); USE_OP_DEVICE_KERNEL(shape, MKLDNN); +USE_OP_ITSELF(crop); +USE_OP_DEVICE_KERNEL(crop, CPU); PD_DECLARE_KERNEL(pool2d, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); @@ -211,5 +213,68 @@ TEST(test_pool2d_shape_nhwc, cpu_place) { "Computed shape does not match expected shape")); } +TEST(test_pool2d_crop_nhwc, cpu_place) { + framework::DDim dims({1, 4, 8, 512}); // NHWC shape + framework::DDim expected_dims({1, 3, 7, 512}); // NCHW expected shape + platform::CPUPlace p; + framework::Scope scope; + + InputVars input_name = {"x", + scope.Var("x")->GetMutable()}; + InputVars second_crop_input_name = { + "v", scope.Var("v")->GetMutable()}; + // Initialize input data + std::uniform_real_distribution dist(10.0f, 20.0f); + std::mt19937 engine; + size_t numel = static_cast(phi::product(dims)); + input_name.tensor->Resize(dims); + auto data_ptr = input_name.tensor->mutable_data(p); + for (size_t i = 0; i < numel; ++i) { + data_ptr[i] = dist(engine); + } + // Second input (Y) to crop is having no buffer + // but as it is MKLDNN then its shape order should be NCHW + auto expected_dims_nchw = phi::vectorize(expected_dims); + std::rotate(expected_dims_nchw.begin() + 1, expected_dims_nchw.end() - 1, + expected_dims_nchw.end()); + second_crop_input_name.tensor->Resize(phi::make_ddim(expected_dims_nchw)); + const auto second_crop_input_md = + dnnl::memory::desc(expected_dims_nchw, dnnl::memory::data_type::f32, + dnnl::memory::format_tag::nhwc); + second_crop_input_name.tensor->set_mem_desc(second_crop_input_md); + + scope.Var("y")->GetMutable(); + auto *z = scope.Var("z")->GetMutable(); + + auto &pool = platform::DeviceContextPool::Instance(); + + // Make pool2d followed by crop. 
crop may have Y input as + // non buffered so the path to be executed is handling oneDNN kernel + // that is followed by CPU kernel with non-buffered Input + + auto ksize = std::vector(2, 2); + auto op_pool = framework::OpRegistry::CreateOp( + "pool2d", {{"X", {"x"}}}, {{"Out", {"y"}}}, + {{"pooling_type", {std::string("max")}}, + {"ksize", {ksize}}, + {"data_format", {std::string("NHWC")}}, + {"use_mkldnn", {true}}}); + + std::vector offsets{0, 0, 0, 0}; + auto op_crop = framework::OpRegistry::CreateOp( + "crop", {{"X", {"y"}}, {"Y", {"v"}}}, {{"Out", {"z"}}}, + {{"offsets", {offsets}}}); + + op_pool->Run(scope, p); + op_crop->Run(scope, p); + + pool.Get(p)->Wait(); + + // Verify shape of output + PADDLE_ENFORCE_EQ(z->dims(), expected_dims, + platform::errors::InvalidArgument( + "Output shape does not match expected output shape")); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 0bf5ca7f8f525..09a29c3429cba 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -750,7 +750,7 @@ dnnl::stream& MKLDNNDeviceContextThreadLocals::Body::get_stream(void) { void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { VLOG(4) << tls().get_curr_exec() << " " << ptr; std::lock_guard lock(*p_mutex_); - if (!block_next_cache_clearing_) { + if (block_next_cache_clearing_ == 0) { VLOG(3) << "Clearing DNNL cache."; // If no specific executor pointer then clear // everything. For executor pointer then clear only @@ -768,9 +768,20 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { s.second->erase(ptr); } } + // Reset paddle layout to NCHW + VLOG(3) << "Resetting Paddle data layout to NCHW."; + platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout( + paddle::framework::DataLayout::kNCHW); } else { - VLOG(3) << "Prevented Clearing DNNL cache."; - block_next_cache_clearing_ = false; + --block_next_cache_clearing_; + VLOG(3) << "Prevented Clearing DNNL cache. Updated " + "block_next_cache_clearing_ : " + << block_next_cache_clearing_; + PADDLE_ENFORCE_GE(block_next_cache_clearing_, 0, + platform::errors::InvalidArgument( + "Cache clearing mark should be non-negative " + ". But received %d.", + block_next_cache_clearing_)); } } @@ -796,8 +807,10 @@ void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t pblob, void MKLDNNDeviceContext::BlockNextCacheClearing() { std::lock_guard lock(*p_mutex_); - VLOG(3) << "Next DNNL cache clearing has been blocked."; - block_next_cache_clearing_ = true; + ++block_next_cache_clearing_; + VLOG(3) << "Next DNNL cache clearing has been blocked. Updated " + "block_next_cache_clearing_ : " + << block_next_cache_clearing_; } size_t MKLDNNDeviceContext::GetShapeBlobSize() const { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 2b53ecf86a641..a63d41405f1b2 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -850,7 +850,8 @@ class MKLDNNDeviceContext : public CPUDeviceContext { // to erase std::shared_ptr p_exec_items_; std::shared_ptr p_mutex_; - bool block_next_cache_clearing_ = false; + // 0 - clearing is allowed. x > 0 do not clear. 
+ unsigned int block_next_cache_clearing_ = 0; }; #endif diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 94c0124440ea9..5e77046962931 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -148,8 +148,6 @@ inline void ClearMKLDNNCache(const platform::Place& place, platform::MKLDNNDeviceContext* dev_ctx = (platform::MKLDNNDeviceContext*)pool.Get(place); dev_ctx->ResetBlobMap(ptr); - platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout( - paddle::framework::DataLayout::kNCHW); } } From fba94b9f1efee2530dab9e69cb35e28c3ac92a06 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Mon, 23 May 2022 21:46:53 +0800 Subject: [PATCH 014/109] [Eager] Remove _enable_legacy for bfgs (#42936) --- python/paddle/fluid/tests/unittests/test_bfgs.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_bfgs.py b/python/paddle/fluid/tests/unittests/test_bfgs.py index 1a12913bc72e9..8a9f9f72aa068 100644 --- a/python/paddle/fluid/tests/unittests/test_bfgs.py +++ b/python/paddle/fluid/tests/unittests/test_bfgs.py @@ -22,9 +22,6 @@ from paddle.incubate.optimizer.functional.bfgs import minimize_bfgs from paddle.fluid.framework import _test_eager_guard -from paddle.fluid.framework import _enable_legacy_dygraph -_enable_legacy_dygraph() - np.random.seed(123) From e3ee2ad845d6169f2596ec850a6527aca4330478 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Mon, 23 May 2022 22:02:01 +0800 Subject: [PATCH 015/109] sync stop_gradient in ParamBase. Fix the Different Behavior between Eval and Train (#42899) --- python/paddle/fluid/dygraph/varbase_patch_methods.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 8049a8b8741b1..add3d73efc7e1 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -101,8 +101,11 @@ def _to_static_var(self, to_parameter=False, **kwargs): # Note: getattr(self, attr, None) will call x.grad=x.gradient(), but gradient() only available in dygraph. # It will fail. So, for propery that different between dynamic and static graph, should not getattr(self, attr, None). 
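For context (an illustrative sketch, not part of the diff): stop_gradient and trainable are exactly the flags whose values can differ between a dygraph ParamBase and the static Parameter produced by _to_static_var, which is what made eval and train disagree before this fix. A minimal sketch of the situation it targets, assuming a frozen parameter on a Layer converted with paddle.jit.to_static:

    import paddle

    net = paddle.nn.Linear(4, 4)
    net.weight.stop_gradient = True      # freeze the weight in dygraph

    static_net = paddle.jit.to_static(net)
    out = static_net(paddle.randn([2, 4]))

    # with this patch the parameter created during static conversion keeps
    # stop_gradient=True (and the matching trainable flag), so the same set
    # of parameters is updated whether the layer runs in train or eval mode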
attr_not_need_keys = ['grad', 'T', 'place', '_place_str'] + param_keys = ['stop_gradient', 'trainable'] if isinstance(self, (ParamBase, EagerParamBase)): attr_kwargs = self.__dict__.copy() + for key in param_keys: + attr_kwargs[key] = getattr(self, key) else: attr_names = [] for name in dir(self): From 615d931c0a08f9a41d4e2a7a2f55cba07e691dc9 Mon Sep 17 00:00:00 2001 From: Jiabin Yang <360788950@qq.com> Date: Mon, 23 May 2022 22:06:38 +0800 Subject: [PATCH 016/109] Support to onnx test (#42698) * support to onnx test * add comments * remove log * remove log * update paddle2onnx version --- .../fluid/tests/unittests/test_onnx_export.py | 41 ++++++++++--------- python/unittest_py/requirements.txt | 2 +- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_onnx_export.py b/python/paddle/fluid/tests/unittests/test_onnx_export.py index 5efd586d849d1..07016d4290102 100644 --- a/python/paddle/fluid/tests/unittests/test_onnx_export.py +++ b/python/paddle/fluid/tests/unittests/test_onnx_export.py @@ -21,7 +21,7 @@ import paddle from paddle.static import InputSpec -from paddle.fluid.framework import in_dygraph_mode +from paddle.fluid.framework import in_dygraph_mode, _test_eager_guard class LinearNet(paddle.nn.Layer): @@ -45,43 +45,46 @@ def forward(self, x, y, z): class TestExportWithTensor(unittest.TestCase): - def setUp(self): + def func_with_tensor(self): self.x_spec = paddle.static.InputSpec( shape=[None, 128], dtype='float32') - - def test_with_tensor(self): - if in_dygraph_mode(): - return model = LinearNet() paddle.onnx.export(model, 'linear_net', input_spec=[self.x_spec]) + def test_with_tensor(self): + with _test_eager_guard(): + self.func_with_tensor() + self.func_with_tensor() + class TestExportWithTensor1(unittest.TestCase): - def setUp(self): + def func_with_tensor(self): self.x = paddle.to_tensor(np.random.random((1, 128))) - - def test_with_tensor(self): - if in_dygraph_mode(): - return model = LinearNet() paddle.onnx.export(model, 'linear_net', input_spec=[self.x]) + def test_with_tensor(self): + with _test_eager_guard(): + self.func_with_tensor() + self.func_with_tensor() + class TestExportPrunedGraph(unittest.TestCase): - def setUp(self): + def func_prune_graph(self): + model = Logic() self.x = paddle.to_tensor(np.array([1])) self.y = paddle.to_tensor(np.array([-1])) - - def test_prune_graph(self): - if in_dygraph_mode(): - return - model = Logic() paddle.jit.to_static(model) out = model(self.x, self.y, z=True) paddle.onnx.export( model, 'pruned', input_spec=[self.x], output_spec=[out]) + def test_prune_graph(self): + # test eager + with _test_eager_guard(): + self.func_prune_graph() + self.func_prune_graph() + if __name__ == '__main__': - if not in_dygraph_mode(): - unittest.main() + unittest.main() diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt index 7a48ff0148e76..ea82c46b95c5e 100644 --- a/python/unittest_py/requirements.txt +++ b/python/unittest_py/requirements.txt @@ -8,7 +8,7 @@ pygame==2.1.0 hypothesis opencv-python<=4.2.0.32 visualdl -paddle2onnx>=0.8.2 +paddle2onnx>=0.9.6 scipy>=1.6; python_version >= "3.7" scipy>=1.5; python_version == "3.6" prettytable From c921a812bdb08ce8d3abfc472cb492462f740d71 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 23 May 2022 22:07:24 +0800 Subject: [PATCH 017/109] fix conv nd error (#42933) --- python/paddle/nn/functional/conv.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git 
a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 84aadbbac649b..6c7f09091ff3c 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -129,10 +129,13 @@ def _conv_nd(x, if bias is not None: channel_dim = channel_dim + len( x.shape) if channel_dim < 0 else channel_dim - tmp_bias = _C_ops.final_state_reshape( - bias, bias.shape + - [1 for i in range(len(x.shape) - channel_dim - 1)]) - return _C_ops.final_state_add(pre_bias, tmp_bias) + if len(bias.shape) < len(x.shape): + tmp_bias = _C_ops.final_state_reshape( + bias, bias.shape + + [1 for i in range(len(x.shape) - channel_dim - 1)]) + return _C_ops.final_state_add(pre_bias, tmp_bias) + else: + return _C_ops.final_state_add(pre_bias, bias) else: return pre_bias if in_dynamic_mode(): From c60acca4a26264a98785da351f75ca7065edb407 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Mon, 23 May 2022 22:43:52 +0800 Subject: [PATCH 018/109] Add assign_out_ yaml (#42833) * add assign_out_ yaml * fix final_state_assign * fix inplace bug * add inplace_check_blacklist for assign * fix merge conflict --- .../final_state_generator/codegen_utils.py | 5 +- .../final_state_generator/eager_gen.py | 50 +++++++++++++------ .../final_state_generator/python_c_gen.py | 15 ++++-- python/paddle/tensor/creation.py | 14 +++--- python/paddle/utils/code_gen/api.yaml | 12 +++++ python/paddle/utils/code_gen/api_base.py | 4 +- python/paddle/utils/code_gen/backward.yaml | 10 +++- 7 files changed, 83 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 5b48fb74f5383..9849dc48fc490 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -174,7 +174,10 @@ def RecoverBaseNameOfInplaceFunction(function_name): def GetInplacedFunctionName(function_name): - return function_name + "_" + inplace_func_name = function_name + if inplace_func_name[-1] != '_': + inplace_func_name += '_' + return inplace_func_name def GetForwardFunctionName(string): diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 9bee8f5f29753..403216813dd36 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -34,6 +34,13 @@ from codegen_utils import ops_to_fill_zero_for_empty_grads from codegen_utils import AssertMessage, GetIndent +# Note: assign is a inplace api when parameter(output) isn't none, +# so we should check parameter(output) with rule of inplace. +# But because there is no check in old dygraph mode, in order to +# keeping the code compatible, here we also skip inplace check in new dygraph temporarily, +# and this will be fixed in the futrue. 
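As a side note (illustration, not part of the diff): the guard added to GetInplacedFunctionName above exists so that apis whose base name already ends in an underscore, such as assign_out_, are not given a second one. A quick standalone check of that behaviour:

    def get_inplaced_function_name(function_name):
        # mirror of the patched codegen helper: only append '_' when missing
        name = function_name
        if name[-1] != '_':
            name += '_'
        return name

    assert get_inplaced_function_name("add") == "add_"
    assert get_inplaced_function_name("assign_out_") == "assign_out_"  # unchanged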
+inplace_check_blacklist = set(["assign_out_"]) + ########### ## Utils ## @@ -848,13 +855,15 @@ def __init__(self, forward_api_contents, grad_api_contents, namespace): def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): namespace = self.namespace - + if self.forward_api_name[-1] == '_' and not is_inplaced: + return forward_api_name = GetInplacedFunctionName( self.forward_api_name) if is_inplaced else self.forward_api_name forward_inputs_position_map = self.forward_inputs_position_map forward_outputs_position_map = self.forward_outputs_position_map forward_attrs_list = self.forward_attrs_list + backward_grad_outputs_map = self.backward_grad_outputs_map optional_inputs = self.optional_inputs intermediate_outputs = self.intermediate_outputs @@ -994,17 +1003,26 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): inputs_autograd_meta_list = [] compute_require_grad_args_list = ["trace_backward"] for name, (ttype, pos) in forward_inputs_position_map.items(): - input_autograd_meta_name = GetAutoGradMetaName(name) - if IsPlainTensorType(ttype): - input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({name});" - else: - assert IsVectorTensorType(ttype) - input_autograd_meta_vec_name = GetAutoGradMetaVectorName(name) - input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({name});\n" - input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" - - inputs_autograd_meta_list.append(input_autograd_meta) - compute_require_grad_args_list.append(input_autograd_meta_name) + # Has corresponding grad output + has_corresponding_grad_output = False + for _, (_, corresponding_pos, + _) in backward_grad_outputs_map.items(): + if pos == corresponding_pos: + has_corresponding_grad_output = True + if has_corresponding_grad_output or ( + name in forward_inplace_map and + forward_api_name not in inplace_check_blacklist): + input_autograd_meta_name = GetAutoGradMetaName(name) + if IsPlainTensorType(ttype): + input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({name});" + else: + assert IsVectorTensorType(ttype) + input_autograd_meta_vec_name = GetAutoGradMetaVectorName( + name) + input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({name});\n" + input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" + inputs_autograd_meta_list.append(input_autograd_meta) + compute_require_grad_args_list.append(input_autograd_meta_name) inputs_autograd_meta_str = "\n".join(inputs_autograd_meta_list) compute_require_grad_args_str = ",".join(compute_require_grad_args_list) @@ -1038,9 +1056,11 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): bump_inplace_version_str = "" if is_inplaced: for inplace_name in forward_inplace_map.keys(): - inplace_autograd_meta_name = GetAutoGradMetaName(inplace_name) - check_inplace_str += CHECK_INPLACE_TEMPLATE.format( - inplace_name, inplace_autograd_meta_name) + if forward_api_name not in inplace_check_blacklist: + inplace_autograd_meta_name = GetAutoGradMetaName( + inplace_name) + check_inplace_str += CHECK_INPLACE_TEMPLATE.format( + inplace_name, inplace_autograd_meta_name) bump_inplace_version_str += BUMP_INPLACE_VERSION_TEMPLATE.format( inplace_name, inplace_name) diff --git 
a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 602d38510c04f..c02400299dfa6 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -381,7 +381,7 @@ def GeneratePythonCFunction(self): break # Generate Python-C Function Definetion - self.python_c_function_str += PYTHON_C_FUNCTION_TEMPLATE.format( + python_c_inplace_func_str = PYTHON_C_FUNCTION_TEMPLATE.format( inplaced_forward_api_name, pythonc_record_event_str, inplaced_forward_api_name, get_eager_tensor_str, parse_attributes_str, set_device_str, @@ -389,11 +389,20 @@ def GeneratePythonCFunction(self): inplaced_fwd_function_name, dygraph_function_call_str, return_str) - # Generate Python-C Function Registration - self.python_c_function_reg_str += "\n," + PYTHON_C_FUNCTION_REG_TEMPLATE.format( + python_c_inplace_func_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format( forward_api_name_prefix, inplaced_forward_api_name, namespace, inplaced_forward_api_name, inplaced_forward_api_name) + # self.forward_api_name ending with '_' means it only has inplace api + if self.forward_api_name[-1] == '_': + self.python_c_function_str = python_c_inplace_func_str + # Generate Python-C Function Registration + self.python_c_function_reg_str = python_c_inplace_func_reg_str + else: + self.python_c_function_str += python_c_inplace_func_str + # Generate Python-C Function Registration + self.python_c_function_reg_str += "\n," + python_c_inplace_func_reg_str + def run(self): # Initialized is_forward_only self.CollectIsForwardOnly() diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index c7e73cec47bea..d3430ba81b859 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -1510,12 +1510,14 @@ def assign(x, output=None): # isinstance(VarBase, Variable) == False. It will cause return None # after this api. 
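For context (an illustrative sketch, not part of the patch): with the creation.py change that follows, paddle.assign picks one of two final-state ops in eager mode depending on whether an output tensor is supplied. Assuming the new dygraph (eager) mode is active:

    import paddle

    x = paddle.to_tensor([1.0, 2.0, 3.0])

    y = paddle.assign(x)            # output is None  -> _C_ops.final_state_assign(x)

    buf = paddle.empty([3], dtype='float32')
    paddle.assign(x, output=buf)    # output supplied -> _C_ops.final_state_assign_out_(x, buf)
    print(buf.numpy())              # [1. 2. 3.]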
if isinstance(input, (Variable, core.VarBase)): - if _non_static_mode(): + if in_dygraph_mode(): if output is None: - if _in_legacy_dygraph(): - output = core.VarBase() - else: - output = core.eager.Tensor() + output = _C_ops.final_state_assign(input) + else: + _C_ops.final_state_assign_out_(input, output) + elif _in_legacy_dygraph(): + if output is None: + output = core.VarBase() _C_ops.assign(input, output) else: check_dtype(input.dtype, 'input', [ @@ -1575,7 +1577,7 @@ def assign(x, output=None): value_name: values }) - if is_inplace and _non_static_mode(): + if is_inplace and _in_legacy_dygraph(): output._bump_inplace_version() return output diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 6c15b4a012833..1a740f47f46f5 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -189,6 +189,18 @@ func : assign backward : assign_grad +- api : assign_out_ + args : (Tensor x, Tensor output) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : assign + param : [x] + inplace : (output -> out) + backward : assign_out__grad + # atan - api : atan args : (Tensor x) diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 96896b65f4041..ac9a431593776 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -311,7 +311,7 @@ def parse_inplace_and_view(self, api_item_yaml): view_map = {} in_out_mapping_list = api_item_yaml[mode].split(',') for item in in_out_mapping_list: - result = re.search(r"(?P\w+)\s*->\s(?P\w+)", item) + result = re.search(r"(?P\w+)\s*->\s*(?P\w+)", item) in_val = result.group('in') out_val = result.group('out') assert in_val in self.inputs['names'], \ @@ -840,6 +840,8 @@ def gene_api_code(self): if self.is_base_api: api_code = self.gene_base_api_code() if len(self.inplace_map) > 0: + if self.api[-1] == '_': + api_code = "" api_code = api_code + self.gene_base_api_code(inplace_flag=True) return api_code diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 9b3d2d94c9341..19343c5873db6 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -132,7 +132,15 @@ output : Tensor(x_grad) infer_meta : func : UnchangedInferMeta - param : [out_grad] + kernel : + func : assign + +- backward_api : assign_out__grad + forward : assign_out_ (Tensor x, Tensor output) -> Tensor(out) + args : (Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta kernel : func : assign From d3c6afbff5933e306920dd351e0cfe0791b6d10a Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Tue, 24 May 2022 10:01:30 +0800 Subject: [PATCH 019/109] Add type() interface for paddle::variant (#42943) * Add type() interface for variant * Fix CI errors --- paddle/utils/CMakeLists.txt | 4 +++- paddle/utils/variant.h | 21 +++++++++++++++++++++ paddle/utils/variant_test.cc | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 paddle/utils/variant_test.cc diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt index 64c88a47b4393..7669c06b2c2b7 100644 --- a/paddle/utils/CMakeLists.txt +++ b/paddle/utils/CMakeLists.txt @@ -1,3 +1,5 @@ add_subdirectory(string) -cc_test(small_vector_test SRCS small_vector_test.cc DEPS gtest gflags) + cc_test(array_ref_test SRCS array_ref_test.cc DEPS gtest gflags) 
+cc_test(small_vector_test SRCS small_vector_test.cc DEPS gtest gflags) +cc_test(variant_test SRCS variant_test.cc DEPS gtest) diff --git a/paddle/utils/variant.h b/paddle/utils/variant.h index 50bdc4287e21a..4348abc9cbff0 100644 --- a/paddle/utils/variant.h +++ b/paddle/utils/variant.h @@ -2199,6 +2199,18 @@ class impl : public copy_assignment> { } } + inline const std::type_info &type() const { + return visitation::alt::visit_alt_at( + this->index(), +#ifdef MPARK_GENERIC_LAMBDAS + [](auto &alt) -> const std::type_info & { return typeid(alt.value); } +#else + typer {} +#endif + , + *this); + } + private: #ifndef MPARK_GENERIC_LAMBDAS struct swapper { @@ -2208,6 +2220,13 @@ class impl : public copy_assignment> { swap(this_alt.value, that_alt.value); } }; + + struct typer { + template + inline const std::type_info &operator()(Alt &alt) const { + return typeid(alt.value); + } + }; #endif inline constexpr bool move_nothrow() const { @@ -2432,6 +2451,8 @@ class variant { impl_.swap(that.impl_); } + inline const std::type_info &type() noexcept { return impl_.type(); } + private: detail::impl impl_; diff --git a/paddle/utils/variant_test.cc b/paddle/utils/variant_test.cc new file mode 100644 index 0000000000000..e690269d801c1 --- /dev/null +++ b/paddle/utils/variant_test.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/utils/variant.h" +#include "gtest/gtest.h" +#include "paddle/phi/core/enforce.h" + +TEST(interface_test, type) { + using phi::enforce::demangle; + + paddle::variant var; + + var = true; + EXPECT_EQ(demangle(var.type().name()), "bool"); + + var = 0; + EXPECT_EQ(demangle(var.type().name()), "int"); + + var = 0.f; + EXPECT_EQ(demangle(var.type().name()), "float"); +} From a5ad2659131fb0e753690d93311f6c842cfc46e2 Mon Sep 17 00:00:00 2001 From: Zhangjingyu06 <92561254+Zhangjingyu06@users.noreply.github.com> Date: Tue, 24 May 2022 11:00:03 +0800 Subject: [PATCH 020/109] modify xpu.cmake *test=kunlun (#42928) --- cmake/external/xpu.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 1c4a424995887..a3287d6bfd94e 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -17,7 +17,7 @@ endif() # ubuntu and centos: use output by XDNN API team if(NOT DEFINED XPU_XDNN_BASE_URL) SET(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") - SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220511") + SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220520") else() SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") endif() From ebf486acb8accd341cf19dc9667f365de0bdd57d Mon Sep 17 00:00:00 2001 From: kuizhiqing Date: Tue, 24 May 2022 11:16:57 +0800 Subject: [PATCH 021/109] [launch] fix timeout reset (#42941) --- python/paddle/distributed/launch/context/__init__.py | 7 +++++++ .../paddle/distributed/launch/context/args_envs.py | 4 ++-- .../distributed/launch/controllers/__init__.py | 1 + .../distributed/launch/controllers/collective.py | 6 +++++- .../paddle/distributed/launch/controllers/master.py | 12 +++++++++++- python/paddle/distributed/launch/controllers/ps.py | 2 ++ python/paddle/distributed/launch/plugins/__init__.py | 3 ++- python/paddle/fluid/tests/unittests/test_run.py | 4 ++-- 8 files changed, 32 insertions(+), 7 deletions(-) diff --git a/python/paddle/distributed/launch/context/__init__.py b/python/paddle/distributed/launch/context/__init__.py index 08c8f0835c5e1..fbea5d0db869e 100644 --- a/python/paddle/distributed/launch/context/__init__.py +++ b/python/paddle/distributed/launch/context/__init__.py @@ -17,6 +17,7 @@ from .node import Node from .status import Status from .args_envs import parse_args, fetch_envs, env_args_mapping +import six import logging @@ -39,6 +40,12 @@ def __init__(self, enable_plugin=True): if enable_plugin: self._enable_plugin() + def print(self): + self.logger.info("----------- Configuration ----------------------") + for arg, value in sorted(six.iteritems(vars(self.args))): + self.logger.info("%s: %s" % (arg, value)) + self.logger.info("--------------------------------------------------") + def is_legacy_mode(self): if self.args.legacy: return True diff --git a/python/paddle/distributed/launch/context/args_envs.py b/python/paddle/distributed/launch/context/args_envs.py index b624281e44db3..ea8bf3d597a79 100644 --- a/python/paddle/distributed/launch/context/args_envs.py +++ b/python/paddle/distributed/launch/context/args_envs.py @@ -85,7 +85,7 @@ def parse_args(): base_group.add_argument( "--run_mode", type=str, - default="collective", + default=None, help="run mode of the job, collective/ps/ps-heter") base_group.add_argument( @@ -125,7 +125,7 @@ def parse_args(): ps_group.add_argument( "--gloo_port", type=int, default=6767, help="gloo http port") ps_group.add_argument( - "--with_gloo", type=str, default="0", 
help="use gloo or not") + "--with_gloo", type=str, default="1", help="use gloo or not") # parameter elastic mode elastic_group = parser.add_argument_group("Elastic Parameters") diff --git a/python/paddle/distributed/launch/controllers/__init__.py b/python/paddle/distributed/launch/controllers/__init__.py index 706131300f0d8..f1c6ea5399a46 100644 --- a/python/paddle/distributed/launch/controllers/__init__.py +++ b/python/paddle/distributed/launch/controllers/__init__.py @@ -29,4 +29,5 @@ def init(ctx): for c in _controllers: if c.enable(ctx): + ctx.print() return c(ctx) diff --git a/python/paddle/distributed/launch/controllers/collective.py b/python/paddle/distributed/launch/controllers/collective.py index 3763bac041451..5225fd6e81ff1 100644 --- a/python/paddle/distributed/launch/controllers/collective.py +++ b/python/paddle/distributed/launch/controllers/collective.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .controller import Controller +from .controller import Controller, ControleMode import json import os @@ -23,8 +23,10 @@ class CollectiveController(Controller): @classmethod def enable(cls, ctx): + # collective is the default mode if ctx: ctx.logger.debug("{} enabled".format(cls.__name__)) + ctx.args.run_mode = ControleMode.COLLECTIVE return True else: return False @@ -85,6 +87,7 @@ def build_pod(self): "PADDLE_LOCAL_SIZE": "{}".format(self.pod.replicas), "PADDLE_GLOBAL_RANK": "{}".format(i + rank_offset), "PADDLE_LOCAL_RANK": "{}".format(i), + "PADDLE_NNODES": "{}".format(self.job.replicas), ## compatible env "PADDLE_TRAINER_ENDPOINTS": ",".join(job_endpoints), "PADDLE_CURRENT_ENDPOINT": endpoints[i], @@ -106,6 +109,7 @@ class CollectiveElasticController(CollectiveController): def enable(cls, ctx): if ctx.args.master and ctx.args.master.startswith("etcd://"): ctx.logger.debug("{} enabled".format(cls.__name__)) + ctx.args.run_mode = ControleMode.COLLECTIVE return True else: return False diff --git a/python/paddle/distributed/launch/controllers/master.py b/python/paddle/distributed/launch/controllers/master.py index 43eda4cdffa24..742fea9e16de7 100644 --- a/python/paddle/distributed/launch/controllers/master.py +++ b/python/paddle/distributed/launch/controllers/master.py @@ -276,10 +276,20 @@ def fetch_peer_alive(self): return peer_alive def wait_peer_ready(self, replicas_min, replicas_max, timeout): + timeout = timeout if timeout > 1 else 3 + end = time.time() + timeout + np_pre = len(self.fetch_peer_alive()) while not self.ctx.status.is_done() and time.time() < end: - if len(self.fetch_peer_alive()) == replicas_max: + np = len(self.fetch_peer_alive()) + if np == replicas_max: + # maximum replicas reached, return immediately return (True, replicas_max) + elif np != np_pre: + # replicas are changing, reset timeout + end = time.time() + timeout + np_pre = np + time.sleep(0.2) else: time.sleep(0.5) diff --git a/python/paddle/distributed/launch/controllers/ps.py b/python/paddle/distributed/launch/controllers/ps.py index 6504f1240ee09..037bd313bbc03 100644 --- a/python/paddle/distributed/launch/controllers/ps.py +++ b/python/paddle/distributed/launch/controllers/ps.py @@ -171,6 +171,7 @@ def _build_pod_with_master(self): for i in range(server_num): e = { + "PADDLE_NNODES": "{}".format(self.job.replicas), "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints), "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), "PADDLE_PORT": @@ -186,6 +187,7 @@ def _build_pod_with_master(self): for i in 
range(trainer_num): e = { + "PADDLE_NNODES": "{}".format(self.job.replicas), "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints), "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), "PADDLE_PORT": diff --git a/python/paddle/distributed/launch/plugins/__init__.py b/python/paddle/distributed/launch/plugins/__init__.py index 35a44ed942c20..13c09b4c27c26 100644 --- a/python/paddle/distributed/launch/plugins/__init__.py +++ b/python/paddle/distributed/launch/plugins/__init__.py @@ -17,6 +17,7 @@ __all__ = [] +# print configuration after args are well filled in controller init def log(ctx): ctx.logger.info("----------- Configuration ----------------------") for arg, value in sorted(six.iteritems(vars(ctx.args))): @@ -59,4 +60,4 @@ def rewrite_host_ip(ctx): ctx.node.ip = ctx.args.host -enabled_plugins = [collective_compatible, rewrite_host_ip, process_args, log] +enabled_plugins = [collective_compatible, rewrite_host_ip, process_args] diff --git a/python/paddle/fluid/tests/unittests/test_run.py b/python/paddle/fluid/tests/unittests/test_run.py index 28bcc379fb9a0..c0157c5b9068c 100644 --- a/python/paddle/fluid/tests/unittests/test_run.py +++ b/python/paddle/fluid/tests/unittests/test_run.py @@ -95,7 +95,7 @@ def test_collective_3(self): shutil.rmtree('./log') port = random.randrange(6000, 8000) - args = "--job_id test3 --devices 0,1 --master 127.0.0.1:{} --np 2".format( + args = "--job_id test3 --devices 0,1 --master 127.0.0.1:{} --nnodes 2".format( port) p1 = self.pdrun(args) p2 = self.pdrun(args) @@ -143,7 +143,7 @@ def test_ps_3(self): shutil.rmtree('./log') port = random.randrange(6000, 8000) - args = "--job_id ps3 --master 127.0.0.1:{} --np 2 --server_num=1 --trainer_num=1".format( + args = "--job_id ps3 --master 127.0.0.1:{} --nnodes 2 --server_num=1 --trainer_num=1".format( port) p1 = self.pdrun(args) p2 = self.pdrun(args) From f8931c97985fac563dd095a6e81326ee4cfa8fb5 Mon Sep 17 00:00:00 2001 From: Fan Zhang Date: Tue, 24 May 2022 14:39:27 +0800 Subject: [PATCH 022/109] [XPUPS] Modify XPU Kernel (#42745) * Adapt XPUPS - 1st version - 3.24 * Adapt XPUPS - update XPU PushSparse - 2nd version - 3.24 * Adapt XPUPS - add XPU PullSparseOp - 3nd version - 3.25 * refactor heter comm kernel * update. test=develop * Adapt XPUPS - modify by compilation - 4th version - 3.27 * update calc_shard_offset. test=develop * update xpu kernel. test=develop * update args of calc_shard_offset * update. test=develop * remove customGradMerger * update. test=develop * heter_comm update * heter_comm update * update calc_shard_offset. test=develop * heter_comm update * update args of calc_shard_offset * update. test=develop * remove customGradMerger * update. test=develop * fix. test=develop * update. test=develop * update. test=develop * update optimizer kernel * Adapt XPUPS - use WITH_XPU_KP and modify wrapper kernel function - 5th version - 3.30 * update. test=develop * update pslib.cmake * update. test=develop * update. test=develop * update. test=develop * update. test=develop * update. test=develop * Adapt XPUPS - modify by kp compilation - 6th version - 3.30 * update. test=develop * update. test=develop * update. test=develop * update optimizer kernel * update. test=develop * update. test=develop * update. test=develop * update. test=develop * update. test=develop * update. test=develop * update. test=develop * update. test=develop * fix. test=develop * fix. test=develop * used by minxu * update heter_comm_inl * fix. test=develop * Adapt XPUPS - modify by kp compilation - 7th version - 3.30 * fix. 
test=develop * add optimizer kernel. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * 3.31 update * Adapt XPUPS - update kp compilation path - 8th version - 3.31 * add optimizer kernel. test=develop * fix kunlun not support size_t. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix kunlun not support size_t. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * update heter_comm_kernel.kps 3.31 * fix. test=develop * fix. test=develop * update heter_comm_kernel.kps 3.31 * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * update heter_comm.h 3.31 * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * update hashtable. test=develop * update. test=develop * Adapt XPUPS - update by kp compilation - 9th version - 4.1 * update hashtable. test=develop * fix. test=develop * update hashtable 4.1 * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * Adapt XPUPS - update by kp compilation - 10th version - 4.1 * fix. test=develop * fix. test=develop * fix. test=develop * update. test=develop * modify by compilation 4.1 * update. test=develop * update. test=develop * fix. test=develop * modify by compilation 4.1 * update. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * modify by compilation 4.1 * fix. test=develop * fix. test=develop * fix. test=develop * modify by compilation 4.1 19:30 * fix. test=develop * update ps_gpu_wrapper.kps 4.1 * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * Adapt XPUPS - update by kp compilation - 11th version - 4.1 * fix. test=develop * Adapt XPUPS - update by kp compilation - 12nd version - 4.2 * fix. test=develop * fix. test=develop * modify by compilation 4.2 * 4.2 update * fix. test=develop * template init. test=develop * update 4.6 * fix. test=develop * template init. test=develop * 4.6 modify by compilation * hashtable template init. test=develop * hashtable template init. test=develop * fix. test=develop * fix. test=develop * fix. test=devlop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=devlop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * Adapt XPUPS - update by kp compilation - 13nd version - 4.7 * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * 4.11 update * fix. test=develop * fix. test=develop * 4.11 update * update by pre-commit * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * 4.12 update * fix. 
test=develop * Adapt XPUPS - update by kp compilation - 14th version - 4.13 * 4.13 update * 4.14 update * 4.14 update * 4.14 update * 4.14 modify by merged latest compilation * retry CI 4.14 * 4.15 pass static check * 4.15 modify by gpups CI * 3.16 update by gpups CI - modify ps_gpu_wrapper.h * 4.16 update * 4.16 pass xpu compile * 4.16 retry CI * 4.16 update * Adapt XPUPS - adapt BKCL comm for XPUPS - 4.24 * update by compilation * Adapt XPUPS - register PSGPUTrainer for XPUPS - 4.25 * update device_worker_factory * Adapt XPUPS - split heter_ps into .cu and .cc - 4.27 * Adapt XPUPS - register pull_box_sparse op under XPU_KP - 4.28 * update * 5.7 modify ps_gpu_wrapper pull_sparse * 5.11 update ps_gpu_wrapper CopyKeysKernel * 5.13 modify calc_shard_offset_kernel & fill_shard_key_kernel * modify fill_dvals_kernel & PullCopy & c_sync_calc_stream - 5.18 * modify PushCopy & fill_shard_grads_kernel & register push_box_sparse - 5.19 Co-authored-by: zmxdream --- .../fleet/heter_ps/heter_comm_kernel.kps | 34 ++++---- .../fluid/framework/fleet/ps_gpu_wrapper.kps | 58 ++++++++----- .../collective/c_sync_calc_stream_op.cc | 67 +-------------- .../collective/c_sync_calc_stream_op.h | 83 +++++++++++++++++++ .../collective/c_sync_calc_stream_op_xpu.cc | 20 +++++ .../platform/device/xpu/xpu_op_kpfirst_list.h | 2 + 6 files changed, 163 insertions(+), 101 deletions(-) create mode 100644 paddle/fluid/operators/collective/c_sync_calc_stream_op.h create mode 100644 paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps index f73757902fef6..b44ea1807fd65 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps @@ -18,6 +18,7 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_XPU_KP) #include #include "xpu/kernel/cluster_header.h" +#include "xpu/kernel/debug.h" // NOLINT #include "xpu/kernel/math.h" #include "xpu/kernel/simd.h" #endif @@ -91,7 +92,7 @@ __global__ void calc_shard_offset_kernel(T* idx, T* left, T* right, // read batch from GM will boost performance int read_len = min(len_per_loop, len - i); GM2LM(idx + i, local_idx, read_len * sizeof(T)); - for (int k = 0; k < read_len; k++) { + for (int k = 0; k < read_len - 1; k++) { if (local_idx[k] != local_idx[k + 1]) { int real_idx = i + k; local_right[local_idx[k]] = real_idx; @@ -102,7 +103,7 @@ __global__ void calc_shard_offset_kernel(T* idx, T* left, T* right, local_left[local_idx[i]] = i; } if (i + read_len == len) { - local_right[local_idx[len - 1]] = len - 1; + local_right[local_idx[read_len - 1]] = len - 1; } } // to be optimized: call LM2GM too frequently @@ -150,7 +151,7 @@ __global__ void fill_shard_key_kernel(KeyType* d_shard_keys, KeyType* d_keys, int thread_id = ncores * cluster_id() + cid; int nthreads = ncores * cluster_num(); const int buf_size = 400; - __local__ KeyType local_keys[buf_size]; + // __local__ KeyType local_keys[buf_size]; __local__ KeyType local_shard_keys[buf_size]; __local__ T local_idx[buf_size]; int len_per_loop = min(buf_size, roundup_div(len, nthreads)); @@ -158,10 +159,11 @@ __global__ void fill_shard_key_kernel(KeyType* d_shard_keys, KeyType* d_keys, i += nthreads * len_per_loop) { // read batch from GM will boost performance int read_len = min(len_per_loop, len - i); - GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType)); + // GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType)); GM2LM(idx + i, local_idx, read_len * sizeof(T)); for (int k = 0; k < read_len; k++) { - local_shard_keys[k] = local_keys[local_idx[k]]; + GM2LM(d_keys + local_idx[k], &local_shard_keys[k], 1 * sizeof(KeyType)); + // local_shard_keys[k] = local_keys[local_idx[k]]; } LM2GM(local_shard_keys, d_shard_keys + i, read_len * sizeof(KeyType)); } @@ -181,9 +183,9 @@ __global__ void fill_shard_grads_kernel(KeyType* d_shard_keys, KeyType* d_keys, int thread_id = ncores * cluster_id() + cid; int nthreads = ncores * cluster_num(); - const int buf_size = 100; - __local__ KeyType local_keys[buf_size]; - __local__ GradType local_grads[buf_size]; + const int buf_size = 50; + // __local__ KeyType local_keys[buf_size]; + // __local__ GradType local_grads[buf_size]; __local__ KeyType local_shard_keys[buf_size]; __local__ GradType local_shard_grads[buf_size]; __local__ T local_idx[buf_size]; @@ -193,12 +195,15 @@ __global__ void fill_shard_grads_kernel(KeyType* d_shard_keys, KeyType* d_keys, i += nthreads * len_per_loop) { // read batch from GM will boost performance int read_len = min(len_per_loop, len - i); - GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType)); - GM2LM(d_grads + i, local_grads, read_len * sizeof(GradType)); + // GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType)); + // GM2LM(d_grads + i, local_grads, read_len * sizeof(GradType)); GM2LM(idx + i, local_idx, read_len * sizeof(T)); for (int k = 0; k < read_len; k++) { - local_shard_keys[k] = local_keys[local_idx[k]]; - local_shard_grads[k] = local_grads[local_idx[k]]; + GM2LM(d_keys + local_idx[k], &local_shard_keys[k], 1 * sizeof(KeyType)); + GM2LM(d_grads + local_idx[k], &local_shard_grads[k], + 1 * sizeof(GradType)); + // local_shard_keys[k] = local_keys[local_idx[k]]; + // local_shard_grads[k] = local_grads[local_idx[k]]; } LM2GM(local_shard_keys, d_shard_keys + i, read_len * sizeof(KeyType)); 
LM2GM(local_shard_grads, d_shard_grads + i, read_len * sizeof(GradType)); @@ -227,9 +232,10 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, GM2LM(idx + i, local_idx, read_len * sizeof(T)); GM2LM(d_shard_vals + i, local_shard_vals, read_len * sizeof(ValType)); for (int k = 0; k < read_len; k++) { - local_vals[local_idx[k]] = local_shard_vals[k]; + LM2GM(&local_shard_vals[k], d_vals + local_idx[k], 1 * sizeof(ValType)); + // local_vals[local_idx[k]] = local_shard_vals[k]; } - LM2GM(local_vals, d_vals + i, read_len * sizeof(ValType)); + // LM2GM(local_vals, d_vals + i, read_len * sizeof(ValType)); } } diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps index 58b9f0f722f8c..ef6c70e624d4c 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps @@ -28,9 +28,9 @@ limitations under the License. */ namespace paddle { namespace framework { -__global__ void PullCopy(float** dest, const FeatureValue* src, +__global__ void PullCopy(float* dest, const FeatureValue* src, const long long* len, int hidden, int slot_num, - int total_len, unsigned long long** keys) { + int total_len, unsigned long long* keys) { int cid = core_id(); int ncores = core_num(); if (cid >= ncores) { @@ -41,11 +41,21 @@ __global__ void PullCopy(float** dest, const FeatureValue* src, __local__ int64_t local_len[slot_num]; GM2LM(len, local_len, slot_num * sizeof(int64_t)); + __global_ptr__ unsigned long long* local_keys[slot_num]; + GM2LM(keys, local_keys, + slot_num * sizeof(__global_ptr__ unsigned long long*)); + + __global_ptr__ float* local_dest[slot_num]; + GM2LM(dest, local_dest, slot_num * sizeof(__global_ptr__ float*)); + + int read_len = 30; + for (int i = thread_id; i < slot_num; i += nthreads) { // max core local memory = 8KB // slot's max memory size = slot_len * sizeof(FeatureValue) int slot_len = i ? local_len[i] - local_len[i - 1] : local_len[0]; - int read_len = min(roundup_div(1024 * 8, sizeof(FeatureValue)), slot_len); + // int read_len = min(roundup_div(1024 * 8, sizeof(FeatureValue)), + // slot_len); int dest_len = i ? 
local_len[i - 1] : 0; __local__ FeatureValue local_slot_vals[read_len]; __local__ float local_dest_vals[read_len * hidden]; @@ -56,7 +66,8 @@ __global__ void PullCopy(float** dest, const FeatureValue* src, int real_read_len = min(read_len, slot_len - k); GM2LM(src + dest_len + k, local_slot_vals, real_read_len * sizeof(FeatureValue)); - GM2LM(keys[i] + k, local_slot_keys, real_read_len * sizeof(uint64_t)); + GM2LM(local_keys[i] + k, local_slot_keys, + real_read_len * sizeof(uint64_t)); for (int j = 0; j < real_read_len; j++) { if (local_slot_keys[j] == 0) { local_dest_vals[j * hidden] = 0; @@ -78,7 +89,7 @@ __global__ void PullCopy(float** dest, const FeatureValue* src, } } } - LM2GM(local_dest_vals, dest[i] + k * hidden, + LM2GM(local_dest_vals, local_dest[i] + k * hidden, real_read_len * hidden * sizeof(float)); } } @@ -120,7 +131,7 @@ __global__ void CopyKeysKernel(unsigned long long* src_keys, } } -__global__ void PushCopy(FeaturePushValue* dest, float** src, long long* len, +__global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len, int hidden, int slot_num, int total_len, int bs, int* slot_vector) { int cid = core_id(); @@ -135,12 +146,16 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, long long* len, GM2LM(len, local_len, slot_num * sizeof(int64_t)); GM2LM(slot_vector, local_slot, slot_num * sizeof(int)); + __global_ptr__ float* local_src[slot_num]; + GM2LM(src, local_src, slot_num * sizeof(__global_ptr__ float*)); + for (int i = thread_id; i < slot_num; i += nthreads) { int slot_len = i ? local_len[i] - local_len[i - 1] : local_len[0]; // max core local memory = 8KB // slot's max memory size = slot_len * hidden * 8 - int read_len = min(roundup_div(1024, hidden), slot_len); + // int read_len = min(roundup_div(1024, hidden), slot_len); + int read_len = 40; int dest_len = i ? 
local_len[i - 1] : 0; __local__ float local_slot_grads[read_len * hidden]; __local__ FeaturePushValue local_dest_grads[read_len]; @@ -148,7 +163,7 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, long long* len, // copy read_len(length) of slots' grad to LM for (int k = 0; k < slot_len; k += read_len) { int real_read_len = min(read_len, slot_len - k); - GM2LM(src[i] + k * hidden, local_slot_grads, + GM2LM(local_src[i] + k * hidden, local_slot_grads, real_read_len * hidden * sizeof(float)); // copy from slots' grad to total grad for (int j = 0; j < real_read_len; j++) { @@ -181,14 +196,18 @@ void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, stream = static_cast(dev_ctx) ->x_context() ->xpu_stream; - float* buf_value = nullptr; - xpu_malloc(reinterpret_cast(&buf_value), + // float* buf_value = nullptr; + // xpu_malloc(reinterpret_cast(&buf_value), + // values.size() * sizeof(float*)); + // float** gpu_values = reinterpret_cast(&buf_value); + float* gpu_values = nullptr; + xpu_malloc(reinterpret_cast(&gpu_values), values.size() * sizeof(float*)); - float** gpu_values = reinterpret_cast(&buf_value); xpu_memcpy(gpu_values, values.data(), values.size() * sizeof(float*), XPU_HOST_TO_DEVICE); - unsigned long long** c_keys = (unsigned long long**)gpu_keys; + // unsigned long long** c_keys = (unsigned long long**)gpu_keys; + unsigned long long* c_keys = reinterpret_cast(gpu_keys); const long long* c_len = (const long long*)gpu_len; PullCopy<<<2, 64, stream>>>(gpu_values, total_values_gpu, c_len, hidden_size, slot_num, total_length, c_keys); @@ -230,20 +249,17 @@ void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, slot_lengths_lod[i] += slot_lengths_lod[i - 1]; } - float* buf_grad_value = nullptr; - int64_t* buf_length = nullptr; - int* buf_slot_vector = nullptr; + float* gpu_values = nullptr; + int64_t* gpu_len = nullptr; + int* d_slot_vector = nullptr; - xpu_malloc(reinterpret_cast(&buf_grad_value), + xpu_malloc(reinterpret_cast(&gpu_values), grad_values.size() * sizeof(float*)); - xpu_malloc(reinterpret_cast(&buf_length), + xpu_malloc(reinterpret_cast(&gpu_len), slot_lengths.size() * sizeof(int64_t)); - xpu_malloc(reinterpret_cast(&buf_slot_vector), + xpu_malloc(reinterpret_cast(&d_slot_vector), slot_lengths_lod.size() * sizeof(int)); - float** gpu_values = reinterpret_cast(&buf_grad_value); - int64_t* gpu_len = reinterpret_cast(buf_length); - int* d_slot_vector = reinterpret_cast(buf_slot_vector); xpu_memcpy(gpu_values, grad_values.data(), grad_values.size() * sizeof(float*), XPU_HOST_TO_DEVICE); xpu_memcpy(gpu_len, slot_lengths_lod.data(), diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index 088366dbc8f69..6ad22ff8b19eb 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -11,27 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include - -#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/collective/c_sync_calc_stream_op.h" namespace paddle { namespace operators { -class CSyncCalcStreamOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override {} - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(framework::proto::VarType::FP32, - ctx.GetPlace()); - } -}; - class CSyncCalcStreamOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { @@ -45,53 +29,6 @@ Call calculation stream synchronization. } }; -template -class CSyncCalcStreamKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) - - auto place = ctx.GetPlace(); - auto dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - - platform::GpuStreamSync(dev_ctx->stream()); - -#elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32) - auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_npu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on npu place only for now.")); - - auto dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - platform::NPUStreamSync(dev_ctx->stream()); - -#elif defined(PADDLE_WITH_CNCL) - auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_mlu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on mlu place only for now.")); - - auto dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - platform::MLUStreamSync(dev_ctx->stream()); -#elif defined(PADDLE_WITH_XPU_BKCL) - auto place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on xpu place only for now.")); - - auto dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - dev_ctx->Wait(); -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); -#endif - } -}; - } // namespace operators } // namespace paddle @@ -105,5 +42,3 @@ REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); REGISTER_OP_NPU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); REGISTER_OP_MLU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); - -REGISTER_OP_XPU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h new file mode 100644 index 0000000000000..b07367f801fa3 --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#include + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class CSyncCalcStreamOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); + } +}; + +template +class CSyncCalcStreamKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) + + auto place = ctx.GetPlace(); + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + + platform::GpuStreamSync(dev_ctx->stream()); + +#elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(platform::is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on npu place only for now.")); + + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + platform::NPUStreamSync(dev_ctx->stream()); + +#elif defined(PADDLE_WITH_CNCL) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(platform::is_mlu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on mlu place only for now.")); + + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + platform::MLUStreamSync(dev_ctx->stream()); +#elif defined(PADDLE_WITH_XPU_BKCL) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on xpu place only for now.")); + + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + dev_ctx->Wait(); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc new file mode 100644 index 0000000000000..04a83ea64f076 --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_xpu.cc @@ -0,0 +1,20 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/fluid/operators/collective/c_sync_calc_stream_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_XPU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel)
diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
index ab68ebf3a5448..778c18146d64d 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h
@@ -109,6 +109,8 @@ XPUOpMap& get_kp_ops() {
    {"reduce_any", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace())})},
    {"pull_box_sparse",
     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+   {"push_box_sparse",
+    XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
    {"reduce_amax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
    {"reduce_amin", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
 };

From de735a9a819cd2c53d115e99b25a422ede0614d9 Mon Sep 17 00:00:00 2001
From: Feiyu Chan
Date: Tue, 24 May 2022 15:05:14 +0800
Subject: [PATCH 023/109] fix cmake command, rm -> remove (#42927)

---
 paddle/phi/api/lib/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt
index 7d28e3d27c496..004ed8de520d9 100644
--- a/paddle/phi/api/lib/CMakeLists.txt
+++ b/paddle/phi/api/lib/CMakeLists.txt
@@ -146,7 +146,7 @@ elseif(EXISTS "${generated_op_path}.tmp")
   execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${generated_op_path}.tmp" "${generated_op_path}")
   message("copy ${generated_op_path}.tmp ${generated_op_path}")
 else()
-  execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${generated_op_path}")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f "${generated_op_path}")
   message("remove ${generated_op_path}")
 endif()
@@ -158,7 +158,7 @@ elseif(EXISTS "${generated_argument_mapping_path}.tmp")
   execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${generated_argument_mapping_path}.tmp" "${generated_argument_mapping_path}")
   message("copy ${generated_argument_mapping_path}.tmp ${generated_argument_mapping_path}")
 else()
-  execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${generated_argument_mapping_path}")
+  execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f "${generated_argument_mapping_path}")
   message("remove ${generated_argument_mapping_path}")
 endif()

From d4cdfa55cbe682d54993445773d689024fbcdafd Mon Sep 17 00:00:00 2001
From: chentianyu03
Date: Tue, 24 May 2022 15:39:33 +0800
Subject: [PATCH 024/109] [Yaml]add pad/pad3d/squeeze/unsqueeze yaml and test case (#42774)

* add pad3d_double_grad yaml and test case
* add squeeze and unsqueeze double grad
* add double grad config
* add pad_grad and pad_double_grad yaml
* add pad_double_grad in config
---
 .../final_state_generator/codegen_utils.py | 3 +-
 .../fluid/tests/unittests/test_nn_grad.py | 18 +++++++
 python/paddle/utils/code_gen/api.yaml | 2 +-
 python/paddle/utils/code_gen/backward.yaml | 49 ++++++++++++++++++-
 4 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py
index 9849dc48fc490..786dd0e3bfc18 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py
@@ -29,7 +29,8 @@
     "tanh_double_grad", "tanh_triple_grad", "subtract_double_grad",
"divide_double_grad", "log_double_grad", "elu_double_grad", "leaky_relu_double_grad", "sqrt_double_grad", "rsqrt_double_grad", - "square_double_grad", "celu_double_grad" + "square_double_grad", "celu_double_grad", "pad_double_grad", + "pad3d_double_grad", "squeeze_double_grad", "unsqueeze_double_grad" ]) # For API dispatch used at python-level diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py index 3a100cd321e03..4685b00b394b7 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py @@ -215,6 +215,10 @@ def test_grad(self): class TestSqueezeDoubleGradCheck(unittest.TestCase): + def squeeze_warpper(self, x): + axes = [0, 2] + return paddle.squeeze(x[0], axes) + @prog_scope() def func(self, place): x_shape = [1, 3, 1, 40] @@ -229,6 +233,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], out, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.squeeze_warpper, [x], out, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] @@ -239,6 +245,10 @@ def test_grad(self): class TestUnsqueezeDoubleGradCheck(unittest.TestCase): + def unsqueeze_wrapper(self, x): + axes = [1, 2] + return paddle.unsqueeze(x[0], axes) + @prog_scope() def func(self, place): x_shape = [3, 40] @@ -253,6 +263,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], out, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.unsqueeze_wrapper, [x], out, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] @@ -333,6 +345,10 @@ def test_grad(self): class TestConstantPadDoubleGradCheck(unittest.TestCase): + def pad_wrapper(self, x): + pad = [1, 1, 1, 1] + return paddle.nn.functional.pad(x[0], pad) + @prog_scope() def func(self, place): x_shape = [2, 3, 4, 5] @@ -347,6 +363,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], out, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.pad_wrapper, [x], out, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 1a740f47f46f5..f9e0efa59500d 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -1556,7 +1556,7 @@ func : PadInferMeta kernel : func : pad - # backward : pad_grad + backward : pad_grad - api : pad3d args : (Tensor x, IntArray paddings, str mode, float pad_value, str data_format) diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 19343c5873db6..eb00e2e615f67 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -1338,6 +1338,15 @@ kernel : func : p_norm_grad +- backward_api : pad3d_double_grad + forward : pad3d_grad(Tensor x, Tensor grad_out, IntArray paddings, str mode, float pad_value, str data_format) -> Tensor(grad_x) + args : (Tensor grad_x_grad, IntArray paddings, str mode, float pad_value, str data_format) + output : Tensor(grad_out_grad) + infer_meta : + func : Pad3dInferMeta + kernel : + func : pad3d + - backward_api : pad3d_grad forward : pad3d(Tensor x, IntArray paddings, str mode, float pad_value, str data_format) -> Tensor(out) args : (Tensor x, Tensor out_grad, IntArray paddings, str mode, float pad_value, str data_format) @@ -1348,6 +1357,29 @@ kernel : 
func : pad3d_grad no_need_buffer : x + backward : pad3d_double_grad + +- backward_api : pad_double_grad + forward : pad_grad(Tensor x, Tensor grad_out, int[] paddings, float pad_value) -> Tensor(grad_x) + args : (Tensor grad_x_grad, int[] paddings, float pad_value) + output : Tensor(grad_out_grad) + infer_meta : + func : PadInferMeta + kernel : + func : pad + +- backward_api : pad_grad + forward : pad(Tensor x, int[] paddings, float pad_value) -> Tensor(out) + args : (Tensor x, Tensor out_grad, int[] paddings, float pad_value) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [x] + kernel : + func : pad_grad + param: [out_grad, paddings, pad_value] + no_need_buffer : x + backward : pad_double_grad - backward_api : pixel_shuffle_grad forward : pixel_shuffle (Tensor x, int upscale_factor, str data_format) -> Tensor(out) @@ -1813,6 +1845,12 @@ func : square_grad backward : square_double_grad +- backward_api : squeeze_double_grad + forward : squeeze_grad(Tensor xshape, Tensor grad_out, int[] axes) -> Tensor(grad_x) + args : (Tensor grad_x_grad, int[] axes) + output : Tensor(grad_out_grad) + invoke: squeeze(grad_x_grad, axes) + - backward_api : squeeze_grad forward : squeeze(Tensor x, int[] axes) -> Tensor(out), Tensor(xshape) args : (Tensor xshape, Tensor out_grad, int[] axes) @@ -1823,6 +1861,7 @@ kernel : func : squeeze_grad inplace : (out_grad -> x_grad) + backward: squeeze_double_grad - backward_api : stack_grad forward : stack (Tensor[] x, int axis) -> Tensor(out) @@ -2085,16 +2124,24 @@ func : unfold_grad no_need_buffer : x +- backward_api : unsqueeze_double_grad + forward : unsqueeze_grad(Tensor xshape, Tensor grad_out, IntArray axes) -> Tensor(grad_x) + args : (Tensor grad_x_grad, IntArray axes) + output : Tensor(grad_out_grad) + invoke : unsqueeze(grad_x_grad, axes) + - backward_api : unsqueeze_grad forward : unsqueeze(Tensor x, IntArray axes) -> Tensor(out), Tensor(xshape) - args : (Tensor xshape, Tensor out_grad) + args : (Tensor xshape, Tensor out_grad, IntArray axes) output : Tensor(x_grad) infer_meta : func : KernelWithXShapeInferMeta param: [xshape] kernel : func : unsqueeze_grad + param: [xshape, out_grad] inplace : (out_grad -> x_grad) + backward : unsqueeze_double_grad - backward_api : where_grad forward : where (Tensor condition, Tensor x, Tensor y) -> Tensor(out) From b5ec9ca0cbf3f2fc0fd19a9b8159469855ce0c8d Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Tue, 24 May 2022 16:30:02 +0800 Subject: [PATCH 025/109] upgrade to sdk2.5.1 (#42950) * upgrade to sdk2.5.1 --- paddle/fluid/platform/device/ipu/ipu_backend.cc | 6 +----- paddle/fluid/platform/device/ipu/ipu_executor.cc | 12 +++++++----- paddle/fluid/platform/device/ipu/ipu_executor.h | 3 +++ .../paddle/fluid/tests/unittests/ipu/op_test_ipu.py | 5 +++++ tools/dockerfile/Dockerfile.ipu | 2 +- 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.cc b/paddle/fluid/platform/device/ipu/ipu_backend.cc index 0871624a5d749..9e960a99123c0 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.cc +++ b/paddle/fluid/platform/device/ipu/ipu_backend.cc @@ -74,11 +74,7 @@ void IpuBackend::WeightsToHost() { executor_->WeightsToHost(); } void IpuBackend::Detach() { executor_->Detach(); } -void IpuBackend::Reset() { - executor_->Detach(); - compiler_.reset(); - executor_.reset(); -} +void IpuBackend::Reset() { executor_->Reset(); } void IpuBackend::SetScope(const framework::Scope& scope) { scope_ = &scope; diff --git 
a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index 4f15ecf3babf2..d490334ee33f5 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -88,11 +88,7 @@ class PdIArray final : public popart::IArray { } // namespace -Executor::~Executor() { - Detach(); - session_.reset(); - executor_resources_.reset(); -} +Executor::~Executor() { Reset(); } void Executor::Prepare(const std::string &proto) { VLOG(10) << "enter Executor::Prepare"; @@ -299,6 +295,12 @@ void Executor::Detach() { } } +void Executor::Reset() { + Detach(); + session_.reset(); + executor_resources_.reset(); +} + void Executor::SetWeightsIO() { auto opt_type = compiler_resources_->optimizer_type; VLOG(10) << "SetWeightsIO for " << opt_type; diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.h b/paddle/fluid/platform/device/ipu/ipu_executor.h index 70c9477e69bab..1a46ebc69b197 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.h +++ b/paddle/fluid/platform/device/ipu/ipu_executor.h @@ -63,6 +63,9 @@ class Executor { // Detach IPU void Detach(); + // Reset session + void Reset(); + // Scope void SetScope(const Scope *scope) { scope_ = scope; } diff --git a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py index 2583d9409a0a7..ad11083b67773 100644 --- a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py @@ -128,6 +128,11 @@ def setUpClass(cls): cls.fetch_list: List[str] = None cls.output_dict: Optional[Dict] = {} + def tearDown(self): + # Manual reset when using ipumodel + if self.use_ipumodel(): + paddle.framework.core.IpuBackend.get_instance().reset() + @property def fp16_enabled(self): return True diff --git a/tools/dockerfile/Dockerfile.ipu b/tools/dockerfile/Dockerfile.ipu index ee2d984035624..8f1948de8a4dc 100644 --- a/tools/dockerfile/Dockerfile.ipu +++ b/tools/dockerfile/Dockerfile.ipu @@ -6,7 +6,7 @@ # run a container # docker run --ulimit memlock=-1:-1 --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/ --ipc=host --rm -it paddlepaddle/paddle:latest-dev-ipu bash -FROM graphcore/poplar-extbaidu:2.5.0-ubuntu-18.04-20220407 +FROM graphcore/poplar:2.5.1 MAINTAINER PaddlePaddle Authors # ENV variables From 9e5acc1faebd0ab8f03f7e8b82fac29c63de3464 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Tue, 24 May 2022 10:30:23 +0200 Subject: [PATCH 026/109] updated paddle_bfloat to v0.1.7 (#42865) --- python/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/requirements.txt b/python/requirements.txt index e7fc6cd651cb0..4192c6b3d777a 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -5,5 +5,5 @@ Pillow six decorator astor -paddle_bfloat==0.1.2 +paddle_bfloat==0.1.7 opt_einsum==3.3.0 From 4d7a9eef4237c2780ca5799805c74dcf90b3ceb8 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Tue, 24 May 2022 19:21:44 +0800 Subject: [PATCH 027/109] [Phi]Move grad_add op kernel into phi and delete elementwise_add_op file (#42903) * move grad_add * fix unittest bugs * fix compile bugs --- paddle/fluid/operators/dgc_op.h | 16 +++-- .../elementwise/elementwise_add_op.cc | 13 ---- .../elementwise/elementwise_add_op.h | 66 ------------------- .../elementwise/elementwise_add_op.kps | 61 ----------------- .../elementwise/elementwise_add_op_npu.cc | 1 - .../elementwise/elementwise_add_op_xpu.cc | 1 - paddle/fluid/operators/fused/attn_gemm.h | 8 
+-- paddle/fluid/operators/fused/fmha_ref.h | 10 +-- .../operators/fused/fused_attention_op.cu | 8 +-- .../operators/fused/fused_feedforward_op.cu | 8 +-- .../fused/fused_multi_transformer_op.cu | 1 - .../phi/kernels/cpu/elementwise_add_kernel.cc | 20 ++++++ .../phi/kernels/kps/elementwise_add_kernel.cu | 22 +++++++ paddle/phi/ops/compat/elementwise_sig.cc | 6 ++ 14 files changed, 74 insertions(+), 167 deletions(-) delete mode 100644 paddle/fluid/operators/elementwise/elementwise_add_op.h delete mode 100644 paddle/fluid/operators/elementwise/elementwise_add_op.kps diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h index ac9c440076257..b1bf5e2778167 100644 --- a/paddle/fluid/operators/dgc_op.h +++ b/paddle/fluid/operators/dgc_op.h @@ -15,9 +15,11 @@ limitations under the License. */ #pragma once #include #include "dgc/dgc.h" + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" namespace paddle { namespace operators { @@ -153,18 +155,18 @@ class DGCOpKernel : public framework::OpKernel { u_out_e.device(eigen_ctx) = m * (u_e + grad_out_e); // v = u + v + g - ElementwiseComputeEx, DeviceContext, T>( - ctx, u, v, 0, AddFunctor(), v_out); + ElementwiseComputeEx, DeviceContext, T>( + ctx, u, v, 0, phi::funcs::AddFunctor(), v_out); - ElementwiseComputeEx, DeviceContext, T>( - ctx, g, v, 0, AddFunctor(), v_out); + ElementwiseComputeEx, DeviceContext, T>( + ctx, g, v, 0, phi::funcs::AddFunctor(), v_out); } else { // u = m * u + g u_out_e.device(eigen_ctx) = m * u_e + grad_out_e; // v = u + v - ElementwiseComputeEx, DeviceContext, T>( - ctx, u, v, 0, AddFunctor(), v_out); + ElementwiseComputeEx, DeviceContext, T>( + ctx, u, v, 0, phi::funcs::AddFunctor(), v_out); } T* v_out_data = v_out->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index 53037c1fa6536..ed9b98a128a21 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" - #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" @@ -125,17 +123,6 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - grad_add, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel>, - ops::ElementwiseAddKernel>); - REGISTER_OP_VERSION(elementwise_add) .AddCheckpoint( R"ROC(Register elementwise_add for adding the attribute of diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h deleted file mode 100644 index d77d4ed036394..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#ifdef __xpu__ -#include -#include -#include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/elementwise/elementwise_xpu.h" -#include "paddle/fluid/platform/device/device_wrapper.h" -#else -#include -#include -#include "paddle/fluid/operators/elementwise/elementwise_op.h" - -// only can include the headers in paddle/phi/include dirs -#include "paddle/phi/kernels/elementwise_add_grad_kernel.h" -#include "paddle/phi/kernels/elementwise_add_kernel.h" -#endif - -namespace paddle { -namespace operators { - -template -class ElementwiseAddKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { -#ifdef __xpu__ - std::vector ins; - std::vector outs; - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - const auto& xpu_ctx = - ctx.template device_context(); - paddle::operators::LaunchElementwiseCudaKernel, 1>( - xpu_ctx, ins, &outs, axis, kps::AddFunctor()); -#else - auto *x = ctx.Input("X"); - auto *y = ctx.Input("Y"); - auto *z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - - auto &dev_ctx = ctx.device_context(); - int axis = ctx.Attr("axis"); - phi::AddRawKernel( - static_cast::TYPE &>(dev_ctx), - *x, *y, axis, z); -#endif - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.kps b/paddle/fluid/operators/elementwise/elementwise_add_op.kps deleted file mode 100644 index ecd52a310acdb..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.kps +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PADDLE_WITH_XPU_KP - -// Please do not modify the following code -#if defined(__CUDA_ARCH__) -#undef __CUDA_ARCH__ -#endif - -#if defined(__CUDACC__) -#undef __CUDACC__ -#endif - -#if defined(__CUDA__) -#undef __CUDA__ -#endif - -#if defined(__NVCC__) -#undef __NVCC__ -#endif - -#include // NOLINT -#include "xpu/kernel/cluster_header.h" // NOLINT -#include "xpu/kernel/debug.h" // NOLINT -#include "xpu/kernel/math.h" // NOLINT - -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#else -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/phi/kernels/gpu/elementwise_grad.h" -#endif - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#ifdef PADDLE_WITH_XPU_KP -REGISTER_OP_KERNEL(elementwise_add, KP, plat::XPUPlace, - ops::ElementwiseAddKernel); -#else -REGISTER_OP_CUDA_KERNEL( - grad_add, ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel>, - ops::ElementwiseAddKernel>); -#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc index d35e3f6641b45..178aa329577b7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_npu.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc index feb73abf3ff08..22a5de4c60941 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h index c4e73c6bf97fd..9542f0742ea34 100644 --- a/paddle/fluid/operators/fused/attn_gemm.h +++ b/paddle/fluid/operators/fused/attn_gemm.h @@ -14,9 +14,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/float16.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" namespace paddle { namespace operators { @@ -67,9 +68,8 @@ class AttnMatMul { ins.emplace_back(bias); outs.emplace_back(bias_out); int elewise_add_axis = -1; - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx_, ins, &outs, elewise_add_axis, AddFunctor()); + phi::funcs::BroadcastKernel( + dev_ctx_, ins, &outs, elewise_add_axis, phi::funcs::AddFunctor()); } } diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 3d75d127ab60a..0e9fba73933b7 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -12,12 +12,12 @@ limitations under the License. */ #pragma once #include "paddle/fluid/operators/dropout_impl.cu.h" -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/fused/fused_softmax_mask.cu.h" #include "paddle/fluid/operators/transpose_op.cu.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/functors.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" @@ -160,9 +160,9 @@ class FMHARef { ins.emplace_back(src_mask_tensor); outs.emplace_back(src_mask_out_tensor); int elewise_add_axis = -1; - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx_, ins, &outs, elewise_add_axis, AddFunctor()); + phi::funcs::BroadcastKernel( + dev_ctx_, ins, &outs, elewise_add_axis, + phi::funcs::AddFunctor()); phi::SoftmaxForwardCUDAKernelDriver( dev_ctx_, *src_mask_out_tensor, softmax_axis, softmax_out_tensor); diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index d26577f06fe68..ec8a4d962e808 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -19,7 +19,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/fluid/operators/fused/attention_layer_norm.h" @@ -543,10 +544,9 @@ class FusedAttentionGradKernel : public framework::OpKernel { ins.emplace_back(d_x); outs.emplace_back(d_x); int elewise_add_axis = -1; - paddle::operators::LaunchElementwiseCudaKernel( + phi::funcs::BroadcastKernel( ctx.cuda_device_context(), ins, &outs, elewise_add_axis, - AddFunctor()); + phi::funcs::AddFunctor()); } }; diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index c38d9f7d4bcbd..2eb9885286dab 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -17,9 +17,10 @@ limitations under the License. */ #include "paddle/fluid/operators/matmul_v2_op.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" @@ -345,9 +346,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { ins[1] = d_x; outs[0] = d_x; int elewise_add_axis = -1; - paddle::operators::LaunchElementwiseCudaKernel( - ctx, ins, &outs, elewise_add_axis, AddFunctor()); + phi::funcs::BroadcastKernel( + ctx, ins, &outs, elewise_add_axis, phi::funcs::AddFunctor()); } void Compute(const framework::ExecutionContext& context) const override { diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index fdd0208c3d316..fe93d323c59bc 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/fluid/operators/fused/attention_layer_norm.h" diff --git a/paddle/phi/kernels/cpu/elementwise_add_kernel.cc b/paddle/phi/kernels/cpu/elementwise_add_kernel.cc index 6070264547249..5019b9f570628 100644 --- a/paddle/phi/kernels/cpu/elementwise_add_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_add_kernel.cc @@ -34,6 +34,14 @@ void AddKernel(const Context& dev_ctx, AddRawKernel(dev_ctx, x, y, axis, out); } +template +void GradAddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + AddKernel(dev_ctx, x, y, out); +} + } // namespace phi using complex64 = ::phi::dtype::complex; @@ -65,3 +73,15 @@ PD_REGISTER_KERNEL(add, int64_t, complex64, complex128) {} + +PD_REGISTER_KERNEL(grad_add, + CPU, + ALL_LAYOUT, + phi::GradAddKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/phi/kernels/kps/elementwise_add_kernel.cu b/paddle/phi/kernels/kps/elementwise_add_kernel.cu index 8f7d45771d9d0..98e39ada32b8b 100644 --- a/paddle/phi/kernels/kps/elementwise_add_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_add_kernel.cu @@ -33,6 +33,14 @@ void AddKernel(const Context& dev_ctx, AddRawKernel(dev_ctx, x, y, axis, out); } +template +void GradAddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + AddKernel(dev_ctx, x, y, out); +} + } // namespace phi #ifdef PADDLE_WITH_XPU_KP @@ -71,4 +79,18 @@ PD_REGISTER_KERNEL(add, phi::dtype::bfloat16, complex64, complex128) {} + +PD_REGISTER_KERNEL(grad_add, + KPS, + ALL_LAYOUT, + phi::GradAddKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + complex64, + complex128) {} #endif diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc index c760c966b0647..17fb1858373d9 100644 --- a/paddle/phi/ops/compat/elementwise_sig.cc +++ b/paddle/phi/ops/compat/elementwise_sig.cc @@ -25,6 +25,11 @@ KernelSignature ElementwiseAddOpArgumentMapping( return KernelSignature("add_raw", {"X", "Y"}, {"axis"}, {"Out"}); } +KernelSignature ElementwiseGradAddOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("grad_add", {"X", "Y"}, {}, {"Out"}); +} + KernelSignature ElementwiseSubOpArgumentMapping( const ArgumentMappingContext& ctx) { int axis = paddle::any_cast(ctx.Attr("axis")); @@ -317,3 +322,4 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_heaviside_grad, phi::ElementwiseHeavisideGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_pow_grad, phi::ElementwisePowGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(grad_add, phi::ElementwiseGradAddOpArgumentMapping); From 07dab9da12231b09271e0f05057458f391d948e4 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Tue, 24 May 2022 22:21:52 +0800 Subject: [PATCH 028/109] fix namespace parser in eager_code_gen (#42957) --- .../final_state_generator/codegen_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 786dd0e3bfc18..8467a6d7dfb6a 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ 
b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -447,7 +447,7 @@ def ParseForwardYamlContents(self): def InferNameSpace(self): api_yaml_path = self.api_yaml_path - if "sparse" in api_yaml_path: + if re.search(r"sparse[a-zA-Z0-9_]*\.yaml", api_yaml_path): self.namespace = "sparse::" - elif "strings" in api_yaml_path: + elif re.search(r"strings[a-zA-Z0-9_]*\.yaml", api_yaml_path): self.namespace = "strings::" From 53e503830cff4b3bcf00e99c8e368ffc62a115d7 Mon Sep 17 00:00:00 2001 From: Baibaifan <39549453+Baibaifan@users.noreply.github.com> Date: Wed, 25 May 2022 09:57:32 +0800 Subject: [PATCH 029/109] [Dygraph]fix_sharding3_offload (#42955) * fix_sharding3_offload * fix_fp16dtype_bug --- .../sharding/group_sharded_stage3.py | 19 ++++++--- .../meta_parallel/sharding/sharding_stage3.py | 40 ++++++++++--------- 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py index 049d3ffa3694f..e44b5d2515d83 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py @@ -205,7 +205,7 @@ def _clear_gradients(self): for param in list(self._unslice_params): param.clear_gradient(False) tmp_var = param.cuda(DEV_ID) - param._clear_data() + if tmp_var.dtype == Type.fp32.value and param2dtype[ param.name] == Type.fp16.value: tmp_var = paddle.cast(tmp_var, Type.fp16.value) @@ -272,6 +272,8 @@ def _handle_unslice_params(self): master_tensor = paddle.cast(param, Type.fp32.value) master_tensor.name = param.name self._optim._master_weights[param.name] = master_tensor + if self._offload: + param.master_weight = paddle.cast(param, Type.fp32.value).cpu() param2dtype[param.name] = param.dtype p_align = self._param2align(param) self._unslice_params2align[param.name] = p_align @@ -369,7 +371,6 @@ def _param_storage(self, param, buffer_size): tmp_var.get_tensor().set(param_cpu.get_tensor(), core.CPUPlace()) del tmp_var param.get_tensor()._set_dims(param_shape) - param._clear_data() # Current rank param_storage if self._offload: @@ -379,6 +380,9 @@ def _param_storage(self, param, buffer_size): value=tmp_tensor, place=core.CPUPlace(), name="slice@" + param.name) + with device_guard(): + param.master_weight = paddle.cast(param.fw_storage, + Type.fp32.value) else: param.fw_storage = core.eager.Tensor( value=buffer._slice(start, end), name="slice@" + param.name) @@ -389,6 +393,7 @@ def _param_storage(self, param, buffer_size): master_tensor = paddle.cast(param.fw_storage, Type.fp32.value) master_tensor.name = param.name self._optim._master_weights[param.fw_storage.name] = master_tensor + param._clear_data() def _register_forward_hooks(self, layer): """ @@ -480,9 +485,8 @@ def _update_params(self): collective.all_reduce(tensor=grad_storage.buffer, group=self._group) if self._offload: for param in list(self._unslice_params): - tmp_var = _device2cpu(param, convert_dtype=True) - tmp_var._share_buffer_to(param) - del tmp_var + param._clear_data() + param.master_weight._share_buffer_to(param) for grad_storage in self._grad_storages.values(): for p in grad_storage._params: @@ -568,7 +572,8 @@ def allreduce_(*_): del self._task_flow.full_param[param.name] if self._offload: - param.fw_storage = _device2cpu(param.fw_storage, True) + param.fw_storage._clear_data() + param.master_weight._share_buffer_to(param.fw_storage) 
return allreduce_ @@ -856,6 +861,7 @@ def _PartitionParam(param): if not hasattr(param, "fw_storage"): setattr(param, "fw_storage", None) setattr(param, "bw_storage", None) + setattr(param, "master_weight", None) setattr(param, "status", "all") setattr(param, "use_count", 0) return param @@ -864,6 +870,7 @@ def _PartitionParam(param): def _UnsliceParam(param): if not hasattr(param, "unslice"): setattr(param, "unslice", True) + setattr(param, "master_weight", None) return param diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py index f96273cc84caf..7bb1517f12169 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py @@ -199,7 +199,7 @@ def _clear_gradients(self): param.clear_gradient(False) param._gradient_set_empty(False) tmp_var = param.cuda(DEV_ID) - param._clear() + if tmp_var.dtype == Type.fp32.value and param2dtype[ param.name] == Type.fp16.value: tmp_var = paddle.cast(tmp_var, Type.fp16.value) @@ -220,19 +220,14 @@ def _update_params_slice(self): self._optim._param_groups = slice_params + list( self._unslice_params) else: - params_name_list = list(map(lambda p: p.name, update_list)) - fw_storage_name_list = list( - map(lambda p: p.fw_storage.name, update_list)) for param_group in self._optim._param_groups: p_group = [] for p in param_group['params']: - if p.name in params_name_list: + if hasattr(p, "fw_storage"): p_group.append(p.fw_storage) - elif p.name in fw_storage_name_list: - p_group.append(update_list[fw_storage_name_list.index( - p.name)].fw_storage) - elif p in self._unslice_params: + else: p_group.append(p) + param_group['params'] = p_group def forward(self, *inputs, **kwargs): @@ -268,6 +263,8 @@ def _handle_unslice_params(self): if param.dtype == Type.fp16.value and not self._offload: self._optim._master_weights[param.name] = paddle.cast( param, Type.fp32.value) + if self._offload: + param.master_weight = paddle.cast(param, Type.fp32.value).cpu() param2dtype[param.name] = param.dtype p_align = self._param2align(param) self._unslice_params2align[param.name] = p_align @@ -335,11 +332,12 @@ def _add_manage_info(trainable_param): self._param2buffer[param.name].append( (rank_ * pre_buffer, (rank_ + 1) * pre_buffer)) - # 3.Flatten layer params and release other rank buffer - self._param_storage(param, buffer_size) # Record param's dtype param2dtype[param.name] = param.dtype + # 3.Flatten layer params and release other rank buffer + self._param_storage(param, buffer_size) + def _param_storage(self, param, buffer_size): """ This is a function to simplify the handling of parameter InternalStorages. 
@@ -365,13 +363,15 @@ def _param_storage(self, param, buffer_size): tmp_var.value().get_tensor().set(param_cpu.value().get_tensor(), core.CPUPlace()) param.value().get_tensor()._set_dims(param_shape) - param._clear() # Current rank param_storage if self._offload: param.fw_storage = core.VarBase( buffer._slice(start, end), core.CPUPlace(), "slice@" + param.name) + with device_guard(device="cpu"): + param.master_weight = paddle.cast(param.fw_storage, + Type.fp32.value) else: param.fw_storage = core.VarBase( buffer._slice(start, end), "slice@" + param.name) @@ -381,6 +381,7 @@ def _param_storage(self, param, buffer_size): if param.dtype == Type.fp16.value and not self._offload: self._optim._master_weights[param.fw_storage.name] = paddle.cast( param.fw_storage, Type.fp32.value) + param._clear() def _register_forward_hooks(self, layer): """ @@ -482,9 +483,8 @@ def _update_params(self): if self._offload: for param in list(self._unslice_params): - tmp_var = _device2cpu(param, convert_dtype=True) - tmp_var._share_buffer_to(param) - tmp_var._clear() + param._clear() + param.master_weight._share_buffer_to(param) for grad_storage in self._grad_storages.values(): for p in grad_storage._params: @@ -553,8 +553,9 @@ def allreduce_(*_): cpu_grad = _device2cpu( core.VarBase(full_grad._slice(start, end)) .detach().clone(), True) - param.bw_storage = paddle.add(param.bw_storage, - cpu_grad) + with device_guard(device="cpu"): + param.bw_storage = paddle.add(param.bw_storage, + cpu_grad) else: # param.bw_storage.add_( # core.VarBase(full_grad._slice(start, end)) @@ -581,7 +582,8 @@ def allreduce_(*_): tmp_var._clear() if self._offload: - param.fw_storage = _device2cpu(param.fw_storage, True) + param.fw_storage._clear() + param.master_weight._share_buffer_to(param.fw_storage) return allreduce_ @@ -869,6 +871,7 @@ def _PartitionParam(param): if not hasattr(param, "fw_storage"): setattr(param, "fw_storage", None) setattr(param, "bw_storage", None) + setattr(param, "master_weight", None) setattr(param, "status", "all") setattr(param, "use_count", 0) return param @@ -877,6 +880,7 @@ def _PartitionParam(param): def _UnsliceParam(param): if not hasattr(param, "unslice"): setattr(param, "unslice", True) + setattr(param, "master_weight", None) return param From 4218957b202cedb7d52686f1ad555015e664f636 Mon Sep 17 00:00:00 2001 From: Zhangjingyu06 <92561254+Zhangjingyu06@users.noreply.github.com> Date: Wed, 25 May 2022 09:59:13 +0800 Subject: [PATCH 030/109] modify xpu.cmake *test=kunlun (#42962) --- cmake/external/xpu.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index a3287d6bfd94e..43d5002fe3819 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -9,7 +9,7 @@ SET(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220511") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220520") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() From e5fc68b2c34cc068274d33d127ecfda75e4ed4c2 Mon Sep 17 00:00:00 2001 From: Ming-Xu Huang Date: Wed, 25 May 2022 10:17:27 +0800 Subject: [PATCH 031/109] Dynamic graph support to Automatic SParsity. (#41177) * Dynamic graph support to Automatic SParsity. 1. Added dynamic support to ASP module (paddle.fluid.contrib.sparsity). 2. Added ASP related unit-tests regards to above changes. 3. 
Put ASP module under paddle.static for now, waiting for APIs confirmation from Paddle. * Modified documents of functions to have correct examples. * Update in_dygraph_mode to paddle.in_dynamic_mode() * Modified documents of functions and added comments * Minor changes. * Fix example errors in asp API. * Code Change for Review 1. Added more examples in documents. 2. Chaged test_asp_pruning_static. * Minor changes * Update ASP function documents. * Update ASP function documents. * Reduce test case size of asp pruning due CI time limit. * Update time limitation to some asp UTs. * Fix sample code errors. * Fix sample code errors. * Fix sample code errors. * Update time limitation to parts of ASP UTs. * Update UTs to fit with CI. * Reduce problem size in python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_dynamic.py * Added paddle.asp * Fixed type casting error of OpRole.Optimize in new dygraph mode. * Made set_excluded_layers be compatible with 2.2 * Fix example code of calculate_density. * Update code examples. * Move paddle.asp to paddle.incubate.asp * Fixed an example error of calculate_density --- python/paddle/fluid/contrib/sparsity/asp.py | 710 +++++++++++++----- python/paddle/fluid/contrib/sparsity/utils.py | 5 +- .../fluid/tests/unittests/asp/CMakeLists.txt | 13 +- .../tests/unittests/asp/asp_pruning_base.py | 5 +- .../asp/test_asp_customized_pruning.py | 91 +++ .../asp/test_asp_optimize_dynamic.py | 175 +++++ ...ptimize.py => test_asp_optimize_static.py} | 31 +- .../unittests/asp/test_asp_pruning_2d_best.py | 37 - .../asp/test_asp_pruning_2d_greedy.py | 39 - .../unittests/asp/test_asp_pruning_dynamic.py | 107 +++ .../unittests/asp/test_asp_pruning_static.py | 111 +++ .../tests/unittests/asp/test_asp_save_load.py | 175 +++++ .../tests/unittests/asp/test_asp_utils.py | 9 +- .../unittests/asp/test_fleet_with_asp.py | 91 --- .../asp/test_fleet_with_asp_dynamic.py | 156 ++++ ...p_amp.py => test_fleet_with_asp_static.py} | 61 +- .../test_fleet_sharding_meta_optimizer.py | 1 + python/paddle/incubate/__init__.py | 1 + .../asp/__init__.py} | 35 +- python/paddle/static/sparsity/__init__.py | 8 +- python/setup.py.in | 1 + 21 files changed, 1463 insertions(+), 399 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/asp/test_asp_optimize_dynamic.py rename python/paddle/fluid/tests/unittests/asp/{test_asp_optimize.py => test_asp_optimize_static.py} (89%) delete mode 100644 python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_best.py delete mode 100644 python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_greedy.py create mode 100644 python/paddle/fluid/tests/unittests/asp/test_asp_pruning_dynamic.py create mode 100644 python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py create mode 100644 python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py delete mode 100644 python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp.py create mode 100644 python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_dynamic.py rename python/paddle/fluid/tests/unittests/asp/{test_fleet_with_asp_amp.py => test_fleet_with_asp_static.py} (67%) rename python/paddle/{fluid/tests/unittests/asp/test_asp_pruning_1d.py => incubate/asp/__init__.py} (51%) diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py index 30439ad736d26..c366af7237d1b 100644 --- a/python/paddle/fluid/contrib/sparsity/asp.py +++ b/python/paddle/fluid/contrib/sparsity/asp.py @@ -20,12 +20,13 @@ import copy import numpy as np import paddle +from 
paddle.fluid.framework import dygraph_only from paddle.fluid import global_scope, program_guard, layers from paddle.fluid.initializer import ConstantInitializer from paddle.fluid.contrib import sparsity +from paddle.fluid import core from paddle.fluid.contrib.sparsity.supported_layer_list import supported_layers_and_prune_func_map from paddle.fluid.contrib.sparsity.supported_layer_list import _default_pruning -from paddle.fluid import core OpRole = core.op_proto_and_checker_maker.OpRole OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() @@ -35,45 +36,90 @@ ] -def set_excluded_layers(main_program, param_names): +def set_excluded_layers(param_names, main_program=None): r""" Set parameter name of layers which would not be pruned as sparse weights. Args: + param_names (list of string): A list contains names of parameters. main_program (Program, optional): Program with model definition and its parameters. - param_names (list): A list contains names of parameters. + If None is given, then it would be set as `paddle.static.default_main_program(). + Default is None. Examples: - .. code-block:: python - - import paddle - from paddle.static import sparsity - - paddle.enable_static() - - main_program = paddle.static.Program() - startup_program = paddle.static.Program() - - with paddle.static.program_guard(main_program, startup_program): - input_data = paddle.static.data(name='data', shape=[None, 128]) - label = paddle.static.data(name='label', shape=[None, 10]) - hidden = paddle.static.nn.fc(x=input_data, num_flatten_dims=-1, size=32, activation=None, name="need_sparse_fc") - hidden = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=32, activation=None, name="need_dense_fc") - prob = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=10, activation=None) - loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) - - # Setup exluded layers out from ASP workflow. - # Please note, excluded_layers must be set before calling `optimizer.minimize()`. - sparsity.set_excluded_layers(main_program, ["need_dense_fc"]) - - optimizer = paddle.optimizer.SGD(learning_rate=0.1) - optimizer = paddle.static.amp.decorate(optimizer ) - # Calling sparsity.decorate() to wrap minimize() in optimizer, which - # will insert necessary masking operations for ASP workflow. - optimizer = sparsity.decorate(optimizer) - optimizer.minimize(loss, startup_program) + 1. Usage of Dynamic Graph + + .. code-block:: python + + import paddle + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 100) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + prediction = self.linear1(hidden) + return prediction + + my_layer = MyLayer() + optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=my_layer.parameters()) + + # Need to set excluded layers before calling decorate + paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()]) + + optimizer = paddle.incubate.asp.decorate(optimizer) + + 2. Usage of Static Graph + + .. 
code-block:: python + + import paddle + + paddle.enable_static() + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 100) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + prediction = self.linear1(hidden) + return prediction + + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + + with paddle.static.program_guard(main_program, startup_program): + input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) + label = paddle.static.data(name='label', shape=[None, 100]) + my_layer = MyLayer() + prob = my_layer(input_data) + loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + + # Setup exluded layers out from ASP workflow. + # Please note, excluded_layers must be set before calling optimizer.minimize(). + paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()], main_program) + + optimizer = paddle.optimizer.SGD(learning_rate=0.1) + optimizer = paddle.static.amp.decorate(optimizer ) + # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + # will insert necessary masking operations for ASP workflow. + optimizer = paddle.incubate.asp.decorate(optimizer) + optimizer.minimize(loss, startup_program) """ + if main_program is None: + main_program = paddle.static.default_main_program() ASPHelper.set_excluded_layers( - main_program=main_program, param_names=param_names) + param_names=param_names, main_program=main_program) def reset_excluded_layers(main_program=None): @@ -83,153 +129,310 @@ def reset_excluded_layers(main_program=None): Args: main_program (Program, optional): Program with model definition and its parameters. - Examples: - .. code-block:: python + If None is given, then this function would reset all excluded_layers. + Default is None. + Examples: + 1. Usage of Dynamic Graph - import paddle - from paddle.static import sparsity + .. code-block:: python - paddle.enable_static() + import paddle - main_program = paddle.static.Program() - startup_program = paddle.static.Program() + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 100) - with paddle.static.program_guard(main_program, startup_program): - input_data = paddle.static.data(name='data', shape=[None, 128]) - label = paddle.static.data(name='label', shape=[None, 10]) - hidden = paddle.static.nn.fc(x=input_data, num_flatten_dims=-1, size=32, activation=None, name="my_first_fc") - hidden = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=32, activation=None, name="my_second_fc") - prob = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=10, activation=None) - loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + prediction = self.linear1(hidden) + return prediction - # Setup exluded layers out from ASP workflow. - # Please note, excluded_layers must be set before calling `optimizer.minimize()`. - sparsity.set_excluded_layers(main_program, ["my_second_fc"]) - # Now the weights of "my_second_fc" would not be included in Automatic SParsity's workflow. 
+ my_layer = MyLayer() + optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=my_layer.parameters()) + + # Need to set excluded layers before calling decorate + paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()]) + # Reset excluded_layers, all supported layers would be included into Automatic SParsity's workflow. + # Please note, reset_excluded_layers also must be called before calling sparsity.decorate(). + paddle.incubate.asp.reset_excluded_layers() + + optimizer = paddle.incubate.asp.decorate(optimizer) + + 2. Usage of Static Graph + + .. code-block:: python - # Reset excluded_layers, all FC layers would be included into Automatic SParsity's workflow. - # Please note, reset_excluded_layers also must be called before calling `optimizer.minimize()`. - sparsity.reset_excluded_layers(main_program) + import paddle + + paddle.enable_static() + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 100) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + prediction = self.linear1(hidden) + return prediction + + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + + with paddle.static.program_guard(main_program, startup_program): + input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) + label = paddle.static.data(name='label', shape=[None, 100]) + my_layer = MyLayer() + prob = my_layer(input_data) + loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + + # Setup exluded layers out from ASP workflow. + # Please note, excluded_layers must be set before calling optimizer.minimize(). + paddle.incubate.asp.set_excluded_layers([my_layer.linear1.full_name()], main_program) + # Reset excluded_layers, all supported layers would be included into Automatic SParsity's workflow. + # Please note, reset_excluded_layers also must be called before calling optimizer.minimize(). + paddle.incubate.asp.reset_excluded_layers(main_program) + + optimizer = paddle.optimizer.SGD(learning_rate=0.1) + optimizer = paddle.static.amp.decorate(optimizer ) + # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + # will insert necessary masking operations for ASP workflow. + optimizer = paddle.incubate.asp.decorate(optimizer) + optimizer.minimize(loss, startup_program) """ ASPHelper.reset_excluded_layers(main_program=main_program) def decorate(optimizer): r""" - Wrap the given optimizer as a OptimizerWithSparsityGuarantee, - which would insert necessary ops for ASP workflows when calling minimize() + Wrap the given optimizer as a OptimizerWithSparsityGuarantee, + If runnig with dynamic graph mode. ASP would creates mask variables for supported parameters. + Else if in static graph mode, ASP would creates mask variables and inserts necessary ops + when calling minimize() Args: optimizer (Optimizer): A Optimizer used for training. Returns: OptimizerWithSparsityGuarantee: A wrapper for ASP to decorate `minimize` function of the given optimizer. Examples: - .. code-block:: python + 1. Usage of Dynamic Graph - import paddle - from paddle.static import sparsity + .. 
code-block:: python + + import paddle + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 32) + self.linear2 = paddle.nn.Linear(32, 32) + self.linear3 = paddle.nn.Linear(32, 10) - main_program = paddle.static.Program() - startup_program = paddle.static.Program() + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + hidden = self.linear1(hidden) + hidden = self.linear2(hidden) + prediction = self.linear3(hidden) + return prediction - paddle.enable_static() + my_layer = MyLayer() + optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=my_layer.parameters()) - with paddle.static.program_guard(main_program, startup_program): - input_data = paddle.static.data(name='data', shape=[None, 128]) - label = paddle.static.data(name='label', shape=[None, 10]) - hidden = paddle.static.nn.fc(x=input_data, num_flatten_dims=-1, size=32, activation=None) - prob = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=10, activation=None) - loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + # Calling paddle.incubate.asp.decorate() to wrap step() in optimizer, which + # will apply necessary masking operations for ASP workflow. + # In dynamic graph mode, ASP would create related mask variables during decoration. + optimizer = paddle.incubate.asp.decorate(optimizer) - optimizer = paddle.optimizer.SGD(learning_rate=0.1) - optimizer = sparsity.decorate(optimizer) - # if do sparse training with Fleet, please replace above decorate with: - # strategy = paddle.distributed.fleet.DistributedStrategy() - # strategy.asp = True - # optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + 2. Usage of Static Graph - optimizer.minimize(loss, startup_program) + .. code-block:: python + + import paddle + + paddle.enable_static() + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 100) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + prediction = self.linear1(hidden) + return prediction + + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + + with paddle.static.program_guard(main_program, startup_program): + input_data = paddle.static.data(name='data', shape=[None, 3, 224, 224]) + label = paddle.static.data(name='label', shape=[None, 100]) + my_layer = MyLayer() + prob = my_layer(input_data) + loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + + optimizer = paddle.optimizer.SGD(learning_rate=0.1) + # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + # will insert necessary masking operations for ASP workflow. + # In static graph mode, ASP creates related mask variables + # during minimize(). 
+ optimizer = paddle.incubate.asp.decorate(optimizer) + optimizer.minimize(loss, startup_program) """ return ASPHelper.decorate(optimizer) -def prune_model(main_program=None, - n=2, - m=4, - mask_algo='mask_1d', - with_mask=True): +def prune_model(model, n=2, m=4, mask_algo='mask_1d', with_mask=True): r""" - Pruning parameters of supported layers in :attr:`main_program` via + Pruning parameters of supported layers in :attr:`model` via specified mask generation function given by :attr:`mask_algo`. This function supports both training and inference controlled by :attr:`with_mask`. If :attr:`with_mask` is True, it would also prune parameter related ASP mask Variables, else only prunes parameters. - *Note*: If parameters are supported and in FP16, please set :attr:`n`=2, :attr:`m`=4, - if they in FP32, then :attr:`n`=1, :attr:`m`=2` to further enable Sparse Tensor Core acceleration. - - *Note*: If calling this function with :attr:`with_mask`, it should call `OptimizerWithSparsityGuarantee.minimize` + *Note*: (Static graph mode) If calling this function with :attr:`with_mask`, it should call `OptimizerWithSparsityGuarantee.minimize` and initialization (`exe.run(startup_program`)) before (For successfully obtain mask Variable). Typically set `with_mask` as true for training (have called `OptimizerWithSparsityGuarantee.minimize`) and false for - inference only. To obtain OptimizerWithSparsityGuarantee, please see `sparsity.decoreate()`. + inference only. To obtain OptimizerWithSparsityGuarantee, please see `paddle.incubate.asp.decoreate()`. Args: - main_program (Program, optional): Program with model definition and its parameters. Default is `paddle.static.default_main_program() - n (int): n of `n:m` sparse pattern. - m (int): m of `n:m` sparse pattern. + model (Program|nn.Layer): Program with model definition and its parameters, or a object of `paddle.nn.Layer`. + n (int, optional): n of `n:m` sparse pattern. Default is 2. + m (int, optional): m of `n:m` sparse pattern. Default is 4. mask_algo (string, optional): The function name to generate spase mask. Default is `mask_1d`. The vaild inputs should be one of 'mask_1d', 'mask_2d_greedy' and 'mask_2d_best'. with_mask (bool, optional): To prune mask Variables related to parameters or not. Ture is purning also, False is not. Defalut is True. Returns: dictionary: A dictionary with key: `parameter name` (string) and value: its corresponding mask Variable. Examples: - .. code-block:: python - - import paddle - from paddle.static import sparsity - - paddle.enable_static() - - main_program = paddle.static.Program() - startup_program = paddle.static.Program() - - with paddle.static.program_guard(main_program, startup_program): - input_data = paddle.static.data(name='data', shape=[None, 128]) - label = paddle.static.data(name='label', shape=[None, 10]) - hidden = paddle.static.nn.fc(x=input_data, num_flatten_dims=-1, size=32, activation=None, name="need_sparse_fc") - hidden = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=32, activation=None, name="need_dense_fc") - prob = paddle.static.nn.fc(x=hidden, num_flatten_dims=-1, size=10, activation=None) - loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) - - # Setup exluded layers out from ASP workflow. - # Please note, excluded_layers must be set before calling `optimizer.minimize()`. - sparsity.set_excluded_layers(main_program, ["need_dense_fc"]) + 1. 
Usage of Dynamic Graph - optimizer = paddle.optimizer.SGD(learning_rate=0.1) - optimizer = paddle.static.amp.decorate(optimizer ) - # Calling sparsity.decorate() to wrap minimize() in optimizer, which - # will insert necessary masking operations for ASP workflow. - optimizer = sparsity.decorate(optimizer) - optimizer.minimize(loss, startup_program) + .. code-block:: python - device = paddle.device.get_device() - place = paddle.set_device(device) + import paddle + import numpy as np + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 32) + self.linear2 = paddle.nn.Linear(32, 32) + self.linear3 = paddle.nn.Linear(32, 10) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + hidden = self.linear1(hidden) + hidden = self.linear2(hidden) + prediction = self.linear3(hidden) + return prediction + + my_layer = MyLayer() + loss_fn = paddle.nn.MSELoss(reduction='mean') + + optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=my_layer.parameters()) + + # Calling paddle.incubate.asp.decorate() to wrap step() in optimizer, which + # will apply necessary masking operations for ASP workflow. + # In dynamic graph mode, ASP would create related mask variables during decoration. + optimizer = paddle.incubate.asp.decorate(optimizer) + + # Must call paddle.incubate.asp.decorate() first before calling paddle.incubate.asp.prune_model() + paddle.incubate.asp.prune_model(my_layer, mask_algo='mask_2d_best') + + for i in range(10): + imgs = paddle.to_tensor( + np.random.randn(64, 3, 32, 32), + dtype='float32', stop_gradient=False) + labels = paddle.to_tensor( + np.random.randint(10, size=(64, 1)), + dtype='float32', stop_gradient=False) + output = my_layer(imgs) + loss = loss_fn(output, labels) + loss.backward() + optimizer.step() + optimizer.clear_grad() + + 2. Usage of Static Graph - exe = paddle.static.Executor(place) - exe.run(startup_program) + .. code-block:: python - # Must call `exe.run(startup_program)` first before calling `sparsity.prune_model` - sparsity.prune_model(main_program, mask_algo='mask_2d_best') + import paddle + import numpy as np + + paddle.enable_static() + + class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 32) + self.linear2 = paddle.nn.Linear(32, 32) + self.linear3 = paddle.nn.Linear(32, 10) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + hidden = self.linear1(hidden) + hidden = self.linear2(hidden) + prediction = self.linear3(hidden) + return prediction + + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + + with paddle.static.program_guard(main_program, startup_program): + input_data = paddle.static.data(name='data', shape=[None, 3, 32, 32]) + label = paddle.static.data(name='label', shape=[None, 1]) + my_layer = MyLayer() + prob = my_layer(input_data) + loss = paddle.mean(paddle.nn.functional.square_error_cost(prob, label)) + + optimizer = paddle.optimizer.SGD(learning_rate=0.1) + # Calling paddle.incubate.asp.decorate() to wrap minimize() in optimizer, which + # will insert necessary masking operations for ASP workflow. 
+ # In static graph mode, ASP creates related mask variables + # during minimize(). + optimizer = paddle.incubate.asp.decorate(optimizer) + optimizer.minimize(loss, startup_program) + + device = paddle.device.get_device() + place = paddle.set_device(device) + + exe = paddle.static.Executor(place) + exe.run(startup_program) + + # Must call exe.run(startup_program) first before calling paddle.asp.prune_model() + paddle.incubate.asp.prune_model(my_layer, mask_algo='mask_2d_best') + # it also be accepted to call + # paddle.incubate.asp.prune_model(main_program, mask_algo='mask_2d_best') + + for i in range(10): + imgs = np.random.randn(64, 3, 32, 32).astype('float32') + labels = np.random.randint(10, size=(64, 1)).astype('float32') + exe.run(main_program, feed={'data':imgs, 'label':labels}) """ - if main_program is not None and hasattr( - main_program, - "distributed_info_") and main_program.distributed_info_[ - "sharding_degree"] > 1 and paddle.fluid.is_compiled_with_cuda(): - gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) - place = paddle.CUDAPlace(gpu_id) - else: - device = paddle.device.get_device() - place = paddle.set_device(device) + device = paddle.device.get_device() + place = paddle.set_device(device) MaskAlgo_mapping = { 'mask_1d': sparsity.MaskAlgo.MASK_1D, @@ -237,11 +440,26 @@ def prune_model(main_program=None, 'mask_2d_best': sparsity.MaskAlgo.MASK_2D_BEST } assert (mask_algo in MaskAlgo_mapping), \ - 'The "mask_algo" should be one of ["mask_1d", "mask_2d_greedy", "mask_2d_best"]' + 'The "mask_algo" should be one of ["mask_1d", "mask_2d_greedy", "mask_2d_best"]' + + prune_func = None + if isinstance(model, paddle.nn.Layer): + prune_func = ASPHelper.prune_model_by_layer + elif isinstance(model, paddle.static.Program): + prune_func = ASPHelper.prune_model_by_program + if hasattr(model, "distributed_info_") and \ + model.distributed_info_["sharding_degree"] > 1 and \ + paddle.fluid.is_compiled_with_cuda(): + gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) + place = paddle.CUDAPlace(gpu_id) + else: + raise TypeError( + "model should be paddle.nn.Layer or paddle.static.Program, but got {}". + format(type(model))) - return ASPHelper.prune_model( - place=place, - main_program=main_program, + return prune_func( + place, + model, n=n, m=m, mask_algo=MaskAlgo_mapping[mask_algo], @@ -300,7 +518,7 @@ class ASPHelper(object): __asp_info = {} @classmethod - def set_excluded_layers(cls, main_program, param_names): + def set_excluded_layers(cls, param_names, main_program): r""" This is the implementation of `sparsity.set_excluded_layers`, for details please see explanation in `sparsity.set_excluded_layers`. """ @@ -313,8 +531,8 @@ def reset_excluded_layers(cls, main_program=None): This is the implementation of `sparsity.reset_excluded_layers`, for details please see explanation in `sparsity.reset_excluded_layers`. """ if main_program is None: - for asp_info in cls.__asp_info: - asp_info.reset_excluded_layers() + for prog in cls.__asp_info: + cls.__asp_info[prog].reset_excluded_layers() else: cls._get_program_asp_info(main_program).reset_excluded_layers() @@ -323,16 +541,25 @@ def decorate(optimizer): r""" This is the implementation of `sparsity.decorate`, for details please see explanation in `sparsity.decorate`. """ + if paddle.in_dynamic_mode(): + # main_prog and startup_prog would be used with paddle.static.program_guard + # to create ASP masks. Moreover, main_prog is a key to map paddle.static.Program + # to its own ASP informantion, like ASP mask variables. 
For dynamic graph, we use + # default_main_program as the key. + main_prog = paddle.static.default_main_program() + startup_prog = paddle.static.default_startup_program() + ASPHelper._create_mask_variables(main_prog, startup_prog, + optimizer._parameter_list) return OptimizerWithSparsityGuarantee(optimizer) @classmethod - def prune_model(cls, - place, - main_program=None, - n=2, - m=4, - mask_algo=sparsity.MaskAlgo.MASK_1D, - with_mask=True): + def prune_model_by_program(cls, + place, + main_program=None, + n=2, + m=4, + mask_algo=sparsity.MaskAlgo.MASK_1D, + with_mask=True): r""" This is the implementation of `sparsity.prune_model`, for details please see explanation in `sparsity.prune_model`. """ @@ -366,9 +593,63 @@ def prune_model(cls, np.array(weight_mask_tensor).dtype) weight_mask_tensor.set(weight_sparse_mask, place) asp_info.update_masks(param.name, weight_sparse_mask) - return asp_info.masks.copy() + @classmethod + def prune_model_by_layer(cls, + place, + layer, + n=2, + m=4, + mask_algo=sparsity.MaskAlgo.MASK_1D, + with_mask=True): + r""" + This is the implementation of `sparsity.prune_model`, for details please see explanation in `sparsity.prune_model`. + """ + if paddle.in_dynamic_mode(): + main_program = paddle.static.default_main_program() + asp_info = cls._get_program_asp_info(main_program) + + for param in layer.parameters(): + if ASPHelper._is_supported_layer(main_program, param.name): + weight_nparray = param.numpy() + + prune_func = ASPHelper._get_prune_func_by_name(param.name) + + weight_pruned_nparray, weight_sparse_mask = \ + prune_func(weight_nparray, m, n, mask_algo, param.name) + + weight_pruned_nparray = weight_pruned_nparray.astype( + weight_nparray.dtype) + param.set_value(weight_pruned_nparray) + + if with_mask: + weight_mask_param = asp_info.mask_vars.get(param.name, + None) + assert weight_mask_param is not None, \ + 'Cannot find {} variable, please call sparsity.decorate() to' \ + ' decorate your optimizer first!'.format(ASPHelper._get_mask_name(param.name)) + weight_mask_param.set_value(weight_sparse_mask) + + asp_info.update_masks(param.name, weight_sparse_mask) + + return asp_info.masks.copy() + else: + # This for loop is only used to obtain Block and Program from + # first parameters. + target_program = None + for param in layer.parameters(): + target_program = param.block.program + assert target_program is not None, \ + 'Cannot get paddle.static.Program from Paddle.nn.Layer.' 
+ return ASPHelper.prune_model_by_program( + place, + target_program, + n=n, + m=m, + mask_algo=mask_algo, + with_mask=with_mask) + @staticmethod def _get_mask_name(param_name): r""" @@ -393,13 +674,15 @@ def _get_not_ASP_relevant_vars(main_program): """ var_list = [] for param in main_program.global_block().all_parameters(): - if ASPHelper.MASK_APPENDDED_NAME not in param.name: + param_name_list = param.name.split('.') + + if ASPHelper.MASK_APPENDDED_NAME not in param_name_list: var_list.append(param) return var_list @classmethod def _get_program_asp_info(cls, main_program): - if not main_program in cls.__asp_info: + if main_program not in cls.__asp_info: cls.__asp_info[main_program] = ProgramASPInfo() return cls.__asp_info[main_program] @@ -508,14 +791,37 @@ def _minimize(cls, optimizer_ops, params_and_grads = optimizer.minimize( loss, startup_program, parameter_list, no_grad_set=no_grad_set) - cls._create_mask_variables(main_program, startup_program, - params_and_grads) - cls._insert_sparse_mask_ops(main_program, params_and_grads) + + params_only = [pg[0] for pg in params_and_grads] + cls._create_mask_variables(main_program, startup_program, params_only) + cls._insert_sparse_mask_ops(main_program, params_only) return optimizer_ops, params_and_grads @classmethod - def _create_mask_variables(cls, main_program, startup_program, - params_and_grads): + @dygraph_only + def _step(cls, optimizer): + r""" + This function is a decorator of `step` function in `Optimizer`. + There are three steps: + + 1. Call :attr:`optimizer`.step() + 2. Mask parameters with sparse masks. + + *Note*: Please use `ASP.decorate` instead when applying distributed training with `Fleet`. + (Due to there is a invisiable graphs optimization in `Fleet.minimize()` which make training graph + cannot be modified anymore.) + + Args: + optimizer (Optimizer): A Optimizer used for training. + """ + optimizer.step() + main_prog = paddle.static.default_main_program() + with paddle.fluid.dygraph.no_grad(): + ASPHelper._insert_sparse_mask_ops(main_prog, + optimizer._parameter_list) + + @classmethod + def _create_mask_variables(cls, main_program, startup_program, params): r""" Create sparse mask Tensors according to supported layers in :attr:`main_program`. This function is called in second step of `ASPHelper._minimize` @@ -523,48 +829,45 @@ def _create_mask_variables(cls, main_program, startup_program, Args: main_program (Program): Program with model definition and its parameters. startup_program (Program): Program for initializing parameters. - params_and_grads (list): Variable pairs of parameters and their gradients. + params (list): Variable parameters. 
""" asp_info = cls._get_program_asp_info(main_program) with program_guard(main_program, startup_program): - for param_and_grad in params_and_grads: - if ASPHelper._is_supported_layer(main_program, - param_and_grad[0].name): - mask_param = layers.create_parameter( - name=ASPHelper._get_mask_name(param_and_grad[0].name), - shape=param_and_grad[0].shape, - dtype=param_and_grad[0].dtype, - default_initializer=ConstantInitializer(value=1.0)) - mask_param.stop_gradient = True - mask_param.trainable = False - asp_info.update_mask_vars(param_and_grad[0].name, - mask_param) + for param in params: + if ASPHelper._is_supported_layer(main_program, param.name): + if param.name not in asp_info.mask_vars: + mask_param = layers.create_parameter( + name=ASPHelper._get_mask_name(param.name), + shape=param.shape, + dtype=param.dtype, + default_initializer=ConstantInitializer(value=1.0)) + mask_param.stop_gradient = True + mask_param.trainable = False + asp_info.update_mask_vars(param.name, mask_param) @classmethod - def _insert_sparse_mask_ops(cls, main_program, param_grads): + def _insert_sparse_mask_ops(cls, main_program, params): r""" Insert masking ops in the end of parameters update. This function is called in third step of `ASPHelper._minimize` Args: main_program (Program): Program with model definition and its parameters. - params_and_grads (list): Variable pairs of parameters and their gradients. + params (list): Variable parameters. """ block = main_program.global_block() asp_info = cls._get_program_asp_info(main_program) - for param_grad in param_grads: - if param_grad[0].name in asp_info.mask_vars: + for param in params: + if param.name in asp_info.mask_vars: block.append_op( type='elementwise_mul', - inputs={ - "X": param_grad[0], - 'Y': asp_info.mask_vars[param_grad[0].name] - }, - outputs={'Out': param_grad[0]}, + inputs={"X": param, + 'Y': asp_info.mask_vars[param.name]}, + outputs={'Out': param}, attrs={ 'axis': -1, 'use_mkldnn': False, - OP_ROLE_KEY: OpRole.Optimize + OP_ROLE_KEY: int(OpRole.Optimize) }) @@ -579,8 +882,9 @@ class OptimizerWithSparsityGuarantee(object): def __init__(self, optimizer): self._optimizer = optimizer - self._learning_rate = optimizer._learning_rate - self._learning_rate_map = optimizer._learning_rate_map + + def __getattr__(self, item): + return getattr(self._optimizer, item) def minimize(self, loss, @@ -605,3 +909,55 @@ def minimize(self, startup_program=startup_program, parameter_list=parameter_list, no_grad_set=no_grad_set) + + @dygraph_only + def step(self): + r""" + This function is a decorator of `step` function in `Optimizer`. + There are three steps: + + 1. Call :attr:`optimizer`.step() + 2. Mask parameters with sparse masks. + + *Note*: Please use `ASP.decorate` instead when applying distributed training with `Fleet`. + (Due to there is a invisiable graphs optimization in `Fleet.minimize()` which make training graph + cannot be modified anymore.) + + Args: + optimizer (Optimizer): A Optimizer used for training. + """ + ASPHelper._step(self._optimizer) + + @dygraph_only + def state_dict(self): + r""" + This function is a decorator of `state_dict` function in `Optimizer`. 
+ + Returns: + state_dict(dict) : dict contains all the Tensor used by optimizer + """ + state_dict = self._optimizer.state_dict() + asp_info = ASPHelper._get_program_asp_info( + paddle.static.default_main_program()) + for param_name, var in asp_info.mask_vars.items(): + state_dict.update({ASPHelper._get_mask_name(param_name): var}) + return state_dict + + @dygraph_only + def set_state_dict(self, state_dict): + r""" + This function is a decorator of `set_state_dict` function in `Optimizer`. + Args: + state_dict(dict) : Dict contains all the Tensor needed by optimizer + Return: + None + """ + asp_info = ASPHelper._get_program_asp_info( + paddle.static.default_main_program()) + for param_name, var in asp_info.mask_vars.items(): + param_mask_name = ASPHelper._get_mask_name(param_name) + assert param_mask_name in state_dict, \ + "The {} is not found.".format(param_mask_name) + var.set_value(state_dict[param_mask_name]) + asp_info.update_masks(param_name, var.numpy()) + return self._optimizer.set_state_dict(state_dict) diff --git a/python/paddle/fluid/contrib/sparsity/utils.py b/python/paddle/fluid/contrib/sparsity/utils.py index 8b8c043bc4bad..a28f7fc2b4ed6 100644 --- a/python/paddle/fluid/contrib/sparsity/utils.py +++ b/python/paddle/fluid/contrib/sparsity/utils.py @@ -94,13 +94,12 @@ def calculate_density(x): float: The density of :attr:`x`. Examples: .. code-block:: python - + import paddle import numpy as np - import paddle.static.sparsity as sparsity x = np.array([[0, 1, 3, 0], [1, 1, 0, 1]]) - sparsity.calculate_density(x) # 0.625 + paddle.incubate.asp.calculate_density(x) # 0.625 """ x_flattened = x.flatten() return float(np.nonzero(x_flattened)[0].size) / x_flattened.size diff --git a/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt b/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt index b6b313465ab20..76856d88e1789 100644 --- a/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt @@ -1,8 +1,8 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") -list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp") -list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp_amp") +list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp_static") +list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp_dynamic") list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp_sharding") foreach(TEST_OP ${TEST_OPS}) @@ -10,9 +10,9 @@ foreach(TEST_OP ${TEST_OPS}) endforeach(TEST_OP) if(WITH_DISTRIBUTE) - py_test_modules(test_fleet_with_asp MODULES test_fleet_with_asp ENVS ${dist_ENVS}) if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) - py_test_modules(test_fleet_with_asp_amp MODULES test_fleet_with_asp_amp ENVS ${dist_ENVS}) + py_test_modules(test_fleet_with_asp_dynamic MODULES test_fleet_with_asp_dynamic ENVS ${dist_ENVS}) + py_test_modules(test_fleet_with_asp_static MODULES test_fleet_with_asp_static ENVS ${dist_ENVS}) endif() endif() @@ -21,3 +21,8 @@ if((WITH_DISTRIBUTE) AND (NOT WIN32) AND (NOT APPLE)) py_test_modules(test_fleet_with_asp_sharding MODULES test_fleet_with_asp_sharding ENVS ${dist_ENVS}) endif() endif() + +set_tests_properties(test_asp_pruning_dynamic PROPERTIES TIMEOUT 30) +set_tests_properties(test_asp_pruning_static PROPERTIES TIMEOUT 30) +set_tests_properties(test_asp_optimize_dynamic PROPERTIES TIMEOUT 30) +set_tests_properties(test_asp_optimize_static PROPERTIES TIMEOUT 30) diff --git a/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py 
b/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py index d41a7b2b842e8..e594bc5c34eb3 100644 --- a/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py +++ b/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py @@ -20,7 +20,6 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.static import sparsity from paddle.fluid.contrib.sparsity.asp import ASPHelper import numpy as np @@ -60,7 +59,7 @@ def run_training_pruning_test(self, get_mask_gen_func, get_mask_check_func): loss = fluid.layers.mean( fluid.layers.cross_entropy( input=self.predict, label=self.label)) - optimizer = sparsity.decorate( + optimizer = paddle.incubate.asp.decorate( fluid.optimizer.SGD(learning_rate=0.01)) optimizer.minimize(loss, self.startup_program) @@ -75,7 +74,7 @@ def run_training_pruning_test(self, get_mask_gen_func, get_mask_check_func): def __pruning_and_checking(self, exe, place, mask_func_name, check_func_name, with_mask): exe.run(self.startup_program) - sparsity.prune_model( + paddle.incubate.asp.prune_model( self.main_program, mask_algo=mask_func_name, with_mask=with_mask) for param in self.main_program.global_block().all_parameters(): if ASPHelper._is_supported_layer(self.main_program, param.name): diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py index a2b499a9e01c3..dca56076dbceb 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py @@ -66,6 +66,97 @@ def test_add_supported_layer_via_name(self): my_own_layer_name in supported_layers_and_prune_func_map) +class TestASPDynamicCustomerizedPruneFunc(unittest.TestCase): + def setUp(self): + paddle.disable_static() + + class CustomerLayer(paddle.nn.Layer): + def __init__(self): + super(CustomerLayer, self).__init__() + + self.weight = self.create_parameter( + shape=[32, 32], attr=None, dtype='float32', is_bias=False) + self.linear1 = paddle.nn.Linear(32, 32) + self.linear2 = paddle.nn.Linear(32, 10) + + def forward(self, input_): + hidden = paddle.nn.functional.linear( + x=input_, weight=self.weight) + hidden = self.linear1(hidden) + out = self.linear2(hidden) + return out + + sparsity.add_supported_layer(CustomerLayer, my_own_pruning) + + self.layer = CustomerLayer() + self.customer_prefix = paddle.fluid.dygraph.layers._convert_camel_to_snake( + CustomerLayer.__name__) + self.supported_layer_count_ref = 3 + + def test_inference_pruning(self): + + sparsity.prune_model(self.layer, mask_algo="mask_1d", with_mask=False) + + supported_layer_count = 0 + for param in self.layer.parameters(): + mat = param.numpy() + + if sparsity.asp.ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + supported_layer_count += 1 + if (self.customer_prefix in param.name): + self.assertLessEqual( + np.sum(mat.flatten() - static_tensor.flatten()), 1e-4) + else: + self.assertTrue( + sparsity.check_sparsity( + mat.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertEqual(supported_layer_count, self.supported_layer_count_ref) + + def test_training_pruning(self): + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=self.layer.parameters()) + optimizer = sparsity.decorate(optimizer) + + sparsity.prune_model(self.layer, mask_algo="mask_1d", with_mask=True) + + supported_layer_count = 0 + for param in self.layer.parameters(): + mat = param.numpy() + + if 
sparsity.asp.ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + + mat_mask = sparsity.asp.ASPHelper._get_program_asp_info( + paddle.static.default_main_program()).mask_vars[ + param.name].numpy() + + supported_layer_count += 1 + if (self.customer_prefix in param.name): + self.assertLessEqual( + np.sum(mat.flatten() - static_tensor.flatten()), 1e-4) + self.assertLessEqual( + np.sum(mat_mask.flatten() - static_tensor_mask.flatten( + )), 1e-4) + else: + self.assertTrue( + sparsity.check_sparsity( + mat.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertTrue( + sparsity.check_sparsity( + mat_mask.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertEqual(supported_layer_count, self.supported_layer_count_ref) + + class TestASPStaticCustomerizedPruneFunc(unittest.TestCase): def setUp(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_dynamic.py b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_dynamic.py new file mode 100644 index 0000000000000..e127dca225116 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_dynamic.py @@ -0,0 +1,175 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
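The customized-pruning tests above and the optimizer tests that follow both assert the pruned weights with `check_sparsity(mat.T, ..., n=2, m=4)`. As a rough standalone sketch of what a 1-D 2:4 check boils down to — assuming the pattern means at most two non-zero values in every aligned group of four consecutive elements along a row — the helper below is illustrative only and is not the `sparsity.check_sparsity` the tests call:

# Illustrative sketch only: a minimal 1-D check of the 2:4 pattern asserted in
# the tests (at most two non-zeros in every aligned group of four along a row).
# Not the sparsity.check_sparsity() helper used by the unit tests.
import numpy as np

def check_2_to_4(mat, n=2, m=4):
    rows = mat.reshape(mat.shape[0], -1)
    for row in rows:
        # pad so the row length is a multiple of m, then inspect each group
        pad = (-len(row)) % m
        groups = np.pad(row, (0, pad)).reshape(-1, m)
        if np.any(np.count_nonzero(groups, axis=1) > n):
            return False
    return True

dense = np.arange(1.0, 17.0).reshape(4, 4)
pruned = dense.copy()
pruned[:, :2] = 0.0           # keep only two non-zeros per group of four
print(check_2_to_4(dense))    # False
print(check_2_to_4(pruned))   # True
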
+ +from __future__ import print_function + +import unittest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.contrib.sparsity.asp import ASPHelper +import numpy as np + + +class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=2, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(1352, 32) + self.linear2 = paddle.nn.Linear(32, 32) + self.linear3 = paddle.nn.Linear(32, 10) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + hidden = self.linear1(hidden) + hidden = self.linear2(hidden) + prediction = self.linear3(hidden) + return prediction + + +class TestASPDynamicOptimize(unittest.TestCase): + def setUp(self): + + self.layer = MyLayer() + + self.place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + + self.optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=self.layer.parameters()) + + def test_is_supported_layers(self): + program = paddle.static.default_main_program() + + names = [ + 'embedding_0.w_0', 'fack_layer_0.w_0', 'conv2d_0.w_0', + 'conv2d_0.b_0', 'conv2d_1.w_0', 'conv2d_1.b_0', 'fc_0.w_0', + 'fc_0.b_0', 'fc_1.w_0', 'fc_1.b_0', 'linear_2.w_0', 'linear_2.b_0' + ] + ref = [ + False, False, True, False, True, False, True, False, True, False, + True, False + ] + for i, name in enumerate(names): + self.assertTrue( + ref[i] == ASPHelper._is_supported_layer(program, name)) + + paddle.incubate.asp.set_excluded_layers(['fc_1', 'conv2d_0']) + ref = [ + False, False, False, False, True, False, True, False, False, False, + True, False + ] + for i, name in enumerate(names): + self.assertTrue( + ref[i] == ASPHelper._is_supported_layer(program, name)) + + paddle.incubate.asp.reset_excluded_layers() + ref = [ + False, False, True, False, True, False, True, False, True, False, + True, False + ] + for i, name in enumerate(names): + self.assertTrue( + ref[i] == ASPHelper._is_supported_layer(program, name)) + + def test_decorate(self): + param_names = [param.name for param in self.layer.parameters()] + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) + + program = paddle.static.default_main_program() + + for name in param_names: + mask_var = ASPHelper._get_program_asp_info(program).mask_vars.get( + name, None) + if ASPHelper._is_supported_layer(program, name): + self.assertTrue(mask_var is not None) + else: + self.assertTrue(mask_var is None) + + def test_asp_training(self): + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) + + paddle.incubate.asp.prune_model(self.layer) + + imgs = paddle.to_tensor( + np.random.randn(32, 3, 24, 24), + dtype='float32', + place=self.place, + stop_gradient=False) + labels = paddle.to_tensor( + np.random.randint( + 10, size=(32, 1)), + dtype='float32', + place=self.place, + stop_gradient=False) + + loss_fn = paddle.nn.MSELoss(reduction='mean') + + output = self.layer(imgs) + loss = loss_fn(output, labels) + loss.backward() + self.optimizer.step() + self.optimizer.clear_grad() + + for param in self.layer.parameters(): + if ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + mat = param.numpy() + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + def test_asp_training_with_amp(self): + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) + + paddle.incubate.asp.prune_model(self.layer) + + imgs = 
paddle.to_tensor( + np.random.randn(32, 3, 24, 24), + dtype='float32', + place=self.place, + stop_gradient=False) + labels = paddle.to_tensor( + np.random.randint( + 10, size=(32, 1)), + dtype='float32', + place=self.place, + stop_gradient=False) + + loss_fn = paddle.nn.MSELoss(reduction='mean') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + with paddle.amp.auto_cast(enable=True): + output = self.layer(imgs) + loss = loss_fn(output, labels) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(self.optimizer, scaled) + self.optimizer.clear_grad() + + for param in self.layer.parameters(): + if ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + mat = param.numpy() + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize.py b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py similarity index 89% rename from python/paddle/fluid/tests/unittests/asp/test_asp_optimize.py rename to python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py index 9e5e3c924f1a5..b51e28cdcb9fc 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize.py +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py @@ -1,5 +1,5 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -20,21 +20,20 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.static import sparsity from paddle.fluid.contrib.sparsity.asp import ASPHelper import numpy as np paddle.enable_static() -class TestASPHelper(unittest.TestCase): +class TestASPStaticOptimize(unittest.TestCase): def setUp(self): self.main_program = fluid.Program() self.startup_program = fluid.Program() def build_model(): img = fluid.data( - name='img', shape=[None, 3, 32, 32], dtype='float32') + name='img', shape=[None, 3, 24, 24], dtype='float32') label = fluid.data(name='label', shape=[None, 1], dtype='int64') hidden = fluid.layers.conv2d( input=img, num_filters=4, filter_size=3, padding=2, act="relu") @@ -87,7 +86,7 @@ def test_is_supported_layers(self): self.assertTrue( ref[i] == ASPHelper._is_supported_layer(program, name)) - sparsity.set_excluded_layers(program, ['fc_1', 'conv2d_0']) + paddle.incubate.asp.set_excluded_layers(['fc_1', 'conv2d_0'], program) ref = [ False, False, False, False, True, False, True, False, False, False, True, False @@ -96,7 +95,7 @@ def test_is_supported_layers(self): self.assertTrue( ref[i] == ASPHelper._is_supported_layer(program, name)) - sparsity.reset_excluded_layers(program) + paddle.incubate.asp.reset_excluded_layers(program) ref = [ False, False, True, False, True, False, True, False, True, False, True, False @@ -109,7 +108,7 @@ def test_decorate(self): param_names = self.__get_param_names(self.main_program.global_block() .all_parameters()) with fluid.program_guard(self.main_program, self.startup_program): - self.optimizer = sparsity.decorate(self.optimizer) + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) self.optimizer.minimize(self.loss, self.startup_program) 
param_names_after_minimize = self.__get_param_names( self.main_program.global_block().all_parameters()) @@ -119,7 +118,7 @@ def test_decorate(self): def test_asp_training(self): with fluid.program_guard(self.main_program, self.startup_program): - self.optimizer = sparsity.decorate(self.optimizer) + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) self.optimizer.minimize(self.loss, self.startup_program) place = paddle.CPUPlace() @@ -129,10 +128,10 @@ def test_asp_training(self): feeder = fluid.DataFeeder(feed_list=[self.img, self.label], place=place) exe.run(self.startup_program) - sparsity.prune_model(self.main_program) + paddle.incubate.asp.prune_model(self.main_program) - data = (np.random.randn(64, 3, 32, 32), np.random.randint( - 10, size=(64, 1))) + data = (np.random.randn(32, 3, 24, 24), np.random.randint( + 10, size=(32, 1))) exe.run(self.main_program, feed=feeder.feed([data])) for param in self.main_program.global_block().all_parameters(): @@ -149,7 +148,7 @@ def test_asp_training_with_amp(self): with fluid.program_guard(self.main_program, self.startup_program): self.optimizer = fluid.contrib.mixed_precision.decorator.decorate( self.optimizer) - self.optimizer = sparsity.decorate(self.optimizer) + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) self.optimizer.minimize(self.loss, self.startup_program) exe = fluid.Executor(place) @@ -157,10 +156,10 @@ def test_asp_training_with_amp(self): feed_list=[self.img, self.label], place=place) exe.run(self.startup_program) - sparsity.prune_model(self.main_program) + paddle.incubate.asp.prune_model(self.main_program) - data = (np.random.randn(64, 3, 32, 32), np.random.randint( - 10, size=(64, 1))) + data = (np.random.randn(32, 3, 24, 24), np.random.randint( + 10, size=(32, 1))) exe.run(self.main_program, feed=feeder.feed([data])) for param in self.main_program.global_block().all_parameters(): diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_best.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_best.py deleted file mode 100644 index e99509187038c..0000000000000 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_best.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import paddle -import unittest -from paddle.static import sparsity -from paddle.fluid.tests.unittests.asp.asp_pruning_base import TestASPHelperPruningBase - -paddle.enable_static() - - -class TestASPHelperPruning2DBest(TestASPHelperPruningBase): - def test_2D_best_inference_pruning(self): - self.run_inference_pruning_test( - 'mask_2d_best', paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D) - - def test_2D_best_training_pruning(self): - self.run_training_pruning_test( - 'mask_2d_best', paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_greedy.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_greedy.py deleted file mode 100644 index 7ad6c3ae02275..0000000000000 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_greedy.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import unittest -import paddle -from paddle.static import sparsity -from paddle.fluid.tests.unittests.asp.asp_pruning_base import TestASPHelperPruningBase - -paddle.enable_static() - - -class TestASPHelperPruning2DGreedy(TestASPHelperPruningBase): - def test_2D_greedy_inference_pruning(self): - self.run_inference_pruning_test( - 'mask_2d_greedy', - paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D) - - def test_2D_greedy_training_pruning(self): - self.run_training_pruning_test( - 'mask_2d_greedy', - paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_dynamic.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_dynamic.py new file mode 100644 index 0000000000000..b0fad0b64002a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_dynamic.py @@ -0,0 +1,107 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
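The new dynamic pruning tests below exercise both inference-only pruning (`with_mask=False`) and training pruning (`with_mask=True`, which requires `decorate`). A minimal numpy sketch of the difference, assuming the usual ASP recipe of pruning once and then re-applying the stored mask after every optimizer step so pruned positions stay zero — illustrative only, not the ASP helper code:

# Illustrative sketch only: why training-time pruning keeps a mask around.
# After the one-off prune, every update is followed by an element-wise multiply
# with the stored mask; inference-only pruning just zeroes the weights once.
import numpy as np

rng = np.random.default_rng(0)
w = rng.standard_normal((4, 8)).astype(np.float32)

# one-off 2:4-style prune: zero the two smallest-magnitude entries per group of four
groups = np.abs(w).reshape(-1, 4)
mask = np.ones_like(groups)
idx = np.argsort(groups, axis=1)[:, :2]
np.put_along_axis(mask, idx, 0.0, axis=1)
mask = mask.reshape(w.shape)
w *= mask

lr = 0.1
for _ in range(3):
    grad = rng.standard_normal(w.shape).astype(np.float32)
    w -= lr * grad          # a dense update would repopulate the zeroed positions...
    w *= mask               # ...so the mask is re-applied after every step
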
+ +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +from paddle.fluid import core +from paddle.fluid.contrib.sparsity.asp import ASPHelper + + +class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=2, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(1352, 32) + self.linear2 = paddle.nn.Linear(32, 10) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + hidden = self.linear1(hidden) + prediction = self.linear2(hidden) + return prediction + + +class TestASPDynamicPruningBase(unittest.TestCase): + def setUp(self): + self.layer = MyLayer() + + place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + + self.img = paddle.to_tensor( + np.random.uniform( + low=-0.5, high=0.5, size=(32, 3, 24, 24)), + dtype=np.float32, + place=place, + stop_gradient=False) + + self.set_config() + + def set_config(self): + self.mask_gen_func = 'mask_1d' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D + + def test_inference_pruning(self): + self.__pruning_and_checking(False) + + def test_training_pruning(self): + + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=self.layer.parameters()) + optimizer = paddle.incubate.asp.decorate(optimizer) + + self.__pruning_and_checking(True) + + def __pruning_and_checking(self, with_mask): + + paddle.incubate.asp.prune_model( + self.layer, mask_algo=self.mask_gen_func, with_mask=with_mask) + + for param in self.layer.parameters(): + if ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + mat = param.numpy() + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, func_name=self.mask_check_func, n=2, m=4)) + + +class TestASPDynamicPruning1D(TestASPDynamicPruningBase): + def set_config(self): + self.mask_gen_func = 'mask_1d' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D + + +class TestASPDynamicPruning2DBest(TestASPDynamicPruningBase): + def set_config(self): + self.mask_gen_func = 'mask_2d_best' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D + + +class TestASPDynamicPruning2DGreedy(TestASPDynamicPruningBase): + def set_config(self): + self.mask_gen_func = 'mask_2d_greedy' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py new file mode 100644 index 0000000000000..a9986f24b0265 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py @@ -0,0 +1,111 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import threading, time +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.contrib.sparsity.asp import ASPHelper +import numpy as np + +paddle.enable_static() + + +class TestASPStaticPruningBase(unittest.TestCase): + def setUp(self): + self.main_program = fluid.Program() + self.startup_program = fluid.Program() + + def build_model(): + img = fluid.data( + name='img', shape=[None, 3, 24, 24], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + hidden = fluid.layers.conv2d( + input=img, num_filters=2, filter_size=3, padding=2, act="relu") + hidden = fluid.layers.fc(input=hidden, size=32, act='softmax') + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + return img, label, prediction + + with fluid.program_guard(self.main_program, self.startup_program): + self.img, self.label, self.predict = build_model() + + self.set_config() + + def set_config(self): + self.mask_gen_func = 'mask_1d' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D + + def test_inference_pruning(self): + place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = fluid.Executor(place) + + self.__pruning_and_checking(exe, place, False) + + def test_training_pruning(self): + with fluid.program_guard(self.main_program, self.startup_program): + loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=self.predict, label=self.label)) + optimizer = paddle.incubate.asp.decorate( + fluid.optimizer.SGD(learning_rate=0.01)) + optimizer.minimize(loss, self.startup_program) + + place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = fluid.Executor(place) + + self.__pruning_and_checking(exe, place, True) + + def __pruning_and_checking(self, exe, place, with_mask): + exe.run(self.startup_program) + paddle.incubate.asp.prune_model( + self.main_program, + mask_algo=self.mask_gen_func, + with_mask=with_mask) + for param in self.main_program.global_block().all_parameters(): + if ASPHelper._is_supported_layer(self.main_program, param.name): + mat = np.array(fluid.global_scope().find_var(param.name) + .get_tensor()) + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, func_name=self.mask_check_func, n=2, m=4)) + + +class TestASPStaticPruning1D(TestASPStaticPruningBase): + def set_config(self): + self.mask_gen_func = 'mask_1d' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D + + +class TestASPStaticPruning2DBest(TestASPStaticPruningBase): + def set_config(self): + self.mask_gen_func = 'mask_2d_best' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D + + +class TestASPStaticPruning2DGreedy(TestASPStaticPruningBase): + def set_config(self): + self.mask_gen_func = 'mask_2d_greedy' + self.mask_check_func = paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py b/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py new file mode 100644 index 0000000000000..653cbbf84091b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py @@ -0,0 +1,175 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.contrib.sparsity.asp import ASPHelper +import numpy as np + + +class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.conv1 = paddle.nn.Conv2D( + in_channels=3, out_channels=4, kernel_size=3, padding=2) + self.linear1 = paddle.nn.Linear(4624, 32) + self.linear2 = paddle.nn.Linear(32, 32) + self.linear3 = paddle.nn.Linear(32, 10) + + def forward(self, img): + hidden = self.conv1(img) + hidden = paddle.flatten(hidden, start_axis=1) + hidden = self.linear1(hidden) + hidden = self.linear2(hidden) + prediction = self.linear3(hidden) + return prediction + + +class TestASPDynamicOptimize(unittest.TestCase): + def setUp(self): + paddle.disable_static() + + self.layer = MyLayer() + + self.place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + + self.optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=self.layer.parameters()) + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) + paddle.incubate.asp.prune_model(self.layer) + + def test_save_and_load(self): + path = "/tmp/paddle_asp_save_dy/" + net_path = path + "asp_net.pdparams" + opt_path = path + "asp_opt.pdopt" + + paddle.save(self.layer.state_dict(), net_path) + paddle.save(self.optimizer.state_dict(), opt_path) + + asp_info = ASPHelper._get_program_asp_info( + paddle.static.default_main_program()) + for param_name in asp_info.mask_vars: + mask = asp_info.mask_vars[param_name] + asp_info.update_mask_vars( + param_name, paddle.ones( + shape=mask.shape, dtype=mask.dtype)) + asp_info.update_masks(param_name, np.ones(shape=mask.shape)) + + net_state_dict = paddle.load(net_path) + opt_state_dict = paddle.load(opt_path) + + self.layer.set_state_dict(net_state_dict) + self.optimizer.set_state_dict(opt_state_dict) + + imgs = paddle.to_tensor( + np.random.randn(64, 3, 32, 32), + dtype='float32', + place=self.place, + stop_gradient=False) + labels = paddle.to_tensor( + np.random.randint( + 10, size=(64, 1)), + dtype='float32', + place=self.place, + stop_gradient=False) + + loss_fn = paddle.nn.MSELoss(reduction='mean') + + output = self.layer(imgs) + loss = loss_fn(output, labels) + loss.backward() + self.optimizer.step() + self.optimizer.clear_grad() + + for param in self.layer.parameters(): + if ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + mat = param.numpy() + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + +class TestASPStaticOptimize(unittest.TestCase): + def setUp(self): + paddle.enable_static() + + self.main_program = fluid.Program() + self.startup_program = fluid.Program() + + def build_model(): + img = fluid.data( + name='img', shape=[None, 3, 32, 32], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + hidden = 
fluid.layers.conv2d( + input=img, num_filters=4, filter_size=3, padding=2, act="relu") + hidden = fluid.layers.fc(input=hidden, size=32, act='relu') + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + return img, label, prediction + + with fluid.program_guard(self.main_program, self.startup_program): + self.img, self.label, predict = build_model() + self.loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=predict, label=self.label)) + self.optimizer = fluid.optimizer.SGD(learning_rate=0.01) + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) + self.optimizer.minimize(self.loss, self.startup_program) + + self.place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + self.exe = fluid.Executor(self.place) + self.exe.run(self.startup_program) + + paddle.incubate.asp.prune_model(self.main_program) + + def test_save_and_load(self): + path = "/tmp/paddle_asp_save_st/" + param_path = path + "asp.pdparams" + model_path = path + "asp.pdmodel" + + paddle.save(self.main_program.state_dict(), param_path) + paddle.save(self.main_program, model_path) + + prog = paddle.load(model_path) + + state_dict = paddle.load(param_path) + prog.set_state_dict(state_dict) + + feeder = fluid.DataFeeder( + feed_list=[self.img, self.label], place=self.place) + + data = (np.random.randn(64, 3, 32, 32), np.random.randint( + 10, size=(64, 1))) + self.exe.run(prog, feed=feeder.feed([data])) + + for param in prog.global_block().all_parameters(): + if ASPHelper._is_supported_layer(prog, param.name): + mat = np.array(fluid.global_scope().find_var(param.name) + .get_tensor()) + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py b/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py index 4aac878763b6f..67ec54367d382 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py @@ -18,7 +18,6 @@ import unittest import threading, time import paddle -from paddle.static import sparsity import numpy as np @@ -41,9 +40,9 @@ def test_density(self): x = np.array([[1.0, 1.0, 1.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0], [1.0, 0.0, 0.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 0.0, 0.0, 1.0]]) - self.assertEqual(sparsity.calculate_density(x), 0.56) + self.assertEqual(paddle.incubate.asp.calculate_density(x), 0.56) x[:, 0] = 0.0 - self.assertEqual(sparsity.calculate_density(x), 0.4) + self.assertEqual(paddle.incubate.asp.calculate_density(x), 0.4) def test_check_mask_1d(self): x = np.array([[1.0, 0.0, 0.0, 1.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0], @@ -219,3 +218,7 @@ def __test_1D_2D_sparse_mask_generation_methods(self, x): func_name=paddle.fluid.contrib.sparsity.CheckMethod.CHECK_2D, n=2, m=4)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp.py deleted file mode 100644 index 074aedb947613..0000000000000 --- a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
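# Note: this monolithic fleet + ASP test is removed in favor of the dynamic-graph variant
# added below (test_fleet_with_asp_dynamic.py) and the static variant obtained by renaming
# test_fleet_with_asp_amp.py to test_fleet_with_asp_static.py later in this patch.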
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.distributed.fleet as fleet -import paddle.distributed.fleet.base.role_maker as role_maker -import unittest -import paddle -import paddle.fluid as fluid -import paddle.fluid.core as core -import os -from paddle.static import sparsity -from paddle.fluid.contrib.sparsity.asp import ASPHelper -import numpy as np -cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES') -if cuda_visible_devices is None or cuda_visible_devices == "": - os.environ['CUDA_VISIBLE_DEVICES'] = '0' -else: - os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices.split(',')[0] - -paddle.enable_static() - - -class TestFleetWithASP(unittest.TestCase): - def setUp(self): - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213" - os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" - os.environ["PADDLE_TRAINERS_NUM"] = "1" - os.environ["PADDLE_TRAINER_ID"] = "0" - - def net(self, main_prog, startup_prog): - with fluid.program_guard(main_prog, startup_prog): - input_x = paddle.static.data( - name="x", shape=[-1, 32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') - - fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh') - prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') - cost = fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.mean(x=cost) - - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.asp = True - return avg_cost, strategy, input_x, input_y - - def test_with_asp(self): - fleet.init(is_collective=True) - train_prog, startup_prog = fluid.Program(), fluid.Program() - avg_cost, strategy, input_x, input_y = self.net(train_prog, - startup_prog) - - with fluid.program_guard(train_prog, startup_prog): - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer( - optimizer, strategy=strategy) - optimizer.minimize(avg_cost) - - place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda( - ) else fluid.CPUPlace() - - exe = fluid.Executor(place) - feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place) - exe.run(startup_prog) - - sparsity.prune_model(train_prog) - - data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1))) - exe.run(train_prog, feed=feeder.feed([data])) - - for param in train_prog.global_block().all_parameters(): - if ASPHelper._is_supported_layer(train_prog, param.name): - mat = np.array(fluid.global_scope().find_var(param.name) - .get_tensor()) - self.assertTrue( - paddle.fluid.contrib.sparsity.check_sparsity( - mat.T, n=2, m=4)) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_dynamic.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_dynamic.py new file mode 100644 index 0000000000000..3ced15bf15881 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_dynamic.py @@ -0,0 +1,156 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker +import unittest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import os +from paddle.fluid.contrib.sparsity.asp import ASPHelper +import numpy as np +cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES') +if cuda_visible_devices is None or cuda_visible_devices == "": + os.environ['CUDA_VISIBLE_DEVICES'] = '0' +else: + os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices.split(',')[0] + + +class MyLayer(paddle.nn.Layer): + def __init__(self): + super(MyLayer, self).__init__() + self.linear1 = paddle.nn.Linear(32, 32) + self.linear2 = paddle.nn.Linear(32, 10) + + def forward(self, x): + hidden = self.linear1(x) + prediction = self.linear2(hidden) + return prediction + + +class TestFleetWithASPDynamic(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_TRAINERS_NUM"] = "1" + os.environ["PADDLE_TRAINER_ID"] = "0" + + self.layer = MyLayer() + + self.place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + + self.optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=self.layer.parameters()) + + def test_with_asp(self): + fleet.init(is_collective=True) + + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) + paddle.incubate.asp.prune_model(self.layer) + + self.optimizer = fleet.distributed_optimizer(self.optimizer) + self.layer = fleet.distributed_model(self.layer) + + imgs = paddle.to_tensor( + np.random.randn(64, 32), + dtype='float32', + place=self.place, + stop_gradient=False) + labels = paddle.to_tensor( + np.random.randint( + 10, size=(64, 1)), + dtype='float32', + place=self.place, + stop_gradient=False) + + loss_fn = paddle.nn.MSELoss(reduction='mean') + + output = self.layer(imgs) + loss = loss_fn(output, labels) + loss.backward() + self.optimizer.step() + self.optimizer.clear_grad() + + for param in self.layer.parameters(): + if ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + mat = param.numpy() + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + +class TestFleetWithASPAMPDynamic(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_TRAINERS_NUM"] = "1" + os.environ["PADDLE_TRAINER_ID"] = "0" + + self.layer = MyLayer() + + self.place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + + self.optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=self.layer.parameters()) + + def test_with_asp(self): + fleet.init(is_collective=True) + + self.optimizer = paddle.incubate.asp.decorate(self.optimizer) + paddle.incubate.asp.prune_model(self.layer) + + 
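        # Ordering note (inferred from this test): ASP decorate/prune are applied to the
        # bare optimizer and layer first, and only afterwards wrapped by fleet's
        # distributed optimizer/model below.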
self.optimizer = fleet.distributed_optimizer(self.optimizer) + self.layer = fleet.distributed_model(self.layer) + + imgs = paddle.to_tensor( + np.random.randn(64, 32), + dtype='float32', + place=self.place, + stop_gradient=False) + labels = paddle.to_tensor( + np.random.randint( + 10, size=(64, 1)), + dtype='float32', + place=self.place, + stop_gradient=False) + + loss_fn = paddle.nn.MSELoss(reduction='mean') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + with paddle.amp.auto_cast(enable=True): + output = self.layer(imgs) + loss = loss_fn(output, labels) + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(self.optimizer, scaled) + self.optimizer.clear_grad() + + for param in self.layer.parameters(): + if ASPHelper._is_supported_layer( + paddle.static.default_main_program(), param.name): + mat = param.numpy() + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_amp.py b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_static.py similarity index 67% rename from python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_amp.py rename to python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_static.py index a34d7e69872e2..2023c0051401f 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_amp.py +++ b/python/paddle/fluid/tests/unittests/asp/test_fleet_with_asp_static.py @@ -1,5 +1,5 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -32,7 +32,62 @@ paddle.enable_static() -class TestFleetWithASP(unittest.TestCase): +class TestFleetWithASPStatic(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_TRAINERS_NUM"] = "1" + os.environ["PADDLE_TRAINER_ID"] = "0" + + def net(self, main_prog, startup_prog): + with fluid.program_guard(main_prog, startup_prog): + input_x = paddle.static.data( + name="x", shape=[-1, 32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') + + fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh') + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + avg_cost = paddle.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.asp = True + return avg_cost, strategy, input_x, input_y + + def test_with_asp(self): + fleet.init(is_collective=True) + train_prog, startup_prog = fluid.Program(), fluid.Program() + avg_cost, strategy, input_x, input_y = self.net(train_prog, + startup_prog) + + with fluid.program_guard(train_prog, startup_prog): + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer( + optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place) + exe.run(startup_prog) + + sparsity.prune_model(train_prog) + + data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1))) + exe.run(train_prog, feed=feeder.feed([data])) + + for param in train_prog.global_block().all_parameters(): + if ASPHelper._is_supported_layer(train_prog, param.name): + mat = np.array(fluid.global_scope().find_var(param.name) + .get_tensor()) + self.assertTrue( + paddle.fluid.contrib.sparsity.check_sparsity( + mat.T, n=2, m=4)) + + +class TestFleetWithASPAMPStatic(unittest.TestCase): def setUp(self): os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213" os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index 0ae005430e03b..28e03fdfd70e1 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -1,4 +1,5 @@ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index ff7a167f1a670..c354baf3b43b7 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -32,6 +32,7 @@ import paddle.incubate.autotune from . import nn #noqa: F401 +from . 
import asp #noqa: F401 __all__ = [ 'LookAhead', diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_1d.py b/python/paddle/incubate/asp/__init__.py similarity index 51% rename from python/paddle/fluid/tests/unittests/asp/test_asp_pruning_1d.py rename to python/paddle/incubate/asp/__init__.py index 7a3fa0244930c..59f794ef28aa4 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_1d.py +++ b/python/paddle/incubate/asp/__init__.py @@ -13,25 +13,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function - -import unittest -import paddle -from paddle.static import sparsity -from paddle.fluid.tests.unittests.asp.asp_pruning_base import TestASPHelperPruningBase - -paddle.enable_static() - - -class TestASPHelperPruning1D(TestASPHelperPruningBase): - def test_1D_inference_pruning(self): - self.run_inference_pruning_test( - 'mask_1d', paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D) - - def test_1D_training_pruning(self): - self.run_training_pruning_test( - 'mask_1d', paddle.fluid.contrib.sparsity.CheckMethod.CHECK_1D) - - -if __name__ == '__main__': - unittest.main() +from ...fluid.contrib.sparsity import calculate_density #noqa: F401 +from ...fluid.contrib.sparsity import decorate #noqa: F401 +from ...fluid.contrib.sparsity import prune_model #noqa: F401 +from ...fluid.contrib.sparsity import set_excluded_layers #noqa: F401 +from ...fluid.contrib.sparsity import reset_excluded_layers #noqa: F401 + +__all__ = [ #noqa + 'calculate_density', + 'decorate', + 'prune_model', + 'set_excluded_layers', + 'reset_excluded_layers' +] diff --git a/python/paddle/static/sparsity/__init__.py b/python/paddle/static/sparsity/__init__.py index 59f794ef28aa4..b4543b8d000fc 100644 --- a/python/paddle/static/sparsity/__init__.py +++ b/python/paddle/static/sparsity/__init__.py @@ -16,8 +16,14 @@ from ...fluid.contrib.sparsity import calculate_density #noqa: F401 from ...fluid.contrib.sparsity import decorate #noqa: F401 from ...fluid.contrib.sparsity import prune_model #noqa: F401 -from ...fluid.contrib.sparsity import set_excluded_layers #noqa: F401 from ...fluid.contrib.sparsity import reset_excluded_layers #noqa: F401 +from ...fluid.contrib import sparsity #noqa: F401 + + +def set_excluded_layers(main_program, param_names): + sparsity.set_excluded_layers( + param_names=param_names, main_program=main_program) + __all__ = [ #noqa 'calculate_density', diff --git a/python/setup.py.in b/python/setup.py.in index c1a6e3d3947a9..2a0d745729aab 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -281,6 +281,7 @@ packages=['paddle', 'paddle.incubate.tensor', 'paddle.incubate.multiprocessing', 'paddle.incubate.nn', + 'paddle.incubate.asp', 'paddle.incubate.passes', 'paddle.distribution', 'paddle.distributed.sharding', From 71b046cda4d2c1751cfbc280e3695261f12fe8b4 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Wed, 25 May 2022 10:58:15 +0800 Subject: [PATCH 032/109] [EinsumOp] Optimize the backward speed of EinsumOp (#42663) * change logic for optimize * modifty * optimize the backward speed of EinsumOp * add cache optimizer for einsum op * EinsumOp: fix new dygraph mode error * fix bug * change Cache->InnerCache * fix code * fix * add nan inf utils for einsum op * add as_extra * Compatible with v2.3 EinsumOp * remove dispensable --- paddle/fluid/eager/nan_inf_utils.cc | 6 ++ paddle/fluid/eager/nan_inf_utils.h | 4 ++ paddle/fluid/operators/einsum_op.cc | 8 +++ paddle/phi/infermeta/unary.cc | 3 +- 
paddle/phi/infermeta/unary.h | 3 +- paddle/phi/kernels/cpu/einsum_kernel.cc | 3 +- paddle/phi/kernels/einsum_grad_kernel.h | 1 + paddle/phi/kernels/einsum_kernel.h | 7 +++ paddle/phi/kernels/gpu/einsum_grad_kernel.cu | 9 ++- paddle/phi/kernels/gpu/einsum_kernel.cu | 9 ++- paddle/phi/kernels/impl/einsum_grad_impl.h | 41 ++++++++++-- paddle/phi/kernels/impl/einsum_impl.h | 63 +++++++++++++++---- paddle/phi/ops/compat/einsum_sig.cc | 7 ++- .../fluid/tests/unittests/test_einsum_op.py | 8 ++- .../white_list/no_check_set_white_list.py | 1 + python/paddle/tensor/einsum.py | 14 +++-- python/paddle/utils/code_gen/api.yaml | 2 +- python/paddle/utils/code_gen/api_base.py | 6 +- python/paddle/utils/code_gen/api_gen.py | 4 +- python/paddle/utils/code_gen/backward.yaml | 4 +- 20 files changed, 165 insertions(+), 38 deletions(-) diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index d676955016684..d1c5983a3702f 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -110,4 +110,10 @@ void CheckTensorHasNanOrInf( } } +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfTensorAndVector& tensors) { + CheckTensorHasNanOrInf(api_name, std::get<0>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<1>(tensors)); +} + } // namespace egr diff --git a/paddle/fluid/eager/nan_inf_utils.h b/paddle/fluid/eager/nan_inf_utils.h index 5309eeb2959dc..a411504fa4900 100644 --- a/paddle/fluid/eager/nan_inf_utils.h +++ b/paddle/fluid/eager/nan_inf_utils.h @@ -31,6 +31,7 @@ using TupleOfFourTensors = std::tuple; using TupleOfFiveTensors = std::tuple; using TupleOfSixTensors = std::tuple; +using TupleOfTensorAndVector = std::tuple>; void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor); @@ -52,6 +53,9 @@ void CheckTensorHasNanOrInf(const std::string& api_name, void CheckTensorHasNanOrInf(const std::string& api_name, const std::vector& tensors); +void CheckTensorHasNanOrInf(const std::string& api_name, + const TupleOfTensorAndVector& tensors); + void CheckTensorHasNanOrInf( const std::string& api_name, const paddle::small_vector, diff --git a/paddle/fluid/operators/einsum_op.cc b/paddle/fluid/operators/einsum_op.cc index 8fdde1ccdc058..6da0045443ccc 100644 --- a/paddle/fluid/operators/einsum_op.cc +++ b/paddle/fluid/operators/einsum_op.cc @@ -33,6 +33,13 @@ class EinsumOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Operands", "(TensorList), The input tensor of einsum op.") .AsDuplicable(); AddOutput("Out", "(Tensor), The output tensor of einsum op."); + AddOutput( + "InnerCache", + "(Tensor), The cache of the forward transpose tensors: tA and tB.") + .AsDuplicable() + .AsExtra() + .AsIntermediate(); + AddAttr("equation", "(string) A einsum equation. 
such as `ij,jk->ik`" "There must have `->` and the number of operands in " @@ -72,6 +79,7 @@ class EinsumGradMaker : public framework::SingleGradOpMaker { void Apply(GradOpPtr retv) const override { retv->SetType("einsum_grad"); retv->SetInput("Operands", this->Input("Operands")); + retv->SetInput("InnerCache", this->Output("InnerCache")); retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); retv->SetAttrMap(this->Attrs()); retv->SetOutput(framework::GradVarName("Operands"), diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index c88c2d6f60f10..1ec804d1bf822 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -401,7 +401,8 @@ void EighInferMeta(const MetaTensor& x, void EinsumInferMeta(const std::vector& inputs, const std::string& equation, - MetaTensor* out) { + MetaTensor* out, + std::vector inner_cache) { // collect the following informations to prepare einsum. LabelMap labelshape(0); LabelMap labeltype(LabelType::Reduction); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 58b256dc66ee2..25ea003f58fd9 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -82,7 +82,8 @@ void EighInferMeta(const MetaTensor& x, void EinsumInferMeta(const std::vector& inputs, const std::string& equation, - MetaTensor* out); + MetaTensor* out, + std::vector inner_cache); void ExpandInferMeta(const MetaTensor& x, const IntArray& shape, diff --git a/paddle/phi/kernels/cpu/einsum_kernel.cc b/paddle/phi/kernels/cpu/einsum_kernel.cc index 3e25a65526d89..8968542b3e0b8 100644 --- a/paddle/phi/kernels/cpu/einsum_kernel.cc +++ b/paddle/phi/kernels/cpu/einsum_kernel.cc @@ -17,4 +17,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/einsum_impl.h" -PD_REGISTER_KERNEL(einsum, CPU, ALL_LAYOUT, phi::EinsumKernel, float, double) {} +PD_REGISTER_KERNEL( + einsum, CPU, ALL_LAYOUT, phi::EinsumKernelRaw, float, double) {} diff --git a/paddle/phi/kernels/einsum_grad_kernel.h b/paddle/phi/kernels/einsum_grad_kernel.h index 5c1970e775825..06785c8532e70 100644 --- a/paddle/phi/kernels/einsum_grad_kernel.h +++ b/paddle/phi/kernels/einsum_grad_kernel.h @@ -21,6 +21,7 @@ namespace phi { template void EinsumGradKernel(const Context& dev_ctx, const std::vector& x, + const std::vector& inner_cache, const DenseTensor& out_grad, const std::string& equation, std::vector x_grad); diff --git a/paddle/phi/kernels/einsum_kernel.h b/paddle/phi/kernels/einsum_kernel.h index 3d9e8feda748d..87df2b1c64a4a 100644 --- a/paddle/phi/kernels/einsum_kernel.h +++ b/paddle/phi/kernels/einsum_kernel.h @@ -24,4 +24,11 @@ void EinsumKernel(const Context& dev_ctx, const std::string& equation, DenseTensor* out); +template +void EinsumKernelRaw(const Context& dev_ctx, + const std::vector& inputs, + const std::string& equation, + DenseTensor* out, + std::vector cache); + } // namespace phi diff --git a/paddle/phi/kernels/gpu/einsum_grad_kernel.cu b/paddle/phi/kernels/gpu/einsum_grad_kernel.cu index c8a8745f34522..6ca8dbd9205d8 100644 --- a/paddle/phi/kernels/gpu/einsum_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/einsum_grad_kernel.cu @@ -18,5 +18,10 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/einsum_grad_impl.h" -PD_REGISTER_KERNEL( - einsum_grad, GPU, ALL_LAYOUT, phi::EinsumGradKernel, float, double) {} +PD_REGISTER_KERNEL(einsum_grad, + GPU, + ALL_LAYOUT, + phi::EinsumGradKernel, + float, + double, + phi::dtype::float16) {} diff --git 
a/paddle/phi/kernels/gpu/einsum_kernel.cu b/paddle/phi/kernels/gpu/einsum_kernel.cu index d73e154eb40f7..d1f4c6590387a 100644 --- a/paddle/phi/kernels/gpu/einsum_kernel.cu +++ b/paddle/phi/kernels/gpu/einsum_kernel.cu @@ -18,4 +18,11 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/einsum_impl.h" -PD_REGISTER_KERNEL(einsum, GPU, ALL_LAYOUT, phi::EinsumKernel, float, double) {} +PD_REGISTER_KERNEL(einsum, + GPU, + ALL_LAYOUT, + phi::EinsumKernelRaw, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/impl/einsum_grad_impl.h b/paddle/phi/kernels/impl/einsum_grad_impl.h index 2b087f8dcae09..aceb97a49b1c2 100644 --- a/paddle/phi/kernels/impl/einsum_grad_impl.h +++ b/paddle/phi/kernels/impl/einsum_grad_impl.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include "paddle/fluid/platform/profiler.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/impl/einsum_impl.h" #include "paddle/phi/kernels/tile_kernel.h" @@ -55,7 +56,13 @@ DenseTensor PerformTileAndReduction(const Context& dev_ctx, } t.Resize(make_ddim(resize_dims)); DenseTensor after_tile; - TileKernel(dev_ctx, t, repeat_times, &after_tile); + if (std::all_of(repeat_times.begin(), repeat_times.end(), [](int x) { + return x == 1; + })) { + after_tile = t; + } else { + TileKernel(dev_ctx, t, repeat_times, &after_tile); + } size_t n_ellipsis_idx = op_label.find(".", 0); if (n_ellipsis_idx != std::string::npos) { // may be we need reduce. broadcast_dims is not equal to ellipsis dims. @@ -91,10 +98,11 @@ DenseTensor PerformTileAndReduction(const Context& dev_ctx, template void EinsumGradKernel(const Context& dev_ctx, const std::vector& x, + const std::vector& inner_cache, const DenseTensor& out_grad, const std::string& equation, std::vector x_grad) { - VLOG(5) << "Start EisumGradKernel:"; + VLOG(5) << "Start EinsumGradKernel:"; LabelMap labelshape(0); LabelMap labeltype(LabelType::Reduction); std::vector label2perms(x.size(), LabelMap(-1)); @@ -162,8 +170,33 @@ void EinsumGradKernel(const Context& dev_ctx, operands_for_B.push_back(x[0]); DenseTensor before_tile; - EinsumKernel(dev_ctx, operands_for_A, equation_for_A, &dA); - EinsumKernel(dev_ctx, operands_for_B, equation_for_B, &dB); + + std::vector cache(3); // set empty; TA, TB, TdC + if (inner_cache.size() > + 0) { // for compatibility, we can load and run v2.3 EinsumOp. + cache[0].ShareBufferWith(*(inner_cache[0])); + cache[1].ShareBufferWith(*(inner_cache[1])); + } + + EinsumKernelImpl(dev_ctx, + all_labels, + operands_for_A, + equation_for_A, + &dA, + {&cache[1], &cache[2]}, + false); + + EinsumKernelImpl(dev_ctx, + all_labels, + operands_for_B, + equation_for_B, + &dB, + {&cache[2], &cache[0]}, + false); + + // release the cache tensor dTC to save memory right now. they are useless + // now. 
+ cache.clear(); *(x_grad[0]) = PerformTileAndReduction(dev_ctx, labeltype, labelshape, diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h index 901147734b29f..5e4480426c0cc 100644 --- a/paddle/phi/kernels/impl/einsum_impl.h +++ b/paddle/phi/kernels/impl/einsum_impl.h @@ -137,7 +137,6 @@ inline std::vector TransformLabelsOrder( std::vector tmp; for (int c : all_labels) { if (type[c] == cnt_type) tmp.push_back(c); - std::sort(tmp.begin(), tmp.end()); } ret.insert(ret.end(), tmp.begin(), tmp.end()); } @@ -176,6 +175,15 @@ inline static void GlobalInfo(const std::vector& op_labels, (*label2type)['.'] = LabelType::Batch; + if (sorted_labels->size()) { + std::set exist(all.begin(), all.end()); + all.clear(); + std::for_each( + sorted_labels->begin(), sorted_labels->end(), [&exist, &all](char c) { + if (exist.count(c)) all.push_back(c); + }); + } + *sorted_labels = TransformLabelsOrder(all, *label2type, {LabelType::Batch, @@ -409,7 +417,8 @@ DenseTensor PerformContraction( const LabelMap& label2shape, const std::vector>& ellipsis_dims, const std::vector& broadcast_dims, - std::vector cache) { + std::vector cache, + bool use_cache) { // Get All the Batches, so perm is auto all_valid = LabelMap(1); auto recover_dim = GetShapeByType(all_labels, @@ -447,14 +456,17 @@ DenseTensor PerformContraction( } // reduction DenseTensor trans_t; - if (cache[operand_idx]->IsInitialized()) { + if (use_cache && cache[operand_idx] != nullptr && + cache[operand_idx]->IsInitialized()) { trans_t.ShareBufferWith(*(cache[operand_idx])); + VLOG(5) << "Cache Used!"; } else { auto reduct_t = PerformReduction( dev_ctx, t, perm, all_labels, ellipsis, label2type); trans_t = PerformTranspose( dev_ctx, reduct_t, perm, reordered_all_labels, ellipsis, label2type); - cache[operand_idx]->ShareBufferWith(trans_t); + if (cache[operand_idx] != nullptr) + cache[operand_idx]->ShareBufferWith(trans_t); } auto mul_dims = GetShapeByType(all_labels, label2type, @@ -515,18 +527,23 @@ void TransposeToOutput(const Context& dev_ctx, axis.push_back(it - all_labels.begin() + offset); } } - if (is_no_need_transpose(axis)) return output->ShareBufferWith(to_trans); + if (is_no_need_transpose(axis)) { + output->ShareBufferWith(to_trans); + return; + } VLOG(5) << "call TransposeToOutput: with axis: " << paddle::string::join_strings(axis, ","); - return TransposeKernel(dev_ctx, to_trans, axis, output); + TransposeKernel(dev_ctx, to_trans, axis, output); } template void EinsumKernelImpl(const Context& dev_ctx, + const std::vector& forward_all_labels, const std::vector& inputs, const std::string& equation, DenseTensor* out, - std::vector cache) { + std::vector cache, + bool is_forward = true) { ValidationCheck(equation); // collect the following informations to prepare einsum. 
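  // Note: when this is reached from the backward pass (is_forward == false), the label
  // ordering recorded during the forward run is reused below so that the cached transposed
  // operands (InnerCache) line up with the contraction performed here.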
LabelMap labelshape(0); @@ -542,6 +559,9 @@ void EinsumKernelImpl(const Context& dev_ctx, input_dims.push_back(i->dims()); } std::string right; + if (!is_forward) { + all_labels = forward_all_labels; + } ParseEinsumEquation(equation, input_dims, &labelshape, @@ -557,7 +577,6 @@ void EinsumKernelImpl(const Context& dev_ctx, auto& A = inputs[0]; auto& B = inputs[1]; // Reduction and Contract Procedure - dev_ctx.template Alloc(out); auto after_contraction = PerformContraction(dev_ctx, *A, *B, @@ -567,7 +586,8 @@ void EinsumKernelImpl(const Context& dev_ctx, labelshape, ellipsis_dims, broadcast_dims, - cache); + cache, + !is_forward); TransposeToOutput(dev_ctx, after_contraction, right, @@ -599,18 +619,37 @@ void EinsumKernelImpl(const Context& dev_ctx, } } +template +void EinsumKernelRaw(const Context& dev_ctx, + const std::vector& inputs, + const std::string& equation, + DenseTensor* out, + std::vector cache) { + std::vector tmp; + // for the sake of compatibility, we may load and run v2.3 EinsumOp. Output + // may have nullptr and the cache.size() is not equal to inputs.size(). refer + // to BuildPhiKernelContext for details. + int diff = inputs.size() - cache.size(); + for (int i = 0; i < diff; ++i) { + cache.push_back(nullptr); + } + EinsumKernelImpl( + dev_ctx, tmp, inputs, equation, out, cache, /*forward=*/true); +} + template void EinsumKernel(const Context& dev_ctx, const std::vector& inputs, const std::string& equation, DenseTensor* out) { - std::vector cache(inputs.size()); // set empty; TA, TB, TdC + std::vector place_holder; std::vector cache_tensor( inputs.size()); // set empty; TA, TB, TdC for (size_t i = 0; i < inputs.size(); ++i) { - cache_tensor[i] = &cache[i]; + cache_tensor[i] = nullptr; } - EinsumKernelImpl(dev_ctx, inputs, equation, out, cache_tensor); + EinsumKernelImpl( + dev_ctx, place_holder, inputs, equation, out, cache_tensor, true); } } // namespace phi diff --git a/paddle/phi/ops/compat/einsum_sig.cc b/paddle/phi/ops/compat/einsum_sig.cc index 0b3cc3425df45..5e45bcf97ce0e 100644 --- a/paddle/phi/ops/compat/einsum_sig.cc +++ b/paddle/phi/ops/compat/einsum_sig.cc @@ -17,14 +17,15 @@ limitations under the License. 
*/ namespace phi { KernelSignature EinsumOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("einsum", {"Operands"}, {"equation"}, {"Out"}); + return KernelSignature( + "einsum", {"Operands"}, {"equation"}, {"Out", "InnerCache"}); } KernelSignature EinsumGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("einsum_grad", - {"Operands", {"Out@GRAD"}}, + {"Operands", "InnerCache", "Out@GRAD"}, {"equation"}, - {{"Operands@GRAD"}}); + {"Operands@GRAD"}); } } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_einsum_op.py b/python/paddle/fluid/tests/unittests/test_einsum_op.py index 565e43214ea32..1a4ae54afefe2 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum_op.py +++ b/python/paddle/fluid/tests/unittests/test_einsum_op.py @@ -34,7 +34,11 @@ def setUp(self): self.operands.append(("x" + str(idx), inp)) self.inputs = {"Operands": self.operands} self.attrs = {"equation": self.equation} - self.outputs = {'Out': out} + self.outputs = { + 'Out': out, + "InnerCache": [('cache_' + str(i), np.array([1.0])) + for i in range(len(self.operands))] + } def init_input(self): self.inputs = [] @@ -49,7 +53,7 @@ def set_mandatory(self): def test_check_output(self): if not self.disable: - self.check_output() + self.check_output(no_check_set=["InnerCache"]) def test_grad(self): if not self.disable: diff --git a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py index 23bbc377cae27..ea3264ba0dbb7 100644 --- a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py @@ -35,4 +35,5 @@ 'eigh', 'eigvalsh', 'class_center_sample', + 'einsum', ] diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index 713a611f9f39a..4cdbebb055229 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -798,11 +798,12 @@ def gen_einsum_op(equation, *operands): """ assert len(operands) <= 2, "Only support two operands in EinsumOp." 
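    # In the new dygraph mode the underlying op now returns (Out, InnerCache); InnerCache
    # holds the transposed operands reused by the backward kernel, so only element [0] is
    # returned to the caller. The static-graph branch below creates the matching
    # InnerCache variables explicitly.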
if in_dygraph_mode(): - return _C_ops.final_state_einsum(operands, equation) + return _C_ops.final_state_einsum(operands, equation)[0] if _in_legacy_dygraph(): # dygraph - return _C_ops.einsum(operands, 'equation', equation) + return _C_ops.einsum(operands, len(operands), 'equation', equation)[0] + # static graph for inp in operands: check_variable_and_dtype(inp, 'dtype', ['float32', 'float64'], 'einsum') @@ -811,11 +812,16 @@ def gen_einsum_op(equation, *operands): out = helper.create_variable_for_type_inference(dtype=operands[0].dtype) attrs = dict() attrs['equation'] = equation + caches = [ + helper.create_variable_for_type_inference(dtype=operands[0].dtype) + for i in range(len(operands)) + ] helper.append_op( type='einsum', inputs={'Operands': operands}, - outputs={'Out': out}, - attrs=attrs, ) + outputs={'Out': out, + "InnerCache": caches}, + attrs=attrs) return out diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index f9e0efa59500d..c541891662864 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -585,7 +585,7 @@ - api : einsum args : (Tensor[] x, str equation) - output : Tensor + output : Tensor, Tensor[]{x.size()} infer_meta : func : EinsumInferMeta param : [x, equation] diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index ac9a431593776..146925ccef6d5 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -224,16 +224,18 @@ def parse_output_item(output_item): if len(temp_list) == 1: out_type, out_name, size_expr = parse_output_item(temp_list[0]) - return [out_type], [out_name], size_expr + return [out_type], [out_name], [size_expr] else: out_type_list = [] out_name_list = [] + out_size_expr_list = [] for output_item in temp_list: out_type, out_name, size_expr = parse_output_item(output_item) out_type_list.append(out_type) out_name_list.append(out_name) + out_size_expr_list.append(size_expr) - return out_type_list, out_name_list, size_expr + return out_type_list, out_name_list, out_size_expr_list def parse_infer_meta(self, infer_meta_config): infer_meta = infer_meta_config diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index 4e98985c9b111..c0923adf39c46 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -111,10 +111,10 @@ def gene_output(self, {code_indent} {return_type} api_output{inplace_assign};""" if return_type == 'std::vector': - assert self.outputs['out_size_expr'] is not None, \ + assert self.outputs['out_size_expr'][0] is not None, \ f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." 
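        # out_size_expr is now parsed as a per-output list (see the api_base.py change
        # above), so the single vector-output branch here reads element [0]; this is what
        # allows einsum to declare its Tensor[] InnerCache size as {x.size()} in api.yaml.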
output_create = output_create + f""" -{code_indent} auto kernel_out = {set_out_func}({self.outputs['out_size_expr']}, kernel_backend, &api_output);""" +{code_indent} auto kernel_out = {set_out_func}({self.outputs['out_size_expr'][0]}, kernel_backend, &api_output);""" else: output_create = output_create + f""" diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index eb00e2e615f67..81c211e640735 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -552,8 +552,8 @@ skip_transform : out_w, out_w_grad - backward_api : einsum_grad - forward : einsum (Tensor[] x, str equation) -> Tensor(out) - args : (Tensor[] x, Tensor out_grad, str equation) + forward : einsum (Tensor[] x, str equation) -> Tensor(out), Tensor[](inner_cache) + args : (Tensor[] x, Tensor[] inner_cache, Tensor out_grad, str equation) output : Tensor[](x_grad){x.size()} infer_meta : func : UnchangedMultiInferMeta From cbb241369f21d4002289649d0ea242d429b86c2b Mon Sep 17 00:00:00 2001 From: fwenguang <95677191+fwenguang@users.noreply.github.com> Date: Wed, 25 May 2022 11:11:50 +0800 Subject: [PATCH 033/109] [MLU] adapt coalesce_tensor op for mlu (#42873) --- paddle/fluid/operators/coalesce_tensor_op.cc | 19 +++ .../fluid/platform/device_memory_aligment.cc | 4 +- .../fluid/platform/device_memory_aligment.h | 3 + .../mlu/test_coalesce_tensor_op_mlu.py | 109 ++++++++++++++++++ 4 files changed, 134 insertions(+), 1 deletion(-) create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_coalesce_tensor_op_mlu.py diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 900fd4d8d292e..aa5a38e4dbf08 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -24,6 +24,9 @@ #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #endif #include "paddle/fluid/framework/convert_utils.h" +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#endif namespace paddle { namespace operators { @@ -69,6 +72,13 @@ struct FillConstantVisitor { phi::funcs::SetConstant set_constant; set_constant(dev_ctx_, tensor_, static_cast(value_)); } +#elif defined(PADDLE_WITH_MLU) + if (platform::is_mlu_place(context_.GetPlace())) { + FillMLUTensorWithHostValue(context_, static_cast(value_), tensor_); + } else { + phi::funcs::SetConstant set_constant; + set_constant(dev_ctx_, tensor_, static_cast(value_)); + } #else phi::funcs::SetConstant set_constant; set_constant(dev_ctx_, tensor_, static_cast(value_)); @@ -509,6 +519,15 @@ REGISTER_OP_NPU_KERNEL( ops::CoalesceTensorOpKernel); #endif +#if defined(PADDLE_WITH_MLU) +REGISTER_OP_MLU_KERNEL( + coalesce_tensor, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel); +#endif + REGISTER_OP_VERSION(coalesce_tensor) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/platform/device_memory_aligment.cc b/paddle/fluid/platform/device_memory_aligment.cc index 8261c866d073d..e8a6051c19f2d 100644 --- a/paddle/fluid/platform/device_memory_aligment.cc +++ b/paddle/fluid/platform/device_memory_aligment.cc @@ -31,9 +31,11 @@ size_t Alignment(size_t size, const platform::Place &place, int align_size) { alignment = alignment; #elif defined(PADDLE_WITH_ASCEND_CL) alignment = NPUMinChunkSize(); +#elif defined(PADDLE_WITH_MLU) + alignment = MLUMinChunkSize(); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "Fluid is not compiled with CUDA/XPU/NPU.")); + 
"Fluid is not compiled with CUDA/XPU/NPU/MLU.")); #endif } } diff --git a/paddle/fluid/platform/device_memory_aligment.h b/paddle/fluid/platform/device_memory_aligment.h index a3f88592b7649..ee37b93807eaa 100644 --- a/paddle/fluid/platform/device_memory_aligment.h +++ b/paddle/fluid/platform/device_memory_aligment.h @@ -21,6 +21,9 @@ limitations under the License. */ #include "paddle/fluid/platform/device/npu/npu_info.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif namespace paddle { namespace platform { diff --git a/python/paddle/fluid/tests/unittests/mlu/test_coalesce_tensor_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_coalesce_tensor_op_mlu.py new file mode 100644 index 0000000000000..854ac0b6826cd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_coalesce_tensor_op_mlu.py @@ -0,0 +1,109 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append('..') +from op_test import OpTest +from paddle.fluid import core +import paddle + +alignment = 256 +paddle.enable_static() + + +class TestAllocContinuousSpace(OpTest): + def setUp(self): + self.op_type = "coalesce_tensor" + self.dtype, self.fluid_dtype = self.init_dtype() + attrs = self.init_attr() + self.copy_data = attrs["copy_data"] + self.constant = attrs["constant"] + self.set_constant = attrs["set_constant"] + self.Inputs = self.init_input() + self.Outputs, self.FusedOutput = self.init_output( + self.Inputs, self.set_constant, self.constant) + self.inputs = {'Input': self.Inputs} + self.attrs = attrs + self.outputs = {'Output': self.Outputs, 'FusedOutput': self.FusedOutput} + + def init_dtype(self): + return np.float32, int(core.VarDesc.VarType.FP32) + + def init_input(self): + inputs = [] + inputs.append(("x1", np.random.random([20, 3]).astype(self.dtype))) + inputs.append(("x2", np.random.random([20]).astype(self.dtype))) + inputs.append(("x3", np.random.random([1]).astype(self.dtype))) + inputs.append(("x4", np.random.random([200, 30]).astype(self.dtype))) + inputs.append(("x5", np.random.random([30]).astype(self.dtype))) + inputs.append(("x6", np.random.random([1]).astype(self.dtype))) + return inputs + + def init_attr(self): + return { + "copy_data": True, + "set_constant": False, + "constant": 0.0, + "dtype": self.fluid_dtype + } + + def init_output(self, input_list, set_constant, constant): + inputs = [] + outputs = input_list + + for input in input_list: + length = len(input[1].flatten()) + aligned_len = (length + alignment) / alignment * alignment + out = np.zeros(int(aligned_len)) + out[0:length] = input[1].flatten() + inputs.append(out) + + coalesce_tensor_var = np.concatenate([input for input in inputs]) + if set_constant: + coalesce_tensor_var = np.ones((len(coalesce_tensor_var))) * constant + outputs = [(out[0], + 
np.ones(out[1].shape).astype(self.dtype) * constant) + for out in outputs] + return outputs, coalesce_tensor_var + + def test_check_output(self): + self.check_output_with_place( + place=paddle.device.MLUPlace(0), + no_check_set=["FusedOutput"], + atol=1e-5) + + +class TestAllocContinuousSpace2(TestAllocContinuousSpace): + def init_attr(self): + return { + "copy_data": False, + "set_constant": True, + "constant": 5, + "dtype": self.fluid_dtype, + "user_defined_size_of_dtype": 2 + } + + def test_check_output(self): + self.check_output_with_place( + place=paddle.device.MLUPlace(0), + no_check_set=["FusedOutput"], + atol=1e-5) + + +if __name__ == '__main__': + unittest.main() From b685905474cd8c114b02787da7ebb9237d5b41ee Mon Sep 17 00:00:00 2001 From: Qi Li Date: Wed, 25 May 2022 11:12:33 +0800 Subject: [PATCH 034/109] fix compile error on Loongson CPU, test=develop (#42953) --- cmake/cblas.cmake | 7 ++++--- paddle/fluid/framework/io/shell.cc | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 69e66407580b6..43c2208182a55 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -65,11 +65,13 @@ if(NOT DEFINED CBLAS_PROVIDER) PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH) find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) + find_path(OPENBLAS_CONFIG_INC_DIR NAMES openblas_config.h + PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) find_library(OPENBLAS_LIB NAMES openblas PATHS ${OPENBLAS_LIB_SEARCH_PATHS}) - if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_LIB) - file(READ "${OPENBLAS_INC_DIR}/openblas_config.h" config_file) + if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_CONFIG_INC_DIR AND OPENBLAS_LIB) + file(READ "${OPENBLAS_CONFIG_INC_DIR}/openblas_config.h" config_file) string(REGEX MATCH "OpenBLAS ([0-9]+\.[0-9]+\.[0-9]+)" tmp ${config_file}) string(REGEX MATCH "([0-9]+\.[0-9]+\.[0-9]+)" ver ${tmp}) @@ -138,4 +140,3 @@ if(${CBLAS_PROVIDER} STREQUAL REFERENCE_CBLAS) elseif(NOT ${CBLAS_PROVIDER} STREQUAL MKLML) target_link_libraries(cblas ${CBLAS_LIBRARIES}) endif() - diff --git a/paddle/fluid/framework/io/shell.cc b/paddle/fluid/framework/io/shell.cc index f01894f2cf448..361153de7d73a 100644 --- a/paddle/fluid/framework/io/shell.cc +++ b/paddle/fluid/framework/io/shell.cc @@ -69,7 +69,7 @@ static int close_open_fds_internal() { for (;;) { int bytes = 0; - if ((bytes = syscall(SYS_getdents, dir_fd, + if ((bytes = syscall(SYS_getdents64, dir_fd, reinterpret_cast(buffer), sizeof(buffer))) < 0) { PADDLE_THROW(platform::errors::Unavailable( From 45d7a3ea304090bd5cd0910450c8ffa7aee771a6 Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Wed, 25 May 2022 12:32:07 +0800 Subject: [PATCH 035/109] [GPUPS]fix gpups pscore (#42967) --- paddle/fluid/framework/hogwild_worker.cc | 9 +++- paddle/fluid/framework/multi_trainer.cc | 41 ++++++++++++++----- paddle/fluid/framework/trainer.h | 1 + .../distributed/passes/ps_trainer_pass.py | 13 +++++- python/paddle/distributed/ps/the_one_ps.py | 9 ++-- 5 files changed, 55 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index cb33e87f490c2..a7138fd2642a8 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -219,6 +219,10 @@ void HogwildWorker::TrainFiles() { device_reader_->Start(); int cur_batch; int batch_cnt = 0; + +#if defined(PADDLE_WITH_HETERPS) && 
defined(PADDLE_WITH_CUDA) + platform::SetDeviceId(thread_id_); +#endif while ((cur_batch = device_reader_->Next()) > 0) { for (auto &op : ops_) { bool need_skip = false; @@ -244,9 +248,12 @@ void HogwildWorker::TrainFiles() { ++batch_cnt; PrintFetchVars(); thread_scope_->DropKids(); +#ifdef PADDLE_WITH_HETERPS + dev_ctx_->Wait(); +#endif } timeline.Pause(); - VLOG(3) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec() + VLOG(1) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec() << " seconds, ins_num: " << total_ins_num; if (need_dump_field_ || need_dump_param_) { diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index 7a83fdccc218c..6479f7ae72654 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -148,6 +148,17 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, } } #endif + for (auto& var : main_program.Block(0).AllVars()) { + if (var->Persistable()) { + auto it = std::find(need_merge_var_names_.begin(), + need_merge_var_names_.end(), var->Name()); + if (it == need_merge_var_names_.end() && + var->GetType() != proto::VarType::SELECTED_ROWS) { + VLOG(2) << "train param: " << var->Name(); + trainable_param_.push_back(var->Name()); + } + } + } } void MultiTrainer::InitOtherEnv(const ProgramDesc& main_program) { @@ -192,18 +203,30 @@ void MultiTrainer::Run() { #ifdef PADDLE_WITH_HETERPS void MultiTrainer::MergeDenseParam() { -#ifdef PADDLE_WTIH_PSCORE +#ifdef PADDLE_WITH_PSCORE auto communicator = paddle::distributed::Communicator::GetInstance(); - auto& recv_ctx = communicator->GetRecvCtxMap(); - Scope* thread_scope = workers_[0]->GetThreadScope(); - for (auto& iter : recv_ctx) { - auto& varnames = iter.second; - for (auto& name : varnames) { + auto thread_scope = workers_[0]->GetThreadScope(); + if (communicator == nullptr) { + for (auto& name : trainable_param_) { + VLOG(2) << "merge var " << name << " to root scope"; Variable* root_var = root_scope_->FindVar(name); LoDTensor* root_tensor = root_var->GetMutable(); Variable* var = thread_scope->FindVar(name); LoDTensor* tensor = var->GetMutable(); - TensorCopy((*tensor), root_tensor->place(), root_tensor); + TensorCopySync((*tensor), root_tensor->place(), root_tensor); + } + } else { + auto& recv_ctx = communicator->GetRecvCtxMap(); + for (auto& iter : recv_ctx) { + auto& varnames = iter.second; + for (auto& name : varnames) { + VLOG(2) << "merge var " << name << " to root scope"; + Variable* root_var = root_scope_->FindVar(name); + LoDTensor* root_tensor = root_var->GetMutable(); + Variable* var = thread_scope->FindVar(name); + LoDTensor* tensor = var->GetMutable(); + TensorCopySync((*tensor), root_tensor->place(), root_tensor); + } } } #endif @@ -236,11 +259,7 @@ void MultiTrainer::Finalize() { } LoDTensor* root_tensor = root_var->GetMutable(); -#ifdef PADDLE_WITH_HETERPS - for (size_t j = 0; j < places_.size(); j++) { -#else for (int j = 1; j < thread_num_; j++) { -#endif Scope* cur_thread_scope = workers_[j]->GetThreadScope(); Variable* thread_var = cur_thread_scope->FindVar(need_merge_var_names_[i]); diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index b86b4fec8a571..c78f7611b63be 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -129,6 +129,7 @@ class MultiTrainer : public TrainerBase { std::vector readers_; std::vector> workers_; std::vector need_merge_var_names_; + std::vector trainable_param_; #ifdef 
PADDLE_WITH_HETERPS std::vector places_; #endif diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py index 76e617c7dafcf..0792a1eddc7fd 100755 --- a/python/paddle/distributed/passes/ps_trainer_pass.py +++ b/python/paddle/distributed/passes/ps_trainer_pass.py @@ -614,15 +614,24 @@ def _check_conflict(self, other_pass): return True def _add_push_box_sparse_op(self, program): + insert_index = -1 + for idx, op in list(enumerate(program.global_block().ops)): + if op.type == "lookup_table_grad": + insert_index = idx for op in program.global_block().ops: - if op.type != "pull_box_sparse": + if op.type != "pull_box_sparse" and op.type != "pull_gpups_sparse": continue grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(set()), []) for op_desc in grad_op_desc: - new_op_desc = program.global_block().desc.append_op() + new_op_desc = program.global_block().desc._insert_op( + insert_index + 1) new_op_desc.copy_from(op_desc) new_op_desc._set_attr(op_role_attr_name, backward) + new_op = paddle.fluid.framework.Operator(program.global_block(), + new_op_desc) + program.global_block().ops.insert(insert_index + 1, new_op) + program.global_block()._sync_with_cpp() def _remove_optimizer_var(self, program): embedding_w = {} diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index c6df7559a22e8..888d517116a15 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -1013,12 +1013,13 @@ def sync_strategy_envs(): if self.context['ps_mode'] == DistributedMode.GEO: self._communicator.init_params(init_params) else: - if role_id == 0: - self._init_all_params(scopes, send_ctx, dense_map) + if not self.context['use_ps_gpu']: + if role_id == 0: + self._init_all_params(scopes, send_ctx, dense_map) fleet.util.barrier() - - self._pull_all_dense(scopes, send_ctx, dense_map) + if not self.context['use_ps_gpu']: + self._pull_all_dense(scopes, send_ctx, dense_map) fleet.util.barrier() if self.context['ps_mode'] == DistributedMode.GEO: From f1f79b0d9d18cebcf8b89775d2b066d6fdd04199 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 25 May 2022 12:32:49 +0800 Subject: [PATCH 036/109] fix maybe-uninitialized warning (#42902) * fix maybe-uninitialized warning * fix compile * fix xpu compile * fix npu compile * fix infer compile * fix compile * fix compile --- cmake/flags.cmake | 1 - paddle/fluid/distributed/fleet_executor/dist_model.cc | 6 +++--- .../framework/ir/quant_conv2d_dequant_fuse_pass.cc | 2 +- .../framework/ir/transpose_flatten_concat_fuse_pass.cc | 2 +- .../inference/analysis/ir_passes/lite_subgraph_pass.cc | 2 +- .../fluid/inference/api/analysis_predictor_tester.cc | 2 +- .../inference/tensorrt/convert/multihead_matmul_op.cc | 2 +- paddle/fluid/inference/tensorrt/convert/reduce_op.cc | 2 +- .../fluid/operators/amp/update_loss_scaling_op_npu.cc | 4 ++-- .../operators/elementwise/elementwise_op_npu_test.cc | 4 ++-- paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc | 2 +- paddle/fluid/operators/fused/cudnn_norm_conv_test.cc | 4 ++-- .../operators/fused/fusion_repeated_fc_relu_op.cc | 10 +++++----- .../fluid/operators/fused/fusion_seqpool_concat_op.cc | 6 +++--- .../operators/fused/fusion_seqpool_cvm_concat_op.cc | 6 +++--- paddle/fluid/operators/math/sample_prob.h | 2 +- paddle/fluid/operators/math/selected_rows_functor.cc | 10 +++++----- paddle/fluid/operators/math/softmax_impl.h | 2 +- paddle/fluid/operators/math/tree2col.cc | 6 
+++--- paddle/fluid/operators/roi_align_op_xpu.cc | 2 +- paddle/phi/kernels/cpu/mv_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/rnn_functor.h | 2 +- paddle/phi/kernels/cpu/tril_indices_kernel.cc | 2 +- paddle/phi/kernels/funcs/blas/blas_impl.h | 6 +++--- .../kernels/impl/channel_shuffle_grad_kernel_impl.h | 4 ++-- paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h | 4 ++-- paddle/phi/kernels/impl/mv_kernel_impl.h | 2 +- .../phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h | 4 ++-- paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h | 4 ++-- .../kernels/impl/pixel_unshuffle_grad_kernel_impl.h | 4 ++-- paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h | 4 ++-- paddle/phi/kernels/impl/unfold_grad_kernel_impl.h | 2 +- paddle/phi/kernels/impl/unfold_kernel_impl.h | 2 +- paddle/utils/optional.h | 4 ++++ 35 files changed, 64 insertions(+), 61 deletions(-) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index f9cac0579fec4..11f7391ff3dc1 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -147,7 +147,6 @@ set(COMMON_FLAGS -Wno-error=terminate # Warning in PADDLE_ENFORCE -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2 -Wimplicit-fallthrough=0 # Warning in tinyformat.h - -Wno-error=maybe-uninitialized # Warning in boost gcc 7.2 ${fsanitize} ) diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index cacd55e02a5e2..d8f937e218be4 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -546,9 +546,9 @@ bool DistModel::Run(const std::vector &input_data, DistModelTimer timer; timer.tic(); - double feed_elapse; - double fleet_exe_elapse; - double fetch_elapse; + double feed_elapse = 0; + double fleet_exe_elapse = 0; + double fetch_elapse = 0; if (!FeedData(input_data, scope_.get())) { LOG(ERROR) << "DistModel failed at feeding data."; diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 281e0b9910619..e436bee035cea 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -488,7 +488,7 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope, // Convert weight to fp32 range auto* weight_tensor = scope->Var(quantized_op_weight_node->Name())->GetMutable(); - auto w_dims = weight_tensor->dims(); + const auto& w_dims = weight_tensor->dims(); float* quantized_weight_data = weight_tensor->mutable_data(platform::CPUPlace()); // If quantized op is fc, weight scale size = 1; diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc index f3d96c3850656..bda6b90386475 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -93,7 +93,7 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse( std::vector nodes; std::vector trans_axis0; - int flatten_axis0; + int flatten_axis0 = 0; for (int i = 0; i < times; i++) { PADDLE_ENFORCE_NOT_NULL( subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))), diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index a8c29579e12e7..083fc8991192e 100644 --- 
a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -268,7 +268,7 @@ void LiteSubgraphPass::SetUpEngine( auto nnadapter_model_cache_token = Get>("nnadapter_model_cache_token"); - lite_api::TargetType target_type; + lite_api::TargetType target_type = TARGET(kX86); if (use_gpu) { target_type = TARGET(kCUDA); } else if (use_xpu) { diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index ecb5eaf982548..e8a1384166aff 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -522,7 +522,7 @@ TEST(Tensor, GpuShareExternalData) { auto out = predictor->GetOutputHandle("fc_1.tmp_2"); auto out_shape = out->shape(); - float* out_data; + float* out_data = nullptr; auto out_size = std::accumulate(out_shape.begin(), out_shape.end(), 1, std::multiplies()) * sizeof(float); diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index 21c79f0edd27f..4b4ad01f5674a 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -56,7 +56,7 @@ class MultiheadMatMulOpConverter : public OpConverter { weight_t->numel() * sizeof(float)); // (hidden_in, 3, hidden_out) - auto weight_dims = weight_t->dims(); + const auto& weight_dims = weight_t->dims(); int hidden_in = weight_dims[0]; // channels_in int three = weight_dims[1]; // channels_out diff --git a/paddle/fluid/inference/tensorrt/convert/reduce_op.cc b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc index 7c5eaa309ef18..13886f55dff01 100644 --- a/paddle/fluid/inference/tensorrt/convert/reduce_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc @@ -41,7 +41,7 @@ class ReduceOpConverter : public OpConverter { const framework::Scope& scope, bool test_mode) override { VLOG(4) << "convert a paddle " << op_type << " op to tensorrt reduce layer"; framework::OpDesc op_desc(op, nullptr); - nvinfer1::ReduceOperation reduce_type; + nvinfer1::ReduceOperation reduce_type = nvinfer1::ReduceOperation::kSUM; if (op_type == "reduce_sum") { reduce_type = nvinfer1::ReduceOperation::kSUM; } else if (op_type == "reduce_mean") { diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index 5808841333f08..f9a93a47ff2be 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -161,8 +161,8 @@ class LazyZerosNPU { } auto place = dev_ctx.GetPlace(); auto stream = dev_ctx.stream(); - Tensor* zero_tensor; - void* zero_ptr; + Tensor* zero_tensor = nullptr; + void* zero_ptr = nullptr; if (found_inf_vec[0]) { int max_num = -1; for (size_t i = 0; i < xs.size(); ++i) { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index 3e9263fe93acd..39a80e9571b29 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -75,7 +75,7 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx, paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); - float expected; + float expected = 0.0; if (op_type == "elementwise_add") { expected = 3.0; } 
else if (op_type == "elementwise_sub") { @@ -133,7 +133,7 @@ void CompareGrad(f::Scope *scope, const p::DeviceContext &ctx, paddle::framework::TensorToVector(*tensor_dy, ctx, &dy_vec); ctx.Wait(); - float expected_x, expected_y; + float expected_x = 0, expected_y = 0; if (op_type == "elementwise_add_grad") { expected_x = 1.0; expected_y = 6.0; diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index c5adee547bdac..516b10fa021c1 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -103,7 +103,7 @@ void ComputeSumAndSquareSum(const framework::Tensor &cpu_x, framework::Tensor *cpu_sum, framework::Tensor *cpu_sum_of_square) { // x is in NHWC format. - auto dims = cpu_x.dims(); + const auto &dims = cpu_x.dims(); int64_t c = dims[3]; const T *cpu_x_ptr = cpu_x.data(); diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 884fca2c1b0b8..5881322007add 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -51,7 +51,7 @@ void InitRandomTensor(const std::vector &dims, template void TransposeNchwToNhwc(const framework::Tensor &cpu_in, framework::Tensor *cpu_out) { - auto in_dims = cpu_in.dims(); + const auto &in_dims = cpu_in.dims(); EXPECT_EQ(cpu_in.dims().size(), 4); const T *cpu_in_ptr = cpu_in.data(); @@ -184,7 +184,7 @@ template void ComputeSumAndSquareSum(const framework::Tensor &cpu_out, framework::Tensor *cpu_sum, framework::Tensor *cpu_sum_of_square) { - auto dims = cpu_out.dims(); + const auto &dims = cpu_out.dims(); int64_t c = dims[3]; const T *cpu_out_ptr = cpu_out.data(); diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc index acb94e20df8cb..bed5125b99583 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc @@ -130,7 +130,7 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel { int weight_sz = static_cast(weights.size()); auto i_dims = in->dims(); - auto w_dims = weights[0]->dims(); + const auto& w_dims = weights[0]->dims(); jit::matmul_attr_t attr; attr.m = i_dims[0]; attr.n = w_dims[1]; @@ -140,8 +140,8 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel { relus[0]->mutable_data(place), attr); for (int i = 1; i < weight_sz - 1; ++i) { - auto i_dims = relus[i - 1]->dims(); - auto w_dims = weights[i]->dims(); + const auto& i_dims = relus[i - 1]->dims(); + const auto& w_dims = weights[i]->dims(); attr.m = i_dims[0]; attr.n = w_dims[1]; attr.k = w_dims[0]; @@ -150,8 +150,8 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel { biases[i]->data(), relus[i]->mutable_data(place), attr); } - auto i_dims_last = relus[weight_sz - 2]->dims(); - auto w_dims_last = weights[weight_sz - 1]->dims(); + const auto& i_dims_last = relus[weight_sz - 2]->dims(); + const auto& w_dims_last = weights[weight_sz - 1]->dims(); attr.m = i_dims_last[0]; attr.n = w_dims_last[1]; attr.k = w_dims_last[0]; diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc index 91bc855d43c83..e574d67e3982c 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc @@ -91,8 +91,8 @@ class 
FusionSeqPoolConcatKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); std::string pooltype = ctx.Attr("pooltype"); auto x0_lod = ins[0]->lod(); - auto x0_dims = ins[0]->dims(); - auto y_dims = out->dims(); + const auto& x0_dims = ins[0]->dims(); + const auto& y_dims = out->dims(); size_t bs = x0_lod[0].size() - 1; out->Resize({static_cast(bs), y_dims[1]}); framework::LoD y_lod(1); @@ -122,7 +122,7 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel { size_t n = ins.size(); size_t dst_step_size = n * w; for (size_t i = 0; i < n; ++i) { - auto x_dims = ins[i]->dims(); + const auto& x_dims = ins[i]->dims(); auto x_lod = ins[i]->lod()[0]; const T* src = ins[i]->data(); T* dst = y_data + i * w; diff --git a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc index 123c4c885ead8..c74cc504840d3 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc @@ -92,8 +92,8 @@ class FusionSeqPoolCVMConcatKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); std::string pooltype = ctx.Attr("pooltype"); auto x0_lod = ins[0]->lod(); - auto x0_dims = ins[0]->dims(); - auto y_dims = out->dims(); + const auto& x0_dims = ins[0]->dims(); + const auto& y_dims = out->dims(); size_t bs = x0_lod[0].size() - 1; out->Resize({static_cast(bs), y_dims[1]}); framework::LoD y_lod(1); @@ -121,7 +121,7 @@ class FusionSeqPoolCVMConcatKernel : public framework::OpKernel { size_t n = ins.size(); size_t dst_step_size = n * w; for (size_t i = 0; i < n; ++i) { - auto x_dims = ins[i]->dims(); + const auto& x_dims = ins[i]->dims(); auto x_lod = ins[i]->lod()[0]; const T* src = ins[i]->data(); T* dst = y_data + i * w; diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h index 09daf0afe18bf..18a86d1531724 100644 --- a/paddle/fluid/operators/math/sample_prob.h +++ b/paddle/fluid/operators/math/sample_prob.h @@ -52,7 +52,7 @@ class SampleWithProb { const std::size_t num_samples, const Tensor* L, Tensor* S, Tensor* P) { // UNDERSTAND: dimension issues - const auto lbl_dim = L->dims(); + const auto& lbl_dim = L->dims(); const int batch_size = lbl_dim[0]; const int num_true = lbl_dim[1]; const int num_sampled_classes = num_true + num_samples; diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index a880afb0e9be3..e4b033b6c5857 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -98,8 +98,8 @@ struct SelectedRowsAddTensor { const phi::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { auto in1_height = input1.height(); - auto in2_dims = input2.dims(); - auto out_dims = output->dims(); + const auto& in2_dims = input2.dims(); + const auto& out_dims = output->dims(); PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." @@ -249,7 +249,7 @@ struct SelectedRowsAddToTensor { return; } auto in1_height = input1.height(); - auto in2_dims = input2->dims(); + const auto& in2_dims = input2->dims(); PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." 
@@ -289,7 +289,7 @@ struct SelectedRowsAddToTensor { return; } auto in1_height = input1.height(); - auto in2_dims = input2->dims(); + const auto& in2_dims = input2->dims(); PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." @@ -838,7 +838,7 @@ struct UpdateToTensor { const ScatterOps& op, const phi::SelectedRows& input1, framework::Tensor* input2) { auto in1_height = input1.height(); - auto in2_dims = input2->dims(); + const auto& in2_dims = input2->dims(); PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 9833b4447ec45..69642c8194221 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -231,7 +231,7 @@ class SoftmaxFunctor> { public: void operator()(const DeviceContext& context, const int axis_dim, const framework::Tensor* X, framework::Tensor* Y) { - auto in_dims = X->dims(); + const auto& in_dims = X->dims(); constexpr int kBatchDim = 0; constexpr int kClassDim = 1; diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc index c07582c84acb9..cd1fa13001ce2 100644 --- a/paddle/fluid/operators/math/tree2col.cc +++ b/paddle/fluid/operators/math/tree2col.cc @@ -53,7 +53,7 @@ std::vector Tree2ColUtil::construct_patch( void Tree2ColUtil::construct_tree(const framework::Tensor &EdgeSet, std::vector> *tr, size_t *node_count) { - auto edge_set_dims = EdgeSet.dims(); + const auto &edge_set_dims = EdgeSet.dims(); PADDLE_ENFORCE_EQ(edge_set_dims[1], 2, platform::errors::InvalidArgument( "The second dimension of the EdgeSet shall be 2, but " @@ -89,7 +89,7 @@ class Tree2ColFunctor { const framework::Tensor &node_features, framework::Tensor *patch, int max_depth) { std::vector> tr; - auto feature_dims = node_features.dims(); + const auto &feature_dims = node_features.dims(); auto cpu_place = context.GetPlace(); phi::funcs::SetConstant constant; int64_t feature_size = feature_dims[1]; @@ -142,7 +142,7 @@ class Col2TreeFunctor { const framework::Tensor &out_grad, framework::Tensor *in_grad, int max_depth) { std::vector> tr; - auto output_dims = out_grad.dims(); + const auto &output_dims = out_grad.dims(); auto cpu_place = context.GetPlace(); phi::funcs::SetConstant constant; int64_t output_size = output_dims[1]; diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc index 13490d6fcde3a..7be1c19012099 100644 --- a/paddle/fluid/operators/roi_align_op_xpu.cc +++ b/paddle/fluid/operators/roi_align_op_xpu.cc @@ -37,7 +37,7 @@ class XPUROIAlignOpKernel : public framework::OpKernel { auto sampling_ratio = ctx.Attr("sampling_ratio"); auto aligned = ctx.Attr("aligned"); - auto in_dims = in->dims(); + const auto& in_dims = in->dims(); int batch_size = in_dims[0]; int channels = in_dims[1]; int height = in_dims[2]; diff --git a/paddle/phi/kernels/cpu/mv_grad_kernel.cc b/paddle/phi/kernels/cpu/mv_grad_kernel.cc index c3b7f94be4194..c87801bb69389 100644 --- a/paddle/phi/kernels/cpu/mv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/mv_grad_kernel.cc @@ -31,7 +31,7 @@ void MvGradKernel(const Context& dev_ctx, auto dx = x_grad; auto dvec = vec_grad; - auto dim_x = x.dims(); + const auto& dim_x = x.dims(); int m = dim_x[0]; int n = dim_x[1]; diff --git a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc 
b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc index fbed3f1cb133a..715e6b008ed77 100644 --- a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc @@ -32,7 +32,7 @@ void PsroiPoolGradKernel(const Context& ctx, float spatial_scale, DenseTensor* dx) { if (dx) { - auto in_dims = x.dims(); + const auto& in_dims = x.dims(); int input_channels = in_dims[1]; int height = in_dims[2]; int width = in_dims[3]; diff --git a/paddle/phi/kernels/cpu/rnn_functor.h b/paddle/phi/kernels/cpu/rnn_functor.h index 961bc7a214be5..ab6f98ffcd5d6 100644 --- a/paddle/phi/kernels/cpu/rnn_functor.h +++ b/paddle/phi/kernels/cpu/rnn_functor.h @@ -330,7 +330,7 @@ void RnnFunc(const Context& dev_ctx, } } - DenseTensor* input_holder; + DenseTensor* input_holder = nullptr; DenseTensor* output_holder = output; bool has_allocate_mem = false; diff --git a/paddle/phi/kernels/cpu/tril_indices_kernel.cc b/paddle/phi/kernels/cpu/tril_indices_kernel.cc index c515a69f011d5..71c5cd820b383 100644 --- a/paddle/phi/kernels/cpu/tril_indices_kernel.cc +++ b/paddle/phi/kernels/cpu/tril_indices_kernel.cc @@ -26,7 +26,7 @@ void TrilIndicesKernel(const Context& dev_ctx, DataType dtype, DenseTensor* out) { T* out_data = dev_ctx.template Alloc(out); - auto out_dims = out->dims(); + const auto& out_dims = out->dims(); int64_t tril_size = out_dims[1]; int64_t i = 0; T r = std::max(0, -offset), c = 0; diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index 2868aa5acb75e..db4796b3f61ca 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -1284,9 +1284,9 @@ void Blas::MatMul(const phi::DenseTensor &mat_a, T alpha, phi::DenseTensor *mat_out, T beta) const { - auto dim_a = mat_a.dims(); - auto dim_b = mat_b.dims(); - auto dim_out = mat_out->dims(); + const auto &dim_a = mat_a.dims(); + const auto &dim_b = mat_b.dims(); + const auto &dim_out = mat_out->dims(); PADDLE_ENFORCE_EQ( dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, true, diff --git a/paddle/phi/kernels/impl/channel_shuffle_grad_kernel_impl.h b/paddle/phi/kernels/impl/channel_shuffle_grad_kernel_impl.h index 26bee763eca52..b8406b9143103 100644 --- a/paddle/phi/kernels/impl/channel_shuffle_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/channel_shuffle_grad_kernel_impl.h @@ -32,8 +32,8 @@ void ChannelShuffleGradKernel(const Context& dev_ctx, auto* dx = x_grad; dev_ctx.template Alloc(dx); bool channel_last = (data_format == "NHWC"); - auto do_dims = dout->dims(); - auto dx_dims = dx->dims(); + const auto& do_dims = dout->dims(); + const auto& dx_dims = dx->dims(); DenseTensor t(*dout); if (!channel_last) { diff --git a/paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h b/paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h index c723cd3622af9..7e31e02851591 100644 --- a/paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h +++ b/paddle/phi/kernels/impl/channel_shuffle_kernel_impl.h @@ -31,8 +31,8 @@ void ChannelShuffleKernel(const Context& dev_ctx, auto* in = &x; dev_ctx.template Alloc(out); bool channel_last = (data_format == "NHWC"); - auto in_dims = in->dims(); - auto o_dims = out->dims(); + const auto& in_dims = in->dims(); + const auto& o_dims = out->dims(); DenseTensor t(*in); if (!channel_last) { diff --git a/paddle/phi/kernels/impl/mv_kernel_impl.h b/paddle/phi/kernels/impl/mv_kernel_impl.h index 1754ea323ceb9..4baee25a0993a 100644 --- a/paddle/phi/kernels/impl/mv_kernel_impl.h +++ 
b/paddle/phi/kernels/impl/mv_kernel_impl.h @@ -23,7 +23,7 @@ void MvKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& vec, DenseTensor* out) { - auto dim_x = x.dims(); + const auto& dim_x = x.dims(); // get data ptr const T* x_data = x.data(); diff --git a/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h b/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h index db19a04337932..f71f6cd990aa1 100644 --- a/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h @@ -32,8 +32,8 @@ void PixelShuffleGradKernel(const Context& ctx, ctx.template Alloc(dx); int factor = upscale_factor; bool channel_last = (data_format == "NHWC"); - auto do_dims = dout->dims(); - auto dx_dims = dx->dims(); + const auto& do_dims = dout->dims(); + const auto& dx_dims = dx->dims(); DenseTensor t(*dout); if (!channel_last) { diff --git a/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h b/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h index 2303db4ea57d6..c5e41b4902951 100644 --- a/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h +++ b/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h @@ -31,8 +31,8 @@ void PixelShuffleKernel(const Context& ctx, ctx.template Alloc(out); int factor = upscale_factor; bool channel_last = (data_format == "NHWC"); - auto in_dims = in->dims(); - auto o_dims = out->dims(); + const auto& in_dims = in->dims(); + const auto& o_dims = out->dims(); DenseTensor t(*in); if (!channel_last) { diff --git a/paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h b/paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h index cb02539f2e890..399c6a56727e2 100644 --- a/paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/pixel_unshuffle_grad_kernel_impl.h @@ -33,8 +33,8 @@ void PixelUnshuffleGradKernel(const Context& dev_ctx, dev_ctx.template Alloc(dx); int factor = downscale_factor; bool channel_last = (data_format == "NHWC"); - auto do_dims = dout->dims(); - auto dx_dims = dx->dims(); + const auto& do_dims = dout->dims(); + const auto& dx_dims = dx->dims(); DenseTensor t(*dout); if (!channel_last) { diff --git a/paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h b/paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h index 0a140b270ba1b..7ffce62dacf65 100644 --- a/paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h +++ b/paddle/phi/kernels/impl/pixel_unshuffle_kernel_impl.h @@ -32,8 +32,8 @@ void PixelUnshuffleKernel(const Context& dev_ctx, dev_ctx.template Alloc(out); int factor = downscale_factor; bool channel_last = (data_format == "NHWC"); - auto in_dims = in->dims(); - auto o_dims = out->dims(); + const auto& in_dims = in->dims(); + const auto& o_dims = out->dims(); DenseTensor t(*in); if (!channel_last) { diff --git a/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h b/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h index 5556654ee7c0d..0724cffdd4448 100644 --- a/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/unfold_grad_kernel_impl.h @@ -35,7 +35,7 @@ void UnfoldGradKernel(const Context& ctx, if (!x_grad) return; - auto x_dims = x_grad->dims(); + const auto& x_dims = x_grad->dims(); const int batch_size = static_cast(x_dims[0]); int out_height = phi::funcs::CalcOutputSize(x_dims[2], diff --git a/paddle/phi/kernels/impl/unfold_kernel_impl.h b/paddle/phi/kernels/impl/unfold_kernel_impl.h index e914f6cacbde9..4526d1c3dcd7d 100644 --- a/paddle/phi/kernels/impl/unfold_kernel_impl.h +++ 
b/paddle/phi/kernels/impl/unfold_kernel_impl.h @@ -36,7 +36,7 @@ void UnfoldKernel(const Context& ctx, paddle::operators::math:: Im2ColFunctor im2col; - auto x_dims = x.dims(); + const auto& x_dims = x.dims(); int out_height = phi::funcs::CalcOutputSize(x_dims[2], kernel_sizes[0], diff --git a/paddle/utils/optional.h b/paddle/utils/optional.h index eec5f32be7226..2b5a657f4d42e 100644 --- a/paddle/utils/optional.h +++ b/paddle/utils/optional.h @@ -100,7 +100,11 @@ class reference_content { public: // structors ~reference_content() {} +// TODO(zhiqiu): remove it +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" reference_content(RefT r) : content_(r) {} +#pragma GCC diagnostic pop reference_content(const reference_content& operand) : content_(operand.content_) {} From c6f98fa0ec9068ee93eead3beb6cce8a377f1342 Mon Sep 17 00:00:00 2001 From: onecatcn Date: Wed, 25 May 2022 16:17:29 +0800 Subject: [PATCH 037/109] fix a bug in metrics.py; test=document_fix (#42976) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR types Bug fixes PR changes Docs Describe Fix the paddle.metric.accuracy documentation; the corresponding Chinese documentation fix is https://github.com/PaddlePaddle/docs/pull/4811. The file was edited based on the discussion in the issue: INT32 Failed on paddle.metric.accuracy: https://github.com/PaddlePaddle/Paddle/issues/42845 --- python/paddle/metric/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py index 118004088da16..d399cb2052498 100644 --- a/python/paddle/metric/metrics.py +++ b/python/paddle/metric/metrics.py @@ -771,7 +771,7 @@ def accuracy(input, label, k=1, correct=None, total=None, name=None): Args: input(Tensor): The input of accuracy layer, which is the predictions of network. A Tensor with type float32,float64. The shape is ``[sample_number, class_dim]`` . - label(Tensor): The label of dataset. Tensor with type int32,int64. The shape is ``[sample_number, 1]`` . + label(Tensor): The label of dataset. Tensor with type int64. The shape is ``[sample_number, 1]`` . k(int, optional): The top k predictions for each class will be checked. Data type is int64 or int32. correct(Tensor, optional): The correct predictions count. A Tensor with type int64 or int32. total(Tensor, optional): The total entries count. A tensor with type int64 or int32.
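[Editor's note] For reference, a minimal usage sketch consistent with the corrected docstring above. This example is not part of the patch; the variable names and tensor values are illustrative, and it assumes a recent Paddle 2.x build running in the default dynamic-graph mode. The accuracy signature and the int64 label requirement come from the docstring changed in patch 037.

    # Editor's illustration only; values are arbitrary.
    import paddle

    # Predictions for 4 samples over 3 classes.
    pred = paddle.to_tensor([[0.1, 0.7, 0.2],
                             [0.6, 0.3, 0.1],
                             [0.2, 0.2, 0.6],
                             [0.5, 0.4, 0.1]], dtype='float32')
    # Labels must be int64 with shape [sample_number, 1].
    label = paddle.to_tensor([[1], [0], [2], [1]], dtype='int64')
    acc = paddle.metric.accuracy(input=pred, label=label, k=1)
    print(acc)  # top-1 accuracy of this toy batch: 0.75

Passing an int32 label tensor here is what the linked issue (#42845) reported as failing, which is why the docstring now advertises int64 only.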
From 657abd517f3930b37c2a665dc1ef5c8140252504 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Wed, 25 May 2022 14:51:52 +0200 Subject: [PATCH 038/109] OneDNN md-in-tensor refactoring part 4: Memory descriptor enabled for more ops (#42946) * added support for md in more ops * fixed typo --- .../mkldnn/fill_constant_mkldnn_op.cc | 6 ++- .../fluid/operators/mkldnn/lrn_mkldnn_op.cc | 8 ++-- .../fluid/operators/mkldnn/slice_mkldnn_op.cc | 13 ++--- .../fluid/operators/mkldnn/stack_mkldnn_op.cc | 10 ++-- .../fluid/operators/mkldnn/sum_mkldnn_op.cc | 47 +++++-------------- 5 files changed, 29 insertions(+), 55 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc index cfc320da47fff..73e783068379d 100644 --- a/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc @@ -79,8 +79,10 @@ class FillConstantMKLDNNKernel : public framework::OpKernel { {DNNL_ARG_DST, *src0_memory_p}}); astream.wait(); - out->set_layout(framework::DataLayout::kMKLDNN); - out->set_format(platform::GetPlainMKLDNNFormat(out->dims().size())); + // src0_memory_p's md was just to allow the usage of a binary + // primitive as a memset, and now we need to create a real one + out->set_mem_desc({phi::vectorize(shape), platform::MKLDNNGetDataType(), + platform::GetPlainMKLDNNFormat(shape.size())}); } T CalculateFillValue(const framework::ExecutionContext& ctx) const { diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index d3a36555c389a..245ae2196ca38 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -124,7 +124,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); if (!workspace_memory->get_desc().is_zero()) { - mid->set_format(platform::GetMKLDNNFormat(*workspace_memory)); + mid->set_mem_desc(workspace_memory->get_desc()); lrn_p->execute(astream, {{DNNL_ARG_SRC, *src_memory}, {DNNL_ARG_DST, *dst_memory}, {DNNL_ARG_WORKSPACE, *workspace_memory}}); @@ -134,8 +134,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { } astream.wait(); - out->set_layout(framework::DataLayout::kMKLDNN); - out->set_format(platform::GetMKLDNNFormat(*dst_memory)); + out->set_mem_desc(dst_memory->get_desc()); } }; @@ -177,8 +176,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { {DNNL_ARG_WORKSPACE, *workspace}}); astream.wait(); - in_x_grad->set_layout(framework::DataLayout::kMKLDNN); - in_x_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory)); + in_x_grad->set_mem_desc(diff_src_memory->get_desc()); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc index 2a8627b803a6e..2df9e5c20fda8 100644 --- a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc @@ -175,19 +175,17 @@ class SliceGradMKLDNNKernel : public framework::OpKernel { dnnl::memory::data_type dout_type = framework::ToMKLDNNDataType( framework::TransToProtoVarType(dout->dtype())); - dnnl::memory::desc md(dout_vec_dims, platform::MKLDNNGetDataType(), - dout->format()); - dnnl::memory::format_tag reorder_format_tag = - platform::GetMKLDNNFormat(md.reshape(slice_dims)); platform::ReorderMKLDNNHandler reorder_handler( slice_dims, 
framework::TransToProtoVarType(dout->dtype()), dout_type, onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - reorder_format_tag, platform::to_void_cast(dout->data())); + dout->mem_desc().reshape(slice_dims), + platform::to_void_cast(dout->data())); auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - dx, dx_vec_dims, reorder_format_tag, ctx.GetPlace()); + dx, dx_vec_dims, platform::GetPlainMKLDNNFormat(dx_vec_dims.size()), + ctx.GetPlace()); memset(dx->data(), 0, reorder_dst_memory_p->get_desc().get_size()); auto slice_mem_p = reorder_handler.AcquireSubmemory(slice_dims, offsets, @@ -199,8 +197,7 @@ class SliceGradMKLDNNKernel : public framework::OpKernel { reorder_p->execute(astream, *reorder_src_memory_p, *slice_mem_p); astream.wait(); - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(reorder_format_tag); + dx->set_mem_desc(reorder_dst_memory_p->get_desc()); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc index 36be1681b05e7..28a00be5fa47e 100644 --- a/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc @@ -59,7 +59,7 @@ class StackMKLDNNHandler // wrong output format deduction and suboptimal performance as a result if (stack_axis != ndims) { for (size_t i = 0; i < inputs.size(); ++i) { - srcs_md.emplace_back(memory::desc(input_dims, dt, inputs[i]->format())); + srcs_md.push_back(inputs[i]->mem_desc()); } input_dims[stack_axis] *= inputs.size(); @@ -69,8 +69,7 @@ class StackMKLDNNHandler extended_input_dims[stack_axis] = 1; for (size_t i = 0; i < inputs.size(); ++i) { - srcs_md.emplace_back(memory::desc(input_dims, dt, inputs[i]->format()) - .reshape(extended_input_dims)); + srcs_md.push_back(inputs[i]->mem_desc().reshape(extended_input_dims)); } // concat primitive choses suboptimal format tag because it cannot @@ -130,9 +129,8 @@ class StackMKLDNNOpKernel : public paddle::framework::OpKernel { concat_p->execute(astream, args); astream.wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(platform::GetMKLDNNFormat( - dst_mem->get_desc().reshape(phi::vectorize(output->dims())))); + output->set_mem_desc( + dst_mem->get_desc().reshape(phi::vectorize(output->dims()))); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 99f957f573a17..de21c2687bd44 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -60,17 +60,16 @@ class SumMKLDNNHandler auto src_tz = dst_tz; std::vector srcs_md; + srcs_md.reserve(in_vars.size()); for (size_t i = 0; i < in_vars.size(); i++) { auto& input_it = in_vars[i]->Get(); if (input_it.numel() == 0) { continue; } - MKLDNNMemoryFormat input_format = input_it.format(); - srcs_md.push_back(dnnl::memory::desc( - src_tz, platform::MKLDNNGetDataType(), input_format)); + srcs_md.push_back(input_it.mem_desc()); ++num_inputs_; } - std::vector scales(num_inputs_, 1.0); + std::vector scales(num_inputs_, 1.0f); auto dst_md = dnnl::memory::desc(dst_tz, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::any); @@ -139,47 +138,27 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { ++input_index; } - std::shared_ptr dst_mem = nullptr; + std::unordered_map args; + std::shared_ptr dst_mem; + + for (size_t i = 0; i < srcs_mem.size(); ++i) { + args.insert({DNNL_ARG_MULTIPLE_SRC + i, *(srcs_mem[i])}); + 
} + if (in_place) { - dst_mem = handler.AcquireDstMemory(); - output->mutable_data(ctx.GetPlace()); + dst_mem = srcs_mem[0]; } else { dst_mem = handler.AcquireDstMemory(output); } + args.insert({DNNL_ARG_DST, *dst_mem}); auto sum_p = handler.AcquireForwardPrimitive(); - std::unordered_map args; - for (size_t i = 0; i < srcs_mem.size(); ++i) { - args.insert({DNNL_ARG_MULTIPLE_SRC + i, *(srcs_mem[i])}); - } - args.insert({DNNL_ARG_DST, *dst_mem}); - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); sum_p->execute(astream, args); astream.wait(); - // For in-place execution which sum does not have we need to fake it - // so from oneDNN dst memory we reorder data into input - if (in_place) { - auto& in_out = in_vars[0]->Get(); - auto output_tz = phi::vectorize(output->dims()); - platform::ReorderMKLDNNHandler reorder_handler( - output_tz, framework::TransToProtoVarType(output->dtype()), - framework::ToMKLDNNDataType( - framework::TransToProtoVarType(in_out.dtype())), - dev_ctx.GetEngine()); - - auto target_mem = reorder_handler.AcquireDstMemory( - output, in_out.format(), ctx.GetPlace()); - - auto reorder_p = reorder_handler.AcquireReorder(target_mem, dst_mem); - - reorder_p->execute(astream, *dst_mem, *target_mem); - astream.wait(); - } - output->set_layout(framework::DataLayout::kMKLDNN); - output->set_format(platform::GetMKLDNNFormat(*dst_mem)); + output->set_mem_desc(dst_mem->get_desc()); } }; From f70a734f289cd7a81410b94eb959bc2ba9e7ae0e Mon Sep 17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Wed, 25 May 2022 21:11:44 +0800 Subject: [PATCH 039/109] fix_multi_int8 (#42977) --- paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index a8595d55b31b0..4a5947778056a 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -864,7 +864,7 @@ int MultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph, auto* mul0_op_desc = mul0->Op(); // all mul op has same input. 
- if (multihead_op_desc.HasAttr("Input_scale")) { + if (mul0_op_desc->HasAttr("Input_scale")) { multihead_op_desc.SetAttr("Input_scale", mul0_op_desc->GetAttr("Input_scale")); } From 3ee1b99b73ce56550342cb2fdb104f15b13704fb Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Thu, 26 May 2022 10:36:29 +0800 Subject: [PATCH 040/109] remove Wno-error=parentheses-equality (#42993) --- cmake/flags.cmake | 1 - 1 file changed, 1 deletion(-) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 11f7391ff3dc1..0dbd3bc328314 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -142,7 +142,6 @@ set(COMMON_FLAGS -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs - -Wno-error=parentheses-equality # Warnings in pybind11 -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3 -Wno-error=terminate # Warning in PADDLE_ENFORCE -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2 From 5b86e190f5143b2c4f7db37bbe7fa08ac5fe5301 Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Thu, 26 May 2022 11:23:55 +0800 Subject: [PATCH 041/109] Use all sitepackages path as the library/include path (#42940) --- .../custom_kernel/custom_kernel_dot_setup.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py index 3cef228d14d6e..d52882acfc9ac 100644 --- a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py @@ -13,6 +13,7 @@ # limitations under the License. import os +import site from paddle.fluid import core from distutils.sysconfig import get_python_lib from distutils.core import setup, Extension @@ -42,10 +43,11 @@ def build_extensions(self): paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0'] # include path -site_packages_path = get_python_lib() -paddle_custom_kernel_include = [ - os.path.join(site_packages_path, 'paddle', 'include'), -] +site_packages_path = site.getsitepackages() +paddle_custom_kernel_include = list( + map(lambda path: os.path.join(path, 'paddle', 'include'), + site_packages_path)) + # include path third_party compile_third_party_path = os.path.join(os.environ['PADDLE_ROOT'], 'build/third_party') @@ -56,9 +58,8 @@ def build_extensions(self): ] # libs path -paddle_custom_kernel_library_dir = [ - os.path.join(site_packages_path, 'paddle', 'fluid'), -] +paddle_custom_kernel_library_dir = list( + map(lambda path: os.path.join(path, 'paddle', 'fluid'), site_packages_path)) # libs libs = [':core_avx.so'] From 52ff3f4869b41e706536711803b664a58c156cf7 Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Thu, 26 May 2022 11:30:17 +0800 Subject: [PATCH 042/109] fix pipeline on processgroup (#42989) --- python/paddle/distributed/collective.py | 5 +++++ .../meta_parallel/pp_utils/p2p_communication.py | 13 ++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index cd03e55f25f61..5f481bd0dca41 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -403,6 +403,11 @@ def new_group(ranks=None, backend=None): _group_map_by_name[group_name] = group _group_map[gid] = group + # TODO(shenliang03): This is a temporary solution to solve the problem of + # hang caused by tcp + tmp = paddle.to_tensor([1], dtype="int32") + paddle.distributed.all_reduce(tmp, group=group, 
use_calc_stream=True) + paddle.distributed.wait(tmp) return group if not backend: diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py index b6698a200e945..de36f8503a651 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py @@ -18,6 +18,7 @@ import numpy as np from paddle import _C_ops import paddle.fluid.core as core +from paddle.fluid.framework import _in_legacy_dygraph, _non_static_mode, in_dygraph_mode _hcg = None _use_cache = False @@ -148,9 +149,15 @@ def set_send_message(self, tensor): def _is_valid_send_recv_partial(tensor, mp_degree): - tensor_numel = np.prod(tensor.shape) - assert tensor_numel != 0, "can't send/recv zero element" - return mp_degree > 1 and tensor_numel % mp_degree == 0 + + if _in_legacy_dygraph(): + tensor_numel = np.prod(tensor.shape) + assert tensor_numel != 0, "can't send/recv zero element" + return mp_degree > 1 and tensor_numel % mp_degree == 0 + elif in_dygraph_mode(): + # TODO(shenliang03) support mp+pp optimizer in future. + # (partial_send/partial_recv/partial_allgather_) + return False def send_partial(tensor, From 8f7f3ac9f2a0209959d0fe3bd8c8f50744f03b64 Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Thu, 26 May 2022 12:42:32 +0800 Subject: [PATCH 043/109] [GPUPS]fix dymf gpups pscore (#42991) --- paddle/fluid/framework/data_set.cc | 9 +- .../fleet/heter_ps/hashtable_kernel.cu | 11 ++- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 92 +++++++++++++++++-- .../distributed/passes/ps_trainer_pass.py | 6 +- .../fleet/parameter_server/ir/trainer_pass.py | 8 +- 5 files changed, 101 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index de563330d68e9..0c762ab2e77e5 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -320,12 +320,11 @@ static int compute_thread_batch_nccl( thread_avg_batch_num = static_cast(offset.size() / thr_num); #ifdef PADDLE_WITH_GLOO auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance(); - if (!gloo_wrapper->IsInitialized()) { - VLOG(0) << "GLOO is not inited"; - gloo_wrapper->Init(); - } - if (gloo_wrapper->Size() > 1) { + if (!gloo_wrapper->IsInitialized()) { + VLOG(0) << "GLOO is not inited"; + gloo_wrapper->Init(); + } // adjust batch num per thread for NCCL std::vector thread_avg_batch_num_vec(1, thread_avg_batch_num); std::vector total_instance_num_vec(1, total_instance_num); diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index f5807d2fd7eb7..6b0141f546c66 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -341,6 +341,8 @@ template class HashTable; template class HashTable; template class HashTable; template class HashTable; +template class HashTable; +template class HashTable; template class HashTable; template class HashTable; template class HashTable; @@ -367,6 +369,8 @@ template void HashTable::get(const long* d_keys, cudaStream_t stream); template void HashTable::get( const long* d_keys, unsigned int* d_vals, size_t len, cudaStream_t stream); +template void HashTable::get( + const unsigned long* d_keys, long* d_vals, size_t len, cudaStream_t stream); // template void // 
HashTable::get( // const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t @@ -402,10 +406,9 @@ template void HashTable::insert( const long* d_keys, const unsigned int* d_vals, size_t len, cudaStream_t stream); -// template void HashTable::insert< -// cudaStream_t>(const unsigned long* d_keys, size_t len, char* pool, -// size_t start_index, cudaStream_t stream); +template void HashTable::insert( + const unsigned long* d_keys, const long* d_vals, size_t len, + cudaStream_t stream); template void HashTable:: dump_to_cpu(int devid, cudaStream_t stream); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 18eec174fe9ce..ac08e37aec1fc 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -28,11 +28,16 @@ limitations under the License. */ #ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" + #include #include -#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/platform/timer.h" +#if defined(PADDLE_WITH_PSCORE) +#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" +#include "paddle/fluid/distributed/ps/table/depends/feature_value.h" +#endif namespace paddle { namespace framework { @@ -292,10 +297,10 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { auto ptl_dynamic_mf_func = [this, &local_dim_keys, &local_dim_ptr, &fleet_ptr](int i, int j) { -#ifdef PADDLE_WITH_PSLIB size_t key_size = local_dim_keys[i][j].size(); int32_t status = -1; int32_t cnt = 0; +#ifdef PADDLE_WITH_PSLIB while (true) { auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( i, reinterpret_cast(local_dim_ptr[i][j].data()), @@ -325,6 +330,38 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { break; } } +#endif +#ifdef PADDLE_WITH_PSCORE + while (true) { + auto tt = fleet_ptr->worker_ptr_->PullSparsePtr( + reinterpret_cast(local_dim_ptr[i][j].data()), this->table_id_, + local_dim_keys[i][j].data(), key_size); + bool flag = true; + + tt.wait(); + + try { + status = tt.get(); + } catch (const std::future_error& e) { + VLOG(0) << "Caught a future_error with code" << e.code() + << ", Message:" << e.what(); + } + if (status != 0) { + VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; + sleep(sleep_seconds_before_fail_exit_); + flag = false; + cnt++; + } + if (cnt > 3) { + VLOG(0) << "fleet pull sparse failed, retry 3 times"; + exit(-1); + } + + if (flag) { + break; + } + } +#endif if (status != 0) { LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; sleep(300); @@ -333,7 +370,6 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { VLOG(0) << "FleetWrapper Pull sparse to local done with table size: " << local_dim_keys[i][j].size(); } -#endif }; threads.resize(thread_keys_shard_num_ * multi_mf_dim_); @@ -369,10 +405,16 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { &local_dim_ptr, &device_dim_keys, &device_dim_ptr, &device_dim_mutex](int i, int j) { -#ifdef PADDLE_WITH_PSLIB std::vector> task_keys(device_num); +#ifdef PADDLE_WITH_PSLIB std::vector> task_ptrs( device_num); +#endif + +#ifdef PADDLE_WITH_PSCORE + std::vector> task_ptrs( + device_num); +#endif for (size_t k = 0; k < local_dim_keys[i][j].size(); k++) { int shard = local_dim_keys[i][j][k] % device_num; task_keys[shard].push_back(local_dim_keys[i][j][k]); @@ -391,7 +433,6 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } device_dim_mutex[dev][j]->unlock(); } -#endif }; auto 
build_func = [device_num, record_status, &pass_values, &local_keys, &local_ptr, &device_task_keys, &device_task_ptrs](int i) { @@ -629,12 +670,26 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { val->lr_g2sum = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: DownpourCtrDymfFeatureValue::embed_g2sum_index()]; - val->cpu_ptr = (uint64_t)(device_dim_ptrs[k]); - // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor ptr_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: mf_dim_index()] = float(mf_dim); val->mf_dim = mf_dim; +#endif +#ifdef PADDLE_WITH_PSCORE + paddle::distributed::CtrDymfAccessor accessor; + val->delta_score = + ptr_val[accessor.common_feature_value.DeltaScoreIndex()]; + val->show = ptr_val[accessor.common_feature_value.ShowIndex()]; + val->clk = ptr_val[accessor.common_feature_value.ClickIndex()]; + val->slot = int(ptr_val[accessor.common_feature_value.SlotIndex()]); + val->lr = ptr_val[accessor.common_feature_value.EmbedWIndex()]; + val->lr_g2sum = ptr_val[accessor.common_feature_value.EmbedG2SumIndex()]; + + val->cpu_ptr = (uint64_t)(device_dim_ptrs[k]); + + // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor + ptr_val[accessor.common_feature_value.MfDimIndex()] = float(mf_dim); + val->mf_dim = mf_dim; #endif if (dim > 8) { // CpuPS alreay expand as mf_dim val->mf_size = mf_dim + 1; @@ -802,7 +857,6 @@ void PSGPUWrapper::EndPass() { cudaMemcpyDeviceToHost); CHECK(len == hbm_pool->capacity()); -#ifdef PADDLE_WITH_PSLIB uint64_t unuse_key = std::numeric_limits::max(); for (size_t i = 0; i < len; ++i) { if (device_keys[i] == unuse_key) { @@ -810,6 +864,7 @@ void PSGPUWrapper::EndPass() { } size_t offset = i * feature_value_size; FeatureValue* gpu_val = (FeatureValue*)(test_build_values + offset); +#ifdef PADDLE_WITH_PSLIB auto* downpour_value = (paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr); int downpour_value_size = downpour_value->size(); @@ -829,13 +884,32 @@ void PSGPUWrapper::EndPass() { embed_g2sum_index()] = gpu_val->lr_g2sum; cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: slot_index()] = gpu_val->slot; +#endif +#ifdef PADDLE_WITH_PSCORE + auto* downpour_value = + (paddle::distributed::FixedFeatureValue*)(gpu_val->cpu_ptr); + int downpour_value_size = downpour_value->size(); + if (gpu_val->mf_size > 0 && downpour_value_size == 8) { + downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size); + } + float* cpu_val = downpour_value->data(); + + paddle::distributed::CtrDymfAccessor accessor; + cpu_val[accessor.common_feature_value.DeltaScoreIndex()] = + gpu_val->delta_score; + cpu_val[accessor.common_feature_value.ShowIndex()] = gpu_val->show; + cpu_val[accessor.common_feature_value.ClickIndex()] = gpu_val->clk; + cpu_val[accessor.common_feature_value.EmbedWIndex()] = gpu_val->lr; + cpu_val[accessor.common_feature_value.EmbedG2SumIndex()] = + gpu_val->lr_g2sum; + cpu_val[accessor.common_feature_value.SlotIndex()] = gpu_val->slot; +#endif if (gpu_val->mf_size > 0) { for (int x = 0; x < gpu_val->mf_dim + 1; x++) { cpu_val[x + 8] = gpu_val->mf[x]; } } } -#endif free(test_build_values); }; if (multi_mf_dim_) { diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py index 0792a1eddc7fd..6112a9a1f45b6 100755 --- a/python/paddle/distributed/passes/ps_trainer_pass.py +++ b/python/paddle/distributed/passes/ps_trainer_pass.py @@ -375,12 +375,12 @@ def dag_check_up_and_reorder(program, inputs, outputs): if 
attrs['use_ps_gpu']: _program.global_block()._insert_op( index=distributed_idx, - type="pull_box_sparse", + type="pull_gpups_sparse", inputs={"Ids": inputs, 'W': w}, outputs={"Out": outputs}, attrs={ - "size": w.shape[1], + "size": [w.shape[1] for i in inputs], "is_distributed": True, "is_sparse": True }) @@ -679,7 +679,7 @@ def _remove_lookup_table_grad_op_and_var(self, program): lookup_table_grad_var[name] = 1 for idx, op in list(enumerate(program.global_block().ops)): - if op.type == "pull_box_sparse": + if op.type == "pull_box_sparse" or op.type == "pull_gpups_sparse": continue for key_name in op.input_names: for var in op.input(key_name): diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py index 2c09abac9e7ba..51e89cc301cf3 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -293,12 +293,12 @@ def dag_check_up_and_reorder(program, inputs, outputs): if use_ps_gpu: program.global_block()._insert_op( index=distributed_idx, - type="pull_box_sparse", + type="pull_gpups_sparse", inputs={"Ids": inputs, 'W': w}, outputs={"Out": outputs}, attrs={ - "size": w.shape[1], + "size": [w.shape[1] for i in inputs], "is_distributed": True, "is_sparse": True }) @@ -576,7 +576,7 @@ def _add_push_box_sparse_op(program): op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() backward = core.op_proto_and_checker_maker.OpRole.Backward for op in program.global_block().ops: - if op.type != "pull_box_sparse": + if op.type != "pull_box_sparse" and op.type != "pull_gpups_sparse": continue grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(set()), []) @@ -599,7 +599,7 @@ def _remove_lookup_table_grad_op_and_var(program): lookup_table_grad_var[name] = 1 for idx, op in list(enumerate(program.global_block().ops)): - if op.type == "pull_box_sparse": + if op.type == "pull_box_sparse" or op.type == "pull_gpups_sparse": continue for key_name in op.input_names: for var in op.input(key_name): From cc272afb7e4ffde063a2876b3b13deeda9c45310 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 26 May 2022 14:21:52 +0800 Subject: [PATCH 044/109] [Phi]Refactor InstanceNormKernel and InstanceNormGradKernel (#42978) * move instance_norm * change mutable_data * fix compile bugs --- paddle/fluid/operators/instance_norm_op.cc | 220 ---------- paddle/fluid/operators/instance_norm_op.cu | 384 ------------------ .../kernels/cpu/instance_norm_grad_kernel.cc | 146 +++++++ .../phi/kernels/cpu/instance_norm_kernel.cc | 126 ++++++ .../kernels/gpu/instance_norm_grad_kernel.cu | 319 +++++++++++++++ .../phi/kernels/gpu/instance_norm_kernel.cu | 221 ++++++++++ paddle/phi/kernels/gpu/instance_norm_utils.h | 73 ++++ .../phi/kernels/instance_norm_grad_kernel.h | 33 ++ paddle/phi/kernels/instance_norm_kernel.h | 31 ++ paddle/phi/ops/compat/instance_norm_sig.cc | 38 ++ 10 files changed, 987 insertions(+), 604 deletions(-) create mode 100644 paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/instance_norm_kernel.cc create mode 100644 paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/instance_norm_kernel.cu create mode 100644 paddle/phi/kernels/gpu/instance_norm_utils.h create mode 100644 paddle/phi/kernels/instance_norm_grad_kernel.h create mode 100644 paddle/phi/kernels/instance_norm_kernel.h create mode 100644 
paddle/phi/ops/compat/instance_norm_sig.cc diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index a7d96437e95c4..2cbd48cf093e2 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -170,104 +170,6 @@ NCHW `[batch, in_channels, in_height, in_width]` )DOC"); } -template -class InstanceNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - T epsilon = static_cast(ctx.Attr("epsilon")); - - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - - const int N = x_dims[0]; - const int C = x_dims[1]; - const int NxC = N * C; - - const int sample_size = x->numel() / N / C; - - auto *y = ctx.Output("Y"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - - auto &dev_ctx = ctx.template device_context(); - auto *place = dev_ctx.eigen_device(); - - Eigen::DSizes shape(NxC, sample_size); -// Once eigen on Windows is updated, the if branch can be removed. -#ifndef EIGEN_HAS_INDEX_LIST - Eigen::DSizes bcast(1, sample_size); - Eigen::DSizes C_shape(C, 1); - Eigen::DSizes NxC_shape(NxC, 1); - Eigen::DSizes rdims(1); -#else - Eigen::IndexList, int> bcast; - bcast.set(1, sample_size); - Eigen::IndexList> C_shape; - C_shape.set(0, C); - Eigen::IndexList> NxC_shape; - NxC_shape.set(0, NxC); - Eigen::IndexList> rdims; -#endif - - phi::funcs::SetConstant set_constant; - - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, saved_mean, static_cast(0)); - set_constant(dev_ctx, saved_variance, static_cast(0)); - - auto saved_mean_a = framework::EigenVector::Flatten(*saved_mean); - auto saved_mean_e = saved_mean_a.reshape(NxC_shape); - auto saved_variance_a = framework::EigenVector::Flatten(*saved_variance); - auto saved_variance_e = saved_variance_a.reshape(NxC_shape); - - auto x_e = framework::EigenVector::Flatten(*x); - auto x_arr = x_e.reshape(shape); - - saved_mean_e.device(*place) = x_arr.mean(rdims); - auto saved_variance_arr = - (x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon; - - saved_variance_e.device(*place) = saved_variance_arr.sqrt().inverse(); - - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - Tensor scale_data; - Tensor bias_data; - if (!scale) { - scale_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &scale_data, static_cast(1)); - } - - if (!bias) { - bias_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &bias_data, static_cast(0)); - } - auto scale_e = scale - ? framework::EigenVector::Flatten(*scale) - : framework::EigenVector::Flatten( - const_cast(scale_data)); - auto scale_arr = scale_e.reshape(C_shape); - auto bias_e = bias ? 
framework::EigenVector::Flatten(*bias) - : framework::EigenVector::Flatten( - const_cast(bias_data)); - auto bias_arr = bias_e.reshape(C_shape); - - y->mutable_data(ctx.GetPlace()); - auto y_e = framework::EigenVector::Flatten(*y); - auto y_arr = y_e.reshape(shape); - - // (x - mean) * inv_std * scale + bias - Eigen::DSizes bcast_param(N, sample_size); - y_arr.device(*place) = (x_arr - saved_mean_e.broadcast(bcast)) * - saved_variance_e.broadcast(bcast) * - scale_arr.broadcast(bcast_param) + - bias_arr.broadcast(bcast_param); - } -}; - void InstanceNormGradOp::InferShape(framework::InferShapeContext *ctx) const { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormGrad"); OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input", @@ -312,120 +214,6 @@ framework::OpKernelType InstanceNormGradOp::GetExpectedKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } -template -class InstanceNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_inv_variance = ctx.Input("SavedVariance"); - - const auto &x_dims = x->dims(); - - const int N = x_dims[0]; - const int C = x_dims[1]; - const int NxC = N * C; - const int sample_size = x->numel() / N / C; - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - d_x->mutable_data(ctx.GetPlace()); - - auto &dev_ctx = ctx.template device_context(); - auto *place = dev_ctx.eigen_device(); - - Eigen::DSizes rshape(NxC, sample_size); - Eigen::DSizes param_shape(N, C); - Eigen::DSizes shape(NxC, sample_size); -#ifndef EIGEN_HAS_INDEX_LIST - Eigen::DSizes rdims(0); - Eigen::DSizes mean_rdims(1); - Eigen::DSizes bcast(1, sample_size); - Eigen::DSizes C_shape(C, 1); - Eigen::DSizes NxC_shape(NxC, 1); -#else - Eigen::IndexList> rdims; - Eigen::IndexList> mean_rdims; - Eigen::IndexList, int> bcast; - bcast.set(1, sample_size); - Eigen::IndexList> C_shape; - C_shape.set(0, C); - Eigen::IndexList> NxC_shape; - NxC_shape.set(0, NxC); -#endif - - phi::funcs::SetConstant set_constant; - - Tensor scale_data; - if (!scale) { - scale_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &scale_data, static_cast(1)); - } - - auto scale_e = scale - ? 
framework::EigenVector::Flatten(*scale) - : framework::EigenVector::Flatten( - const_cast(scale_data)); - auto mean_e = framework::EigenVector::Flatten(*saved_mean); - auto inv_var_e = framework::EigenVector::Flatten(*saved_inv_variance); - auto dy_e = framework::EigenVector::Flatten(*d_y); - auto x_e = framework::EigenVector::Flatten(*x); - - auto scale_arr = scale_e.reshape(C_shape); - auto mean_arr = mean_e.reshape(NxC_shape); - auto inv_var_arr = inv_var_e.reshape(NxC_shape); - auto dy_arr = dy_e.reshape(shape); - auto x_arr = x_e.reshape(shape); - - auto tmp = (x_arr - mean_arr.eval().broadcast(bcast)) * - inv_var_arr.eval().broadcast(bcast); - - // math: d_bias = np.sum(d_y, axis=(n,h,w)) - // math: d_scale = np.sum((X-mean) / inv_std * dy, axis=(n, h,w)) - if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, d_scale, static_cast(0)); - set_constant(dev_ctx, d_bias, static_cast(0)); - - auto d_scale_e = framework::EigenVector::Flatten(*d_scale); - auto d_scale_data = d_scale_e.reshape(C_shape); - auto d_bias_e = framework::EigenVector::Flatten(*d_bias); - auto d_bias_data = d_bias_e.reshape(C_shape); - d_bias_data.device(*place) = - dy_arr.sum(mean_rdims).reshape(param_shape).sum(rdims); - d_scale_data.device(*place) = - (tmp * dy_arr).sum(mean_rdims).reshape(param_shape).sum(rdims); - } - - auto dy_mean = - dy_arr.mean(mean_rdims).reshape(NxC_shape).eval().broadcast(bcast); - - Eigen::DSizes bcast_param(N, sample_size); - set_constant(dev_ctx, d_x, static_cast(0)); - // math: d_x = scale * inv_var * d_y - scale * inv_var * np.sum(d_y, - // axis=(h,w)) - // - scale * (X - mean) * inv_var.pow(3) * np.sum(d_y * (X - - // mean), - // axis=(h,w)) - auto dx_e = framework::EigenVector::Flatten(*d_x); - auto dx_arr = dx_e.reshape(shape); - dx_arr.device(*place) = scale_arr.broadcast(bcast_param) * - inv_var_arr.broadcast(bcast) * - (dy_arr - dy_mean - - tmp * - (dy_arr * tmp) - .mean(mean_rdims) - .reshape(NxC_shape) - .eval() - .broadcast(bcast)); - } -}; - void InstanceNormDoubleGradOp::InferShape( framework::InferShapeContext *ctx) const { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormDoubleGrad"); @@ -699,14 +487,6 @@ REGISTER_OPERATOR(instance_norm_grad, ops::InstanceNormGradOp, REGISTER_OPERATOR(instance_norm_grad_grad, ops::InstanceNormDoubleGradOp, ops::InstanceNormDoubleGradOpInplaceInferer); -REGISTER_OP_CPU_KERNEL( - instance_norm, - ops::InstanceNormKernel, - ops::InstanceNormKernel); -REGISTER_OP_CPU_KERNEL( - instance_norm_grad, - ops::InstanceNormGradKernel, - ops::InstanceNormGradKernel); REGISTER_OP_CPU_KERNEL( instance_norm_grad_grad, ops::InstanceNormDoubleGradKernel -class InstanceNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("It must be CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - - auto *x = ctx.Input("X"); - auto &x_dims = x->dims(); - PADDLE_ENFORCE_GE(x_dims.size(), 2, - platform::errors::InvalidArgument( - "The `shape` in InstanceNormOp is invalid: " - "the size of X's dimensions must greater than " - "or equal to 2. 
But received: " - "the size of X's dimensions is [%d]", - x_dims.size())); - PADDLE_ENFORCE_LE(x_dims.size(), 5, - platform::errors::InvalidArgument( - "The `shape` in InstanceNormOp is invalid: " - "the size of X's dimensions must smaller than" - "or equal to 5. But received: " - "the size of X's dimensions is [%d]", - x_dims.size())); - int N, C, H, W, D; - ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); - int NxC = N * C; - Tensor x_tmp; - x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D}); - - auto *y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t in_param_desc_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); -#else - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t in_param_desc_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); -#endif - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - - VLOG(3) << "Setting descriptors."; - std::vector dims; - std::vector strides; - dims = {1, NxC, H, W, D}; - strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; - - auto &dev_ctx = ctx.template device_context(); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, miopenBNSpatial)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); -#endif - - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - Tensor scale_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - scale_tmp.mutable_data(ctx.GetPlace()); - Tensor bias_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - bias_tmp.mutable_data(ctx.GetPlace()); - - const int n = x->numel(); - const int block = 512; - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min((NxC + block - 1) / block, max_blocks); - - phi::funcs::SetConstant set_constant; - if (scale) { - repeat_param<<>>( - scale->data(), scale_tmp.data(), N, C); - } else { - set_constant(dev_ctx, &scale_tmp, static_cast(1)); - } - if (bias) { - repeat_param<<>>( - bias->data(), bias_tmp.data(), N, C); - } else { - set_constant(dev_ctx, &bias_tmp, static_cast(0)); - } - - auto handle = dev_ctx.cudnn_handle(); - - phi::funcs::SetConstant> - functor; - - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - saved_mean->mutable_data>(ctx.GetPlace()); - saved_variance->mutable_data>(ctx.GetPlace()); - functor(dev_ctx, saved_mean, static_cast>(0)); - functor(dev_ctx, saved_variance, static_cast>(0)); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenBatchNormalizationForwardTraining( - handle, miopenBNSpatial, - const_cast( - static_cast(CudnnDataType::kOne())), - const_cast( - static_cast(CudnnDataType::kZero())), - data_desc_, static_cast(x_tmp.template data()), - data_desc_, - static_cast(y->template mutable_data(ctx.GetPlace())), - in_param_desc_, - const_cast(static_cast( - scale_tmp.template data>())), - const_cast(static_cast( - bias_tmp.template data>())), - 0, nullptr, nullptr, epsilon, - static_cast( - saved_mean->template mutable_data>( - ctx.GetPlace())), - static_cast( - saved_variance->template mutable_data>( - ctx.GetPlace())))); - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardTraining( - handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, x_tmp.template data(), - data_desc_, y->template mutable_data(ctx.GetPlace()), - in_param_desc_, scale_tmp.template data>(), - bias_tmp.template data>(), 0, nullptr, - nullptr, epsilon, - saved_mean->template mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()))); - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); -#endif - } -}; - template static __global__ void GradComputeDX(const T *dy, const BatchNormParamType *scale, @@ -297,203 +122,6 @@ static __global__ void GradComputeDX(const T *dy, } } -template -class InstanceNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - const 
auto *scale = ctx.Input("Scale"); - const auto *x = ctx.Input("X"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - - const auto &x_dims = x->dims(); - - int N, C, H, W, D; - ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); - int NxC = N * C; - - Tensor x_tmp, d_y_tmp; - x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D}); - d_y_tmp.ShareDataWith(*d_y).Resize({1, NxC, H, W, D}); - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - d_x->mutable_data(ctx.GetPlace()); - if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - } - if (scale) { - PADDLE_ENFORCE_EQ( - scale->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The `shape` in InstanceNormOp is invalid: " - "the size of scale's dimensions must be equal to 1. But " - "received: the size of scale's dimensions" - "is [%d]", - scale->dims().size())); - PADDLE_ENFORCE_EQ(scale->dims()[0], C, - platform::errors::InvalidArgument( - "The `shape` in InstanceNormOp is invalid: " - "the first dimension of scale must be equal to " - "Channels([%d]). But received: " - "the first dimension of scale is [%d]," - "the dimensions of scale is [%s], ", - C, scale->dims()[0], scale->dims())); - } - - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_constant; - - const int n = x->numel(); - const int block = 512; - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min(NxC, max_blocks); - const int grid1 = (C + block - 1) / block; - - Tensor scale_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - scale_tmp.mutable_data(ctx.GetPlace()); - Tensor d_scale_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - Tensor d_bias_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - if (scale) { - repeat_param<<>>( - scale->data(), scale_tmp.data(), N, C); - } else { - set_constant(dev_ctx, &scale_tmp, static_cast(1)); - } - - std::vector dims; - std::vector strides; - dims = {1, NxC, H, W, D}; - strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; - - if ((H * W * D) == 1) { - framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); - phi::funcs::SetConstant> - functor; - functor(dev_ctx, d_scale, static_cast>(0)); - functor(dev_ctx, d_bias, static_cast>(0)); - return; - } - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t in_param_desc_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); -#else - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t in_param_desc_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); -#endif - - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? 
x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, miopenBNSpatial)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); -#endif - - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_var = ctx.Input("SavedVariance"); - const auto *saved_mean_data = - saved_mean->template data>(); - const auto *saved_var_data = - saved_var->template data>(); - if (d_scale && d_bias) { -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenBatchNormalizationBackward( - dev_ctx.cudnn_handle(), miopenBNSpatial, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, x_tmp.template data(), - data_desc_, d_y_tmp.template data(), data_desc_, - d_x->template mutable_data(ctx.GetPlace()), in_param_desc_, - scale_tmp.template data>(), - d_scale_tmp.template mutable_data>( - ctx.GetPlace()), - d_bias_tmp.template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationBackward( - dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL, - CudnnDataType::kOne(), CudnnDataType::kZero(), - CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, - x_tmp.template data(), data_desc_, d_y_tmp.template data(), - data_desc_, d_x->template mutable_data(ctx.GetPlace()), - in_param_desc_, scale_tmp.template data>(), - d_scale_tmp.template mutable_data>( - ctx.GetPlace()), - d_bias_tmp.template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); -#endif - } else { - if (d_x) { - GradComputeDX<<>>( - d_y->data(), scale_tmp.data>(), - saved_mean_data, x->data(), saved_var_data, C, H * W * D, - d_x->data()); - } - } - - if (d_scale && d_bias) { - add_param<<>>( - d_scale_tmp.data(), d_scale->data(), N, C); - add_param<<>>( - d_bias_tmp.data(), d_bias->data(), N, C); - } - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); -#endif - } -}; - static __device__ __forceinline__ float real_sqrt(float x) { return 1. 
/ sqrtf(x); } @@ -793,22 +421,10 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; #ifdef PADDLE_WITH_HIP // MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL( - instance_norm, ops::InstanceNormKernel); -REGISTER_OP_CUDA_KERNEL( - instance_norm_grad, - ops::InstanceNormGradKernel); REGISTER_OP_CUDA_KERNEL(instance_norm_grad_grad, ops::InstanceNormDoubleGradKernel< paddle::platform::CUDADeviceContext, float>); #else -REGISTER_OP_CUDA_KERNEL( - instance_norm, ops::InstanceNormKernel, - ops::InstanceNormKernel); -REGISTER_OP_CUDA_KERNEL( - instance_norm_grad, - ops::InstanceNormGradKernel, - ops::InstanceNormGradKernel); REGISTER_OP_CUDA_KERNEL( instance_norm_grad_grad, ops::InstanceNormDoubleGradKernel +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/extensions.h" +#include "paddle/phi/kernels/funcs/math_function.h" +namespace phi { + +template +void InstanceNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& d_y, + paddle::optional scale, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + float epsilon, + DenseTensor* d_x, + DenseTensor* d_scale, + DenseTensor* d_bias) { + const auto* scale_ptr = scale.get_ptr(); + + const auto& x_dims = x.dims(); + + const int N = x_dims[0]; + const int C = x_dims[1]; + const int NxC = N * C; + const int sample_size = x.numel() / N / C; + + dev_ctx.template Alloc(d_x); + auto* place = dev_ctx.eigen_device(); + + Eigen::DSizes rshape(NxC, sample_size); + Eigen::DSizes param_shape(N, C); + Eigen::DSizes shape(NxC, sample_size); +#ifndef EIGEN_HAS_INDEX_LIST + Eigen::DSizes rdims(0); + Eigen::DSizes mean_rdims(1); + Eigen::DSizes bcast(1, sample_size); + Eigen::DSizes C_shape(C, 1); + Eigen::DSizes NxC_shape(NxC, 1); +#else + Eigen::IndexList> rdims; + Eigen::IndexList> mean_rdims; + Eigen::IndexList, int> bcast; + bcast.set(1, sample_size); + Eigen::IndexList> C_shape; + C_shape.set(0, C); + Eigen::IndexList> NxC_shape; + NxC_shape.set(0, NxC); +#endif + + phi::funcs::SetConstant set_constant; + + DenseTensor scale_data; + if (!scale_ptr) { + scale_data.Resize({C}); + dev_ctx.template Alloc(&scale_data); + set_constant(dev_ctx, &scale_data, static_cast(1)); + } + + auto scale_e = + scale_ptr + ? 
EigenVector::Flatten(*scale_ptr) + : EigenVector::Flatten(const_cast(scale_data)); + auto mean_e = EigenVector::Flatten(saved_mean); + auto inv_var_e = EigenVector::Flatten(saved_variance); + auto dy_e = EigenVector::Flatten(d_y); + auto x_e = EigenVector::Flatten(x); + + auto scale_arr = scale_e.reshape(C_shape); + auto mean_arr = mean_e.reshape(NxC_shape); + auto inv_var_arr = inv_var_e.reshape(NxC_shape); + auto dy_arr = dy_e.reshape(shape); + auto x_arr = x_e.reshape(shape); + + auto tmp = (x_arr - mean_arr.eval().broadcast(bcast)) * + inv_var_arr.eval().broadcast(bcast); + + // math: d_bias = np.sum(d_y, axis=(n,h,w)) + // math: d_scale = np.sum((X-mean) / inv_std * dy, axis=(n, h,w)) + if (d_scale && d_bias) { + dev_ctx.template Alloc(d_scale); + dev_ctx.template Alloc(d_bias); + set_constant(dev_ctx, d_scale, static_cast(0)); + set_constant(dev_ctx, d_bias, static_cast(0)); + + auto d_scale_e = EigenVector::Flatten(*d_scale); + auto d_scale_data = d_scale_e.reshape(C_shape); + auto d_bias_e = EigenVector::Flatten(*d_bias); + auto d_bias_data = d_bias_e.reshape(C_shape); + d_bias_data.device(*place) = + dy_arr.sum(mean_rdims).reshape(param_shape).sum(rdims); + d_scale_data.device(*place) = + (tmp * dy_arr).sum(mean_rdims).reshape(param_shape).sum(rdims); + } + + auto dy_mean = + dy_arr.mean(mean_rdims).reshape(NxC_shape).eval().broadcast(bcast); + + Eigen::DSizes bcast_param(N, sample_size); + set_constant(dev_ctx, d_x, static_cast(0)); + // math: d_x = scale * inv_var * d_y - scale * inv_var * np.sum(d_y, + // axis=(h,w)) + // - scale * (X - mean) * inv_var.pow(3) * np.sum(d_y * (X - + // mean), + // axis=(h,w)) + auto dx_e = EigenVector::Flatten(*d_x); + auto dx_arr = dx_e.reshape(shape); + dx_arr.device(*place) = scale_arr.broadcast(bcast_param) * + inv_var_arr.broadcast(bcast) * + (dy_arr - dy_mean - + tmp * + (dy_arr * tmp) + .mean(mean_rdims) + .reshape(NxC_shape) + .eval() + .broadcast(bcast)); +} + +} // namespace phi + +PD_REGISTER_KERNEL(instance_norm_grad, + CPU, + ALL_LAYOUT, + phi::InstanceNormGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/instance_norm_kernel.cc b/paddle/phi/kernels/cpu/instance_norm_kernel.cc new file mode 100644 index 0000000000000..f89ecba901c04 --- /dev/null +++ b/paddle/phi/kernels/cpu/instance_norm_kernel.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/instance_norm_kernel.h" + +#include +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/eigen/extensions.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void InstanceNormKernel(const Context& dev_ctx, + const DenseTensor& x, + paddle::optional scale, + paddle::optional bias, + float epsilon_f, + DenseTensor* y, + DenseTensor* saved_mean, + DenseTensor* saved_variance) { + const auto& x_dims = x.dims(); + T epsilon = static_cast(epsilon_f); + const int N = x_dims[0]; + const int C = x_dims[1]; + const int NxC = N * C; + const int sample_size = x.numel() / N / C; + auto* place = dev_ctx.eigen_device(); + + Eigen::DSizes shape(NxC, sample_size); +// Once eigen on Windows is updated, the if branch can be removed. +#ifndef EIGEN_HAS_INDEX_LIST + Eigen::DSizes bcast(1, sample_size); + Eigen::DSizes C_shape(C, 1); + Eigen::DSizes NxC_shape(NxC, 1); + Eigen::DSizes rdims(1); +#else + Eigen::IndexList, int> bcast; + bcast.set(1, sample_size); + Eigen::IndexList> C_shape; + C_shape.set(0, C); + Eigen::IndexList> NxC_shape; + NxC_shape.set(0, NxC); + Eigen::IndexList> rdims; +#endif + + phi::funcs::SetConstant set_constant; + dev_ctx.template Alloc(saved_mean); + dev_ctx.template Alloc(saved_variance); + set_constant(dev_ctx, saved_mean, static_cast(0)); + set_constant(dev_ctx, saved_variance, static_cast(0)); + + auto saved_mean_a = EigenVector::Flatten(*saved_mean); + auto saved_mean_e = saved_mean_a.reshape(NxC_shape); + auto saved_variance_a = EigenVector::Flatten(*saved_variance); + auto saved_variance_e = saved_variance_a.reshape(NxC_shape); + + auto x_e = EigenVector::Flatten(x); + auto x_arr = x_e.reshape(shape); + + saved_mean_e.device(*place) = x_arr.mean(rdims); + auto saved_variance_arr = + (x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon; + + saved_variance_e.device(*place) = saved_variance_arr.sqrt().inverse(); + + const auto scale_ptr = scale.get_ptr(); + const auto bias_ptr = bias.get_ptr(); + + DenseTensor scale_data; + DenseTensor bias_data; + if (!scale_ptr) { + scale_data.Resize({C}); + dev_ctx.template Alloc(&scale_data); + set_constant(dev_ctx, &scale_data, static_cast(1)); + } + + if (!bias_ptr) { + bias_data.Resize({C}); + dev_ctx.template Alloc(&bias_data); + set_constant(dev_ctx, &bias_data, static_cast(0)); + } + auto scale_e = + scale_ptr + ? EigenVector::Flatten(*scale_ptr) + : EigenVector::Flatten(const_cast(scale_data)); + auto scale_arr = scale_e.reshape(C_shape); + auto bias_e = + bias_ptr + ? 
EigenVector::Flatten(*bias_ptr) + : EigenVector::Flatten(const_cast(bias_data)); + auto bias_arr = bias_e.reshape(C_shape); + + dev_ctx.template Alloc(y); + auto y_e = EigenVector::Flatten(*y); + auto y_arr = y_e.reshape(shape); + + // (x - mean) * inv_std * scale + bias + Eigen::DSizes bcast_param(N, sample_size); + y_arr.device(*place) = (x_arr - saved_mean_e.broadcast(bcast)) * + saved_variance_e.broadcast(bcast) * + scale_arr.broadcast(bcast_param) + + bias_arr.broadcast(bcast_param); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + instance_norm, CPU, ALL_LAYOUT, phi::InstanceNormKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu new file mode 100644 index 0000000000000..15c9c30626593 --- /dev/null +++ b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu @@ -0,0 +1,319 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/instance_norm_grad_kernel.h" + +#include "paddle/fluid/operators/norm_utils.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/instance_norm_utils.h" + +namespace phi { + +template +static __global__ void GradComputeDX(const T *dy, + const BatchNormParamType *scale, + const BatchNormParamType *mean, + const T *x, + const BatchNormParamType *variance, + const int C, + const int sample_size, + T *dx) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + + BatchNormParamType mean_val = mean[ncid]; + BatchNormParamType inv_var_val = variance[ncid]; + + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; + __shared__ BatchNormParamType dy_sum_val; + __shared__ BatchNormParamType dy_x_sub_mean_sum_val; + + BatchNormParamType dy_sum = static_cast>(0); + BatchNormParamType dy_x_sub_mean_sum = + static_cast>(0); + + for (int i = beg_idx; i < end_idx; i += BlockDim) { + BatchNormParamType dy_i = static_cast>(dy[i]); + dy_sum += dy_i; + dy_x_sub_mean_sum += + dy_i * (static_cast>(x[i]) - mean_val); + } + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + dy_x_sub_mean_sum = + BlockReduce(dy_x_sub_mean_storage).Reduce(dy_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; + } + __syncthreads(); + + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dx[i] = + (static_cast>(dy[i]) - + dy_sum_val / static_cast>(sample_size) - + (static_cast>(x[i]) - mean_val) * + dy_x_sub_mean_sum_val * inv_var_val * inv_var_val / sample_size) * + scale[c] * inv_var_val; + } +} + 
+template +void InstanceNormGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &d_y, + paddle::optional scale, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + float epsilon_f, + DenseTensor *d_x, + DenseTensor *d_scale, + DenseTensor *d_bias) { + double epsilon = static_cast(epsilon_f); + const auto *scale_ptr = scale.get_ptr(); + + const auto &x_dims = x.dims(); + + int N, C, H, W, D; + paddle::operators::ExtractNCWHD( + x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + + DenseTensor x_tmp, d_y_tmp; + x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D}); + d_y_tmp.ShareDataWith(d_y).Resize({1, NxC, H, W, D}); + + dev_ctx.template Alloc(d_x); + if (d_scale && d_bias) { + dev_ctx.template Alloc(d_scale); + dev_ctx.template Alloc(d_bias); + } + if (scale_ptr) { + PADDLE_ENFORCE_EQ( + scale_ptr->dims().size(), + 1UL, + phi::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the size of scale's dimensions must be equal to 1. But " + "received: the size of scale's dimensions" + "is [%d]", + scale_ptr->dims().size())); + PADDLE_ENFORCE_EQ(scale_ptr->dims()[0], + C, + phi::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the first dimension of scale must be equal to " + "Channels([%d]). But received: " + "the first dimension of scale is [%d]," + "the dimensions of scale is [%s], ", + C, + scale_ptr->dims()[0], + scale_ptr->dims())); + } + + phi::funcs::SetConstant set_constant; + + const int n = x.numel(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min(NxC, max_blocks); + const int grid1 = (C + block - 1) / block; + + DenseTensor scale_tmp; + scale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&scale_tmp); + + DenseTensor d_scale_tmp; + d_scale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&d_scale_tmp); + + DenseTensor d_bias_tmp; + d_bias_tmp.Resize({NxC}); + dev_ctx.template Alloc(&d_bias_tmp); + + if (scale_ptr) { + repeat_param<<>>( + scale_ptr->data(), scale_tmp.data(), N, C); + } else { + set_constant(dev_ctx, &scale_tmp, static_cast(1)); + } + + std::vector dims; + std::vector strides; + dims = {1, NxC, H, W, D}; + strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; + + if ((H * W * D) == 1) { + phi::Copy(dev_ctx, d_y, dev_ctx.GetPlace(), false, d_x); + phi::funcs::SetConstant> functor; + functor(dev_ctx, d_scale, static_cast>(0)); + functor(dev_ctx, d_bias, static_cast>(0)); + return; + } + +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t data_desc_; + miopenTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); +#else + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); +#endif + + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. 
Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenSetTensorDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, + const_cast(dims.data()), + const_cast(strides.data()))); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, miopenBNSpatial)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, + dims.data(), + strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); +#endif + + const auto *saved_mean_data = + saved_mean.template data>(); + const auto *saved_var_data = + saved_variance.template data>(); + if (d_scale && d_bias) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenBatchNormalizationBackward( + dev_ctx.cudnn_handle(), + miopenBNSpatial, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + x_tmp.template data(), + data_desc_, + d_y_tmp.template data(), + data_desc_, + d_x->template data(), + in_param_desc_, + scale_tmp.template data>(), + d_scale_tmp.template data>(), + d_bias_tmp.template data>(), + epsilon, + saved_mean_data, + saved_var_data)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationBackward( + dev_ctx.cudnn_handle(), + CUDNN_BATCHNORM_SPATIAL, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + x_tmp.template data(), + data_desc_, + d_y_tmp.template data(), + data_desc_, + d_x->template data(), + in_param_desc_, + scale_tmp.template data>(), + d_scale_tmp.template data>(), + d_bias_tmp.template data>(), + epsilon, + saved_mean_data, + saved_var_data)); +#endif + } else { + if (d_x) { + GradComputeDX<<>>( + d_y.data(), + scale_tmp.data>(), + saved_mean_data, + x.data(), + saved_var_data, + C, + H * W * D, + d_x->data()); + } + } + + if (d_scale && d_bias) { + add_param<<>>( + d_scale_tmp.data(), d_scale->data(), N, C); + add_param<<>>( + d_bias_tmp.data(), d_bias->data(), N, C); + } + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); +#endif +} +} // namespace phi + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL( + instance_norm_grad, GPU, ALL_LAYOUT, phi::InstanceNormGradKernel, float) {} +#else +PD_REGISTER_KERNEL(instance_norm_grad, + GPU, + ALL_LAYOUT, + phi::InstanceNormGradKernel, + float, + double) {} +#endif diff --git a/paddle/phi/kernels/gpu/instance_norm_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_kernel.cu new file mode 100644 index 0000000000000..cf8f0fb78788c --- /dev/null +++ b/paddle/phi/kernels/gpu/instance_norm_kernel.cu @@ -0,0 +1,221 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/instance_norm_kernel.h" + +#include "paddle/fluid/operators/norm_utils.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/instance_norm_utils.h" + +namespace phi { + +template +void InstanceNormKernel(const Context &dev_ctx, + const DenseTensor &x, + paddle::optional scale, + paddle::optional bias, + float epsilon_f, + DenseTensor *y, + DenseTensor *saved_mean, + DenseTensor *saved_variance) { + double epsilon = static_cast(epsilon_f); + auto &x_dims = x.dims(); + PADDLE_ENFORCE_GE(x_dims.size(), + 2, + phi::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the size of X's dimensions must greater than " + "or equal to 2. But received: " + "the size of X's dimensions is [%d]", + x_dims.size())); + PADDLE_ENFORCE_LE(x_dims.size(), + 5, + phi::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the size of X's dimensions must smaller than" + "or equal to 5. But received: " + "the size of X's dimensions is [%d]", + x_dims.size())); + int N, C, H, W, D; + paddle::operators::ExtractNCWHD( + x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + DenseTensor x_tmp; + x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D}); + dev_ctx.template Alloc(y); + +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t data_desc_; + miopenTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); +#else + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); +#endif + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + + VLOG(3) << "Setting descriptors."; + std::vector dims; + std::vector strides; + dims = {1, NxC, H, W, D}; + strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenSetTensorDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, + const_cast(dims.data()), + const_cast(strides.data()))); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, miopenBNSpatial)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, + dims.data(), + strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); +#endif + + const auto scale_ptr = scale.get_ptr(); + const auto bias_ptr = bias.get_ptr(); + + DenseTensor scale_tmp; + scale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&scale_tmp); + DenseTensor bias_tmp; + bias_tmp.Resize({NxC}); + dev_ctx.template Alloc(&bias_tmp); + + const int n = x.numel(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min((NxC + block - 1) / block, max_blocks); + + phi::funcs::SetConstant set_constant; + if (scale_ptr) { + repeat_param<<>>( + scale_ptr->data(), scale_tmp.data(), N, C); + } else { + set_constant(dev_ctx, &scale_tmp, static_cast(1)); + } + if (bias_ptr) { + repeat_param<<>>( + bias_ptr->data(), bias_tmp.data(), N, C); + } else { + set_constant(dev_ctx, &bias_tmp, static_cast(0)); + } + + auto handle = dev_ctx.cudnn_handle(); + + phi::funcs::SetConstant> functor; + dev_ctx.template Alloc>(saved_mean); + dev_ctx.template Alloc>(saved_variance); + functor(dev_ctx, saved_mean, static_cast>(0)); + functor(dev_ctx, saved_variance, static_cast>(0)); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenBatchNormalizationForwardTraining( + handle, + miopenBNSpatial, + const_cast( + static_cast(CudnnDataType::kOne())), + const_cast( + static_cast(CudnnDataType::kZero())), + data_desc_, + static_cast(x_tmp.template data()), + data_desc_, + static_cast(y->template data()), + in_param_desc_, + const_cast(static_cast( + scale_tmp.template data>())), + const_cast(static_cast( + bias_tmp.template data>())), + 0, + nullptr, + nullptr, + epsilon, + static_cast( + saved_mean->template data>()), + static_cast( + saved_variance->template data>()))); + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationForwardTraining( + handle, + CUDNN_BATCHNORM_SPATIAL, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + x_tmp.template data(), + data_desc_, + y->template data(), + in_param_desc_, + scale_tmp.template data>(), + bias_tmp.template data>(), + 0, + nullptr, + nullptr, + epsilon, + saved_mean->template data>(), + saved_variance->template data>())); + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); +#endif +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL( + instance_norm, GPU, ALL_LAYOUT, phi::InstanceNormKernel, float) {} +#else +PD_REGISTER_KERNEL( + instance_norm, GPU, ALL_LAYOUT, phi::InstanceNormKernel, float, double) {} +#endif diff --git a/paddle/phi/kernels/gpu/instance_norm_utils.h 
b/paddle/phi/kernels/gpu/instance_norm_utils.h new file mode 100644 index 0000000000000..50dfe4ad222c0 --- /dev/null +++ b/paddle/phi/kernels/gpu/instance_norm_utils.h @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + +namespace phi { + +template +using CudnnDataType = paddle::platform::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; + +template +static __global__ void repeat_param(const T *input, + T *output, + const int repeat_num, + const int C) { + CUDA_KERNEL_LOOP(i, repeat_num * C) { + int index = i % C; + output[i] = input[index]; + } +} + +template +static __global__ void add_param(const T *input, + T *output, + const int repeat_num, + const int C) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ou_storage; + for (int i = blockIdx.x; i < C; i += gridDim.x) { + T ou = static_cast(0); + for (int j = threadIdx.x; j < repeat_num; j += blockDim.x) { + const int index = j * C + i; + ou += static_cast(input[index]); + } + ou = BlockReduce(ou_storage).Reduce(ou, cub::Sum()); + if (threadIdx.x == 0) { + output[i] = ou; + } + __syncthreads(); + + if (AVG) { + output[i] /= repeat_num; + } + } +} +} // namespace phi diff --git a/paddle/phi/kernels/instance_norm_grad_kernel.h b/paddle/phi/kernels/instance_norm_grad_kernel.h new file mode 100644 index 0000000000000..041302a7cfb67 --- /dev/null +++ b/paddle/phi/kernels/instance_norm_grad_kernel.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void InstanceNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y_grad, + paddle::optional scale, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + float epsilon, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/instance_norm_kernel.h b/paddle/phi/kernels/instance_norm_kernel.h new file mode 100644 index 0000000000000..8c50025a73ce0 --- /dev/null +++ b/paddle/phi/kernels/instance_norm_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void InstanceNormKernel(const Context& dev_ctx, + const DenseTensor& x, + paddle::optional scale, + paddle::optional bias, + float epsilon, + DenseTensor* y, + DenseTensor* saved_mean, + DenseTensor* saved_variance); + +} // namespace phi diff --git a/paddle/phi/ops/compat/instance_norm_sig.cc b/paddle/phi/ops/compat/instance_norm_sig.cc new file mode 100644 index 0000000000000..b65e84588db13 --- /dev/null +++ b/paddle/phi/ops/compat/instance_norm_sig.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature InstanceNormOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("instance_norm", + {"X", "Scale", "Bias"}, + {"epsilon"}, + {"Y", "SavedMean", "SavedVariance"}); +} + +KernelSignature InstanceNormGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("instance_norm_grad", + {"X", "Y@GRAD", "Scale", "SavedMean", "SavedVariance"}, + {"epsilon"}, + {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}); +} +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(instance_norm, phi::InstanceNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(instance_norm_grad, + phi::InstanceNormGradOpArgumentMapping); From 18323a463ae57447922207a9c8433dc81db5b330 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Thu, 26 May 2022 16:12:58 +0800 Subject: [PATCH 045/109] fix protobuf error (#43009) --- python/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/requirements.txt b/python/requirements.txt index 4192c6b3d777a..74f2c2b9401aa 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,6 +1,6 @@ requests>=2.20.0 numpy>=1.13 -protobuf>=3.1.0 +protobuf>=3.1.0, <=3.20.0 Pillow six decorator From eb15e9a7aa51cfda6441b0648efdc3db76b4546d Mon Sep 17 00:00:00 2001 From: zhupengyang Date: Thu, 26 May 2022 16:57:24 +0800 Subject: [PATCH 046/109] enhance yolo_box_fuse_pass (#42926) --- paddle/fluid/framework/ir/yolo_box_fuse_pass.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc b/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc index c974d334a8de0..20075a49749f7 100644 --- a/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc +++ b/paddle/fluid/framework/ir/yolo_box_fuse_pass.cc @@ -199,9 +199,11 @@ void YoloBoxFusePass::ApplyImpl(ir::Graph* graph) const { GET_IR_NODE(nms_out_rois_num); #undef GET_IR_NODE + auto* block = yolo_box0->Op()->Block(); + // create yolo_box_head #define CREATE_YOLO_BOX_HEAD(idx_) \ - framework::OpDesc yolo_box_head##idx_##_op_desc; \ + framework::OpDesc yolo_box_head##idx_##_op_desc(block); \ yolo_box_head##idx_##_op_desc.SetType("yolo_box_head"); \ yolo_box_head##idx_##_op_desc.SetInput("X", \ {yolo_box##idx_##_in_x->Name()}); \ @@ -222,7 +224,7 @@ void YoloBoxFusePass::ApplyImpl(ir::Graph* graph) const { #undef CREATE_YOLO_BOX_HEAD // create yolo_box_post - framework::OpDesc yolo_box_post_op_desc; + framework::OpDesc yolo_box_post_op_desc(block); yolo_box_post_op_desc.SetType("yolo_box_post"); yolo_box_post_op_desc.SetInput("Boxes0", {yolo_box0_out_boxes->Name()}); yolo_box_post_op_desc.SetInput("Boxes1", {yolo_box1_out_boxes->Name()}); From 6af32a7fe57095619021d202ffbba37337fc5f19 Mon Sep 17 00:00:00 2001 From: yaoxuefeng Date: Thu, 26 May 2022 17:02:10 +0800 Subject: [PATCH 047/109] delete id 0 (#42951) delete id 0 in gpups --- .../fleet/heter_ps/hashtable_kernel.cu | 21 ++++++++++++++++++- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 3 --- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 6b0141f546c66..57741c2c19b1c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -106,6 +106,23 @@ __global__ void dy_mf_search_kernel(Table* table, for (int j = 0; j < cur->mf_dim + 1; ++j) { cur->mf[j] = 
input.mf[j]; } + } else { + if (keys[i] != 0) { + printf("warning::pull miss key: %d", keys[i]); + } + FeatureValue* cur = (FeatureValue*)(vals + i * pull_feature_value_size); + cur->delta_score = 0; + cur->show = 0; + cur->clk = 0; + cur->slot = -1; + cur->lr = 0; + cur->lr_g2sum = 0; + cur->mf_size = 0; + cur->mf_dim = 8; + cur->cpu_ptr; + for (int j = 0; j < cur->mf_dim + 1; j++) { + cur->mf[j] = 0; + } } } } @@ -138,7 +155,9 @@ __global__ void dy_mf_update_kernel(Table* table, FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size); sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, *cur); } else { - printf("warning: push miss key: %d", keys[i]); + if (keys[i] != 0) { + printf("warning::push miss key: %d", keys[i]); + } } } } diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index ac08e37aec1fc..65892f8488475 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -239,9 +239,6 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds."; for (int i = 0; i < thread_keys_shard_num_; i++) { for (int j = 0; j < multi_mf_dim_; j++) { - if (i == 0 && j == multi_mf_dim_ - 1) { - gpu_task->feature_dim_keys_[i][j].push_back(0); - } VLOG(0) << "GpuPs shard: " << i << "mf dim: " << index_dim_vec_[j] << " key len: " << gpu_task->feature_dim_keys_[i][j].size(); gpu_task->value_dim_ptr_[i][j].resize( From b2b78cd416f8bd7d27cf3a18fccc8bf6d6f56cb5 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 26 May 2022 21:31:07 +0800 Subject: [PATCH 048/109] move instance_norm_double_grad (#43021) --- paddle/fluid/operators/instance_norm_op.cc | 367 +-------------- paddle/fluid/operators/instance_norm_op.cu | 434 ------------------ paddle/fluid/operators/instance_norm_op.h | 35 -- paddle/phi/infermeta/backward.cc | 57 +++ paddle/phi/infermeta/backward.h | 24 + paddle/phi/infermeta/ternary.cc | 105 +++++ paddle/phi/infermeta/ternary.h | 9 + .../kernels/cpu/instance_norm_grad_kernel.cc | 202 ++++++++ paddle/phi/kernels/funcs/norm_utils.h | 46 ++ .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 4 +- .../kernels/gpu/instance_norm_grad_kernel.cu | 341 +++++++++++++- .../phi/kernels/gpu/instance_norm_kernel.cu | 5 +- .../phi/kernels/instance_norm_grad_kernel.h | 15 + paddle/phi/ops/compat/instance_norm_sig.cc | 18 + 15 files changed, 832 insertions(+), 834 deletions(-) delete mode 100644 paddle/fluid/operators/instance_norm_op.cu create mode 100644 paddle/phi/kernels/funcs/norm_utils.h diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index 2cbd48cf093e2..de92de453a354 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -17,93 +17,16 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/ternary.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { -void InstanceNormOp::InferShape(framework::InferShapeContext *ctx) const { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNorm"); - OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "InstanceNorm"); - OP_INOUT_CHECK(ctx->HasOutput("SavedMean"), "Output", "SavedMean", - "InstanceNorm"); - OP_INOUT_CHECK(ctx->HasOutput("SavedVariance"), "Output", "SavedVariance", - "InstanceNorm"); - - const auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_NE(phi::product(x_dims), 0, - platform::errors::PreconditionNotMet( - "The Input variable X(%s) has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.", - ctx->Inputs("X").front())); - PADDLE_ENFORCE_GE( - x_dims.size(), 2, - platform::errors::InvalidArgument( - "ShapeError: the dimension of input X must " - "greater than or equal to 2. But received: the shape of input " - "X = [%s], the dimension of input X =[%d]", - x_dims, x_dims.size())); - PADDLE_ENFORCE_LE( - x_dims.size(), 5, - platform::errors::InvalidArgument( - "ShapeError: the dimension of input X must " - "smaller than or equal to 5, But received: the shape of input " - "X = [%s], the dimension of input X = [%d]", - x_dims, x_dims.size())); - auto N = x_dims[0]; - auto C = x_dims[1]; - auto NxC = N * C; - - if (ctx->HasInput("Scale")) { - auto scale_dim = ctx->GetInputDim("Scale"); - - PADDLE_ENFORCE_EQ( - scale_dim.size(), 1UL, - platform::errors::InvalidArgument( - "ShapeError: the dimension of scale must equal to 1." - "But received: the shape of scale is [%s], the dimension " - "of scale is [%d]", - scale_dim, scale_dim.size())); - - bool check = !((!ctx->IsRuntime()) && (phi::product(scale_dim) <= 0)); - - if (check) { - PADDLE_ENFORCE_EQ(scale_dim[0], C, - platform::errors::InvalidArgument( - "ShapeError: the shape of scale must equal to [%d]" - "But received: the shape of scale is [%d]", - C, scale_dim[0])); - } - } - if (ctx->HasInput("Bias")) { - auto bias_dim = ctx->GetInputDim("Bias"); - PADDLE_ENFORCE_EQ( - bias_dim.size(), 1UL, - platform::errors::InvalidArgument( - "ShapeError: the dimension of bias must equal to 1." 
- "But received: the shape of bias is [%s],the dimension " - "of bias is [%d]", - bias_dim, bias_dim.size())); - - bool check = !((!ctx->IsRuntime()) && (phi::product(bias_dim) <= 0)); - if (check) { - PADDLE_ENFORCE_EQ(bias_dim[0], C, - platform::errors::InvalidArgument( - "ShapeError: the shape of bias must equal to [%d]" - "But received: the shape of bias is [%d]", - C, bias_dim[0])); - } - } - - ctx->SetOutputDim("Y", x_dims); - ctx->SetOutputDim("SavedMean", {NxC}); - ctx->SetOutputDim("SavedVariance", {NxC}); - ctx->ShareLoD("X", "Y"); -} - framework::OpKernelType InstanceNormOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); @@ -170,29 +93,6 @@ NCHW `[batch, in_channels, in_height, in_width]` )DOC"); } -void InstanceNormGradOp::InferShape(framework::InferShapeContext *ctx) const { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input", - framework::GradVarName("Y"), "InstanceNormGrad"); - OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean", - "InstanceNormGrad"); - OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance", - "InstanceNormGrad"); - - // check output - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", - framework::GradVarName("X"), "InstanceNormGrad"); - const auto x_dims = ctx->GetInputDim("X"); - const int C = x_dims[1]; - ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - if (ctx->HasOutput(framework::GradVarName("Scale"))) { - ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); - } - if (ctx->HasOutput(framework::GradVarName("Bias"))) { - ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); - } -} - framework::OpKernelType InstanceNormGradOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { const auto *var = ctx.InputVar(framework::GradVarName("Y")); @@ -214,34 +114,6 @@ framework::OpKernelType InstanceNormGradOp::GetExpectedKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } -void InstanceNormDoubleGradOp::InferShape( - framework::InferShapeContext *ctx) const { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormDoubleGrad"); - OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean", - "InstanceNormDoubleGrad"); - OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance", - "InstanceNormDoubleGrad"); - OP_INOUT_CHECK(ctx->HasInput("DDX"), "Input", "DDX", - "InstanceNormDoubleGrad"); - OP_INOUT_CHECK(ctx->HasInput("DY"), "Input", "DY", "InstanceNormDoubleGrad"); - - // check output - OP_INOUT_CHECK(ctx->HasOutput("DX"), "Output", "DX", - "InstanceNormDoubleGrad"); - - const auto x_dims = ctx->GetInputDim("X"); - const int C = x_dims[1]; - if (ctx->HasOutput("DX")) { - ctx->SetOutputDim("DX", x_dims); - } - if (ctx->HasOutput("DScale")) { - ctx->SetOutputDim("DScale", {C}); - } - if (ctx->HasOutput("DDY")) { - ctx->ShareDim("X", "DDY"); - } -} - framework::OpKernelType InstanceNormDoubleGradOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { const auto *var = ctx.InputVar("DY"); @@ -263,213 +135,6 @@ framework::OpKernelType InstanceNormDoubleGradOp::GetExpectedKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } -template -class InstanceNormDoubleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *X = 
ctx.Input("X"); - const auto *Scale = ctx.Input("Scale"); - const auto *dY = ctx.Input("DY"); - const auto *Saved_mean = ctx.Input("SavedMean"); - const auto *Saved_variance = ctx.Input("SavedVariance"); - const auto *ddX = ctx.Input("DDX"); - const auto *ddScale = ctx.Input("DDScale"); - const auto *ddBias = ctx.Input("DDBias"); - - auto *dX = ctx.Output("DX"); - auto *dScale = ctx.Output("DScale"); - auto *ddY = ctx.Output("DDY"); - - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_constant; - - const auto &x_dims = X->dims(); - int N, C, H, W, D; - ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); - const int sample_size = X->numel() / N / C; - const int NxC = N * C; - - const T *mean_data = Saved_mean->data(); - const T *inv_var_data = Saved_variance->data(); - Tensor mean_tensor; - Tensor inv_var_tensor; - ConstEigenArrayMap x_arr(X->data(), sample_size, NxC); - ConstEigenVectorArrayMap mean_arr(mean_data, NxC); - ConstEigenVectorArrayMap inv_var_arr(inv_var_data, NxC); - - Tensor mean_tile; - mean_tile.Resize({sample_size, NxC}); - mean_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap mean_tile_data(mean_tile.mutable_data(ctx.GetPlace()), - sample_size, NxC); - - Tensor inv_var_tile; - inv_var_tile.Resize({sample_size, NxC}); - inv_var_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap inv_var_tile_data( - inv_var_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); - - mean_tile_data = mean_arr.transpose().replicate(sample_size, 1); - inv_var_tile_data = inv_var_arr.transpose().replicate(sample_size, 1); - - Tensor Scale_data; - if (!Scale) { - Scale_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &Scale_data, static_cast(1)); - } - ConstEigenVectorArrayMap scale_arr( - Scale ? Scale->data() : Scale_data.data(), C); - - Tensor scale_tile; - scale_tile.Resize({sample_size, NxC}); - scale_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap scale_tile_data(scale_tile.mutable_data(ctx.GetPlace()), - sample_size, NxC); - scale_tile_data = scale_arr.transpose().replicate(sample_size, N); - - ConstEigenArrayMap dy_arr(dY->data(), sample_size, NxC); - ConstEigenArrayMap ddx_arr(ddX->data(), sample_size, NxC); - - // math: dx = scale * ((x - mean) * inv_var / HxW * (np.mean(ddx, - // axis=(h,w)) * - // np.sum(dy, axis=(h,w)) - - // np.sum(dy * ddx, axis=(h,w)) + 3 * np.mean(dy * (x - mean), - // axis=(h,w)) * inv_var.pow(2) * - // np.sum(ddx * (x - mean), axis=(h,w))) + inv_var.pow(3) / HxW * - // np.sum(ddx * (x - mean)) * - // (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW * - // np.sum(dy, - // axis=(h,w)) * (x - mean) * - // (np.mean(ddx, axis=(h,w)) - ddx)) + ddr * (dy * inv_var - - // inv_var * - // np.mean(dy, axis=(h,w)) - - // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), - // axis=(h,w))) - - Tensor x_sub_mean_mul_invstd; - x_sub_mean_mul_invstd.Resize({sample_size, NxC}); - x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()); - EigenArrayMap x_sub_mean_mul_invstd_arr( - x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()), sample_size, - NxC); - x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data; - - if (dX) { - dX->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, dX, static_cast(0)); - EigenArrayMap dx_arr(dX->mutable_data(ctx.GetPlace()), sample_size, - NxC); - - if (ddX) { - dx_arr += - x_sub_mean_mul_invstd_arr * inv_var_tile_data * inv_var_tile_data / - sample_size * - (ddx_arr.colwise().sum() * dy_arr.colwise().sum() / sample_size - - (dy_arr * ddx_arr).colwise().sum() + - 3. 
* (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() * - (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / - sample_size); - - dx_arr += (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / - sample_size * inv_var_tile_data * inv_var_tile_data * - (dy_arr.colwise().sum() / sample_size - dy_arr); - - dx_arr += (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / - sample_size * inv_var_tile_data * inv_var_tile_data * - (ddx_arr.colwise().sum() / sample_size - ddx_arr); - - dx_arr = scale_tile_data * dx_arr; - } - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - - Tensor ddscale_tile; - ddscale_tile.Resize({sample_size, NxC}); - ddscale_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); - ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N); - - dx_arr += (dy_arr * inv_var_tile_data - - dy_arr.colwise().sum() / sample_size * inv_var_tile_data - - x_sub_mean_mul_invstd_arr * inv_var_tile_data * - (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / - sample_size) * - ddscale_tile_data; - } - } - if (dScale) { - // math: dscale = inv_var * (dy - np.mean(dy, axis=(h,w) - (x-mean) * - // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(h,w)))) * ddx - dScale->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, dScale, static_cast(0)); - EigenVectorArrayMap dscale_arr(dScale->mutable_data(ctx.GetPlace()), - C); - if (ddX) { - Tensor first_grad; - first_grad.Resize({sample_size, NxC}); - first_grad.mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, &first_grad, static_cast(0)); - EigenArrayMap first_grad_arr( - first_grad.mutable_data(ctx.GetPlace()), sample_size, NxC); - - first_grad_arr += - inv_var_tile_data * - (dy_arr - - dy_arr.colwise().sum().replicate(sample_size, 1) / sample_size - - x_sub_mean_mul_invstd_arr * - (dy_arr * x_sub_mean_mul_invstd_arr) - .colwise() - .sum() - .replicate(sample_size, 1) / - sample_size); - first_grad_arr = first_grad_arr * ddx_arr; - for (int nc = 0; nc < NxC; ++nc) { - int c = nc % C; - dscale_arr(c) += first_grad_arr.colwise().sum()(nc); - } - } - } - if (ddY) { - // math: ddy = (x - mean) * inv_var * ddscale + ddbias + - // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * - // np.mean(ddx * (x - mean), axis=(h,w))) - ddY->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, ddY, static_cast(0)); - EigenArrayMap ddy_arr(ddY->mutable_data(ctx.GetPlace()), - sample_size, NxC); - if (ddX) { - ddy_arr += scale_tile_data * inv_var_tile_data * - (ddx_arr - ddx_arr.colwise().sum() / sample_size - - x_sub_mean_mul_invstd_arr * - (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / - sample_size); - } - if (ddScale && ddBias) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({sample_size, NxC}); - ddscale_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); - ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N); - - ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); - Tensor ddbias_tile; - ddbias_tile.Resize({sample_size, NxC}); - ddbias_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap ddbias_tile_data( - ddbias_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); - ddbias_tile_data = ddbias_arr.transpose().replicate(sample_size, N); - - ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; - ddy_arr += ddbias_tile_data; - } - } - } -}; - 
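For reference, the relations these second-order formulas differentiate are the per-instance statistics and first-order gradients of instance normalization. Writing $m = H \times W$ for the spatial size of one $(n, c)$ slice, $\mathrm{inv\_var} = 1/\sqrt{\sigma_{nc}^2 + \epsilon}$ and $\hat{x}_{nci} = (x_{nci} - \mu_{nc})\,\mathrm{inv\_var}$:

    \mu_{nc} = \frac{1}{m} \sum_i x_{nci}, \qquad
    \sigma_{nc}^2 = \frac{1}{m} \sum_i (x_{nci} - \mu_{nc})^2, \qquad
    y_{nci} = \gamma_c \hat{x}_{nci} + \beta_c

    \frac{\partial L}{\partial \beta_c} = \sum_{n,i} dy_{nci}, \qquad
    \frac{\partial L}{\partial \gamma_c} = \sum_{n,i} dy_{nci}\,\hat{x}_{nci}

    \frac{\partial L}{\partial x_{nci}} = \gamma_c\,\mathrm{inv\_var}\Big(dy_{nci} - \frac{1}{m}\sum_j dy_{ncj} - \hat{x}_{nci}\,\frac{1}{m}\sum_j dy_{ncj}\,\hat{x}_{ncj}\Big)

The dx, dscale and ddy expressions in the comments follow from differentiating these relations once more along the (ddx, ddscale, ddbias) directions.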
DECLARE_INPLACE_OP_INFERER(InstanceNormDoubleGradOpInplaceInferer, {"DY", "DDY"}); @@ -477,22 +142,26 @@ DECLARE_INPLACE_OP_INFERER(InstanceNormDoubleGradOpInplaceInferer, } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(instance_norm, InstanceNormInferShapeFunctor, + PD_INFER_META(phi::InstanceNormInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(instance_norm_grad, + InstanceNormGradInferShapeFunctor, + PD_INFER_META(phi::InstanceNormGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR( + instance_norm_grad_grad, InstanceNormDoubleGradInferShapeFunctor, + PD_INFER_META(phi::InstanceNormDoubleGradInferMeta)); REGISTER_OPERATOR(instance_norm, ops::InstanceNormOp, ops::InstanceNormOpMaker, ops::InstanceNormOpInferVarType, ops::InstanceNormGradMaker, - ops::InstanceNormGradMaker); + ops::InstanceNormGradMaker, + InstanceNormInferShapeFunctor); REGISTER_OPERATOR(instance_norm_grad, ops::InstanceNormGradOp, ops::InstanceNormDoubleGradMaker, - ops::InstanceNormDoubleGradMaker); + ops::InstanceNormDoubleGradMaker, + InstanceNormGradInferShapeFunctor); REGISTER_OPERATOR(instance_norm_grad_grad, ops::InstanceNormDoubleGradOp, - ops::InstanceNormDoubleGradOpInplaceInferer); - -REGISTER_OP_CPU_KERNEL( - instance_norm_grad_grad, - ops::InstanceNormDoubleGradKernel, - ops::InstanceNormDoubleGradKernel); + ops::InstanceNormDoubleGradOpInplaceInferer, + InstanceNormDoubleGradInferShapeFunctor); REGISTER_OP_VERSION(instance_norm) .AddCheckpoint( diff --git a/paddle/fluid/operators/instance_norm_op.cu b/paddle/fluid/operators/instance_norm_op.cu deleted file mode 100644 index 192422429371b..0000000000000 --- a/paddle/fluid/operators/instance_norm_op.cu +++ /dev/null @@ -1,434 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/operators/instance_norm_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using DataLayout = framework::DataLayout; -template -using CudnnDataType = platform::CudnnDataType; -template -using BatchNormParamType = typename CudnnDataType::BatchNormParamType; - -template -static __global__ void repeat_param(const T *input, T *output, - const int repeat_num, const int C) { - CUDA_KERNEL_LOOP(i, repeat_num * C) { - int index = i % C; - output[i] = input[index]; - } -} - -template -static __global__ void add_param(const T *input, T *output, - const int repeat_num, const int C) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage ou_storage; - for (int i = blockIdx.x; i < C; i += gridDim.x) { - T ou = static_cast(0); - for (int j = threadIdx.x; j < repeat_num; j += blockDim.x) { - const int index = j * C + i; - ou += static_cast(input[index]); - } - ou = BlockReduce(ou_storage).Reduce(ou, cub::Sum()); - if (threadIdx.x == 0) { - output[i] = ou; - } - __syncthreads(); - - if (AVG) { - output[i] /= repeat_num; - } - } -} - -template -static __global__ void GradComputeDX(const T *dy, - const BatchNormParamType *scale, - const BatchNormParamType *mean, - const T *x, - const BatchNormParamType *variance, - const int C, const int sample_size, - T *dx) { - int beg_idx = blockIdx.x * sample_size + threadIdx.x; - int end_idx = (blockIdx.x + 1) * sample_size; - int ncid = blockIdx.x; - int c = ncid % C; - - BatchNormParamType mean_val = mean[ncid]; - BatchNormParamType inv_var_val = variance[ncid]; - - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage dy_storage; - __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; - __shared__ BatchNormParamType dy_sum_val; - __shared__ BatchNormParamType dy_x_sub_mean_sum_val; - - BatchNormParamType dy_sum = static_cast>(0); - BatchNormParamType dy_x_sub_mean_sum = - static_cast>(0); - - for (int i = beg_idx; i < end_idx; i += BlockDim) { - BatchNormParamType dy_i = static_cast>(dy[i]); - dy_sum += dy_i; - dy_x_sub_mean_sum += - dy_i * (static_cast>(x[i]) - mean_val); - } - dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); - dy_x_sub_mean_sum = - BlockReduce(dy_x_sub_mean_storage).Reduce(dy_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - dy_sum_val = dy_sum; - dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; - } - __syncthreads(); - - for (int i = beg_idx; i < end_idx; i += BlockDim) { - dx[i] = - (static_cast>(dy[i]) - - dy_sum_val / static_cast>(sample_size) - - (static_cast>(x[i]) - mean_val) * - dy_x_sub_mean_sum_val * inv_var_val * inv_var_val / sample_size) * - scale[c] * inv_var_val; - } -} - -static __device__ __forceinline__ float real_sqrt(float x) { - return 1. / sqrtf(x); -} -static __device__ __forceinline__ double real_sqrt(double x) { - return 1. 
/ sqrt(x); -} - -template -__global__ void DoubleGradComputeDX(const T *x, const T *mean, - const T *variance, const T *ddx, - const T *dy, const T *scale, - const T *ddscale, int C, int sample_size, - const double epsilon, T *dx) { - int beg_idx = blockIdx.x * sample_size + threadIdx.x; - int end_idx = (blockIdx.x + 1) * sample_size; - int ncid = blockIdx.x; - int c = ncid % C; - - T mean_val = mean[ncid]; - T var_val = variance[ncid]; - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage dy_storage; - __shared__ typename BlockReduce::TempStorage ddx_storage; - __shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage; - __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; - __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; - __shared__ T dy_sum_val; - __shared__ T ddx_sum_val; - __shared__ T dy_mul_ddx_sum_val; - __shared__ T dy_mul_x_sub_mean_sum_val; - __shared__ T ddx_mul_x_sub_mean_sum_val; - - T dy_sum = 0; - T ddx_sum = 0; - T dy_mul_ddx_sum = 0; - T dy_mul_x_sub_mean_sum = 0; - T ddx_mul_x_sub_mean_sum = 0; - for (int i = beg_idx; i < end_idx; i += BlockDim) { - T ddx_i = ddx[i]; - T dy_i = dy[i]; - T tmp = x[i] - mean_val; - - dy_sum += dy_i; - ddx_sum += ddx_i; - dy_mul_ddx_sum += (ddx_i * dy_i); - - dy_mul_x_sub_mean_sum += (dy_i * tmp); - ddx_mul_x_sub_mean_sum += (ddx_i * tmp); - } - - dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); - ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); - dy_mul_ddx_sum = - BlockReduce(dy_mul_ddx_storage).Reduce(dy_mul_ddx_sum, cub::Sum()); - dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) - .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); - ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) - .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - dy_sum_val = dy_sum; - ddx_sum_val = ddx_sum; - dy_mul_ddx_sum_val = dy_mul_ddx_sum; - dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; - ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; - } - __syncthreads(); - - if (ddx != nullptr) { - for (int i = beg_idx; i < end_idx; i += BlockDim) { - dx[i] += - ((x[i] - mean_val) * var_val * var_val * var_val / sample_size * - (ddx_sum_val * dy_sum_val / sample_size - dy_mul_ddx_sum_val + - 3. 
* dy_mul_x_sub_mean_sum_val * var_val * - ddx_mul_x_sub_mean_sum_val * var_val / sample_size) + - ddx_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * - var_val * (dy_sum_val / sample_size - dy[i]) + - dy_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * - var_val * (ddx_sum_val / sample_size - ddx[i])) * - scale[c]; - } - } - __syncthreads(); - if (ddscale != nullptr) { - for (int i = beg_idx; i < end_idx; i += BlockDim) { - dx[i] += (dy[i] * var_val - dy_sum_val / sample_size * var_val - - (x[i] - mean_val) * var_val * dy_mul_x_sub_mean_sum_val * - var_val / sample_size) * - ddscale[c]; - } - } -} - -template -__global__ void DoubleGradComputeDDY(const T *x, const T *mean, - const T *variance, const T *ddscale, - const T *ddbias, const T *ddx, - const T *scale, int C, int sample_size, - const double epsilon, T *ddy) { - int beg_idx = blockIdx.x * sample_size + threadIdx.x; - int end_idx = (blockIdx.x + 1) * sample_size; - int ncid = blockIdx.x; - int c = ncid % C; - - T mean_val = mean[ncid]; - T var_val = variance[ncid]; - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage ddx_storage; - __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; - __shared__ T ddx_sum_val; - __shared__ T ddx_mul_x_sub_mean_sum_val; - - T ddx_sum = 0; - T ddx_mul_x_sub_mean_sum = 0; - for (int i = beg_idx; i < end_idx; i += BlockDim) { - T ddx_i = ddx[i]; - ddx_sum += ddx_i; - ddx_mul_x_sub_mean_sum += (ddx_i * (x[i] - mean_val)); - } - ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); - ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) - .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - ddx_sum_val = ddx_sum; - ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; - } - __syncthreads(); - - if (ddx != nullptr) { - for (int i = beg_idx; i < end_idx; i += BlockDim) { - ddy[i] += scale[c] * var_val * - (ddx[i] - ddx_sum_val / sample_size - - (x[i] - mean_val) * var_val * ddx_mul_x_sub_mean_sum_val * - var_val / sample_size); - } - } - __syncthreads(); - if (ddscale != nullptr) { - for (int i = beg_idx; i < end_idx; i += BlockDim) { - ddy[i] += (x[i] - mean_val) * var_val * ddscale[c]; - } - } - __syncthreads(); - if (ddbias != nullptr) { - for (int i = beg_idx; i < end_idx; i += BlockDim) { - ddy[i] += ddbias[c]; - } - } -} - -template -__global__ void DoubleGradComputeDScale(const T *x, const T *mean, - const T *variance, const T *ddx, - const T *dy, int C, int sample_size, - const double epsilon, T *dscale) { - int beg_idx = blockIdx.x * sample_size + threadIdx.x; - int end_idx = (blockIdx.x + 1) * sample_size; - int ncid = blockIdx.x; - int c = ncid % C; - - T mean_val = mean[ncid]; - T var_val = variance[ncid]; - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage dy_storage; - __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; - __shared__ typename BlockReduce::TempStorage dscale_tmp_storage; - __shared__ T dy_sum_val; - __shared__ T dy_mul_x_sub_mean_sum_val; - - T dy_sum = 0; - T dy_mul_x_sub_mean_sum = 0; - for (int i = beg_idx; i < end_idx; i += BlockDim) { - T dy_i = dy[i]; - dy_sum += dy_i; - dy_mul_x_sub_mean_sum += (dy_i * (x[i] - mean_val)); - } - dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); - dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) - .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - dy_sum_val = dy_sum; - dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; 
- } - __syncthreads(); - - if (ddx != nullptr) { - T dscale_tmp = 0; - for (int i = beg_idx; i < end_idx; i += BlockDim) { - dscale_tmp += - ddx[i] * var_val * (dy[i] - dy_sum_val / sample_size - - dy_mul_x_sub_mean_sum_val * (x[i] - mean_val) * - var_val * var_val / sample_size); - } - dscale_tmp = BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum()); - - if (threadIdx.x == 0) { - dscale[ncid] += dscale_tmp; - } - __syncthreads(); - } -} - -template -class InstanceNormDoubleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *X = ctx.Input("X"); - const auto *Scale = ctx.Input("Scale"); - const auto *dY = ctx.Input("DY"); - const auto *Saved_mean = ctx.Input("SavedMean"); - const auto *Saved_variance = ctx.Input("SavedVariance"); - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); - const auto *ddX = ctx.Input("DDX"); - const auto *ddScale = ctx.Input("DDScale"); - const auto *ddBias = ctx.Input("DDBias"); - const double epsilon = static_cast(ctx.Attr("epsilon")); - - auto *dX = ctx.Output("DX"); - auto *dScale = ctx.Output("DScale"); - auto *ddY = ctx.Output("DDY"); - - const T *x_data = X->data(); - const T *dy_data = dY->data(); - const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data()); - - const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data()); - const T *ddbias_data = (ddScale == nullptr ? nullptr : ddBias->data()); - - const T *mean_data = Saved_mean->data(); - const T *variance_data = Saved_variance->data(); - - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_zero; - - auto &x_dims = X->dims(); - int N, C, H, W, D; - ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); - int NxC = N * C; - const int n = X->numel(); - int sample_size = n / N / C; - - Tensor scale_tmp; - if (!Scale) { - scale_tmp.mutable_data({C}, ctx.GetPlace()); - set_zero(dev_ctx, &scale_tmp, static_cast(1)); - } - const T *scale_data = Scale ? 
Scale->data() : scale_tmp.data(); - - const int block = 512; - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = NxC; - const int grid1 = (C + block - 1) / block; - - if (dX) { - T *dx_data = dX->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, dX, static_cast(0)); - DoubleGradComputeDX<<>>( - x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, - ddscale_data, C, sample_size, epsilon, dx_data); - } - if (dScale) { - Tensor dscale_tmp = - ctx.AllocateTmpTensor({NxC}, dev_ctx); - set_zero(dev_ctx, &dscale_tmp, static_cast(0)); - T *dscale_tmp_data = dscale_tmp.mutable_data(ctx.GetPlace()); - - T *dscale_data = dScale->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, dScale, static_cast(0)); - DoubleGradComputeDScale<<>>( - x_data, mean_data, variance_data, ddx_data, dy_data, C, sample_size, - epsilon, dscale_tmp_data); - add_param<<>>( - dscale_tmp.data(), dScale->data(), N, C); - } - if (ddY) { - T *ddy_data = ddY->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, ddY, static_cast(0)); - DoubleGradComputeDDY<<>>( - x_data, mean_data, variance_data, ddscale_data, ddbias_data, ddx_data, - scale_data, C, sample_size, epsilon, ddy_data); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL(instance_norm_grad_grad, - ops::InstanceNormDoubleGradKernel< - paddle::platform::CUDADeviceContext, float>); -#else -REGISTER_OP_CUDA_KERNEL( - instance_norm_grad_grad, - ops::InstanceNormDoubleGradKernel, - ops::InstanceNormDoubleGradKernel); -#endif diff --git a/paddle/fluid/operators/instance_norm_op.h b/paddle/fluid/operators/instance_norm_op.h index 493f54ab3baa6..265e4acef0d7a 100644 --- a/paddle/fluid/operators/instance_norm_op.h +++ b/paddle/fluid/operators/instance_norm_op.h @@ -16,9 +16,7 @@ limitations under the License. 
*/ #include #include #include -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/norm_utils.h" namespace paddle { namespace operators { @@ -27,22 +25,9 @@ using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; using DataLayout = framework::DataLayout; -template -using EigenArrayMap = - Eigen::Map>; -template -using ConstEigenArrayMap = - Eigen::Map>; -template -using EigenVectorArrayMap = Eigen::Map>; -template -using ConstEigenVectorArrayMap = - Eigen::Map>; - class InstanceNormOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( @@ -52,7 +37,6 @@ class InstanceNormOp : public framework::OperatorWithKernel { class InstanceNormGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( @@ -62,7 +46,6 @@ class InstanceNormGradOp : public framework::OperatorWithKernel { class InstanceNormDoubleGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override; protected: framework::OpKernelType GetExpectedKernelType( @@ -130,23 +113,5 @@ class InstanceNormOpInferVarType } }; -template -class InstanceNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override; -}; - -template -class InstanceNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override; -}; - -template -class InstanceNormDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override; -}; - } // namespace operators } // namespace paddle diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 602942abf4d34..6b13a28c70837 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -312,6 +312,63 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, dx->share_meta(dout); } +void InstanceNormGradInferMeta(const MetaTensor& x, + const MetaTensor& y_grad, + paddle::optional scale, + const MetaTensor& saved_mean, + const MetaTensor& saved_variance, + float epsilon, + MetaTensor* x_grad, + MetaTensor* scale_grad, + MetaTensor* bias_grad) { + PADDLE_ENFORCE_NE( + x_grad, + nullptr, + phi::errors::InvalidArgument( + "The X@GRAD in InstanceNormGradInferMeta can't be nullptr.")); + const auto x_dims = x.dims(); + const int C = x_dims[1]; + x_grad->set_dims(x_dims); + x_grad->set_dtype(x.dtype()); + x_grad->set_layout(x.layout()); + if (scale_grad) { + scale_grad->set_dims({C}); + } + if (bias_grad) { + bias_grad->set_dims({C}); + } +} +void InstanceNormDoubleGradInferMeta( + const MetaTensor& x, + paddle::optional scale, + const MetaTensor& saved_mean, + const MetaTensor& saved_variance, + const MetaTensor& dy, + paddle::optional ddx, + paddle::optional ddscale, + paddle::optional ddbias, + float epsilon, + MetaTensor* dx, + MetaTensor* dscale, + MetaTensor* ddy) { + PADDLE_ENFORCE_NE( + dx, + nullptr, + phi::errors::InvalidArgument( + "The DX in InstanceNormDoubleGradInferMeta can't be nullptr.")); + const auto x_dims = x.dims(); 
+ const int C = x_dims[1]; + dx->set_dims(x_dims); + dx->set_dtype(x.dtype()); + dx->set_layout(x.layout()); + if (dscale) { + dscale->set_dims({C}); + } + if (ddy) { + ddy->share_dims(x); + } +} + void KernelWithXShapeInferMeta(const MetaTensor& xshape, MetaTensor* dx) { auto xshape_dims = xshape.dims(); auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index c35b58d0f56e4..855b25d7ed4f8 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -144,6 +144,30 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, int axis, MetaTensor* dx); +void InstanceNormGradInferMeta(const MetaTensor& x, + const MetaTensor& y_grad, + paddle::optional scale, + const MetaTensor& saved_mean, + const MetaTensor& saved_variance, + float epsilon, + MetaTensor* x_grad, + MetaTensor* scale_grad, + MetaTensor* bias_grad); + +void InstanceNormDoubleGradInferMeta( + const MetaTensor& x, + paddle::optional scale, + const MetaTensor& saved_mean, + const MetaTensor& saved_variance, + const MetaTensor& dy, + paddle::optional ddx, + paddle::optional ddscale, + paddle::optional ddbias, + float epsilon, + MetaTensor* dx, + MetaTensor* dscale, + MetaTensor* ddy); + void KernelWithXShapeInferMeta(const MetaTensor& xshape, MetaTensor* dx); void MaxPoolWithIndexGradInferMeta(const MetaTensor& x, diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index ae8c7dd61c3bb..e3f946b247f09 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -191,6 +191,111 @@ void ArangeInferMeta(const MetaTensor& start, out->set_dtype(start.dtype()); } +void InstanceNormInferMeta(const MetaTensor& x, + paddle::optional scale, + paddle::optional bias, + float epsilon, + MetaTensor* y, + MetaTensor* saved_mean, + MetaTensor* saved_variance, + MetaConfig config) { + PADDLE_ENFORCE_NE(y, + nullptr, + phi::errors::InvalidArgument( + "The y in InstanceNormInferMeta can't be nullptr.")); + PADDLE_ENFORCE_NE( + saved_mean, + nullptr, + phi::errors::InvalidArgument( + "The saved_mean in InstanceNormInferMeta can't be nullptr.")); + PADDLE_ENFORCE_NE( + saved_variance, + nullptr, + phi::errors::InvalidArgument( + "The saved_variance in InstanceNormInferMeta can't be nullptr.")); + const auto x_dims = x.dims(); + PADDLE_ENFORCE_NE(phi::product(x_dims), + 0, + phi::errors::PreconditionNotMet( + "The Input variable X has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::InvalidArgument( + "ShapeError: the dimension of input X must " + "greater than or equal to 2. But received: the shape of input " + "X = [%s], the dimension of input X =[%d]", + x_dims, + x_dims.size())); + PADDLE_ENFORCE_LE( + x_dims.size(), + 5, + phi::errors::InvalidArgument( + "ShapeError: the dimension of input X must " + "smaller than or equal to 5, But received: the shape of input " + "X = [%s], the dimension of input X = [%d]", + x_dims, + x_dims.size())); + auto N = x_dims[0]; + auto C = x_dims[1]; + auto NxC = N * C; + const auto scale_ptr = scale.get_ptr(); + if (scale_ptr) { + auto scale_dim = scale_ptr->dims(); + PADDLE_ENFORCE_EQ( + scale_dim.size(), + 1UL, + phi::errors::InvalidArgument( + "ShapeError: the dimension of scale must equal to 1." 
+ "But received: the shape of scale is [%s], the dimension " + "of scale is [%d]", + scale_dim, + scale_dim.size())); + bool check = !((!config.is_runtime) && (phi::product(scale_dim) <= 0)); + if (check) { + PADDLE_ENFORCE_EQ(scale_dim[0], + C, + phi::errors::InvalidArgument( + "ShapeError: the shape of scale must equal to [%d]" + "But received: the shape of scale is [%d]", + C, + scale_dim[0])); + } + } + const auto bias_ptr = bias.get_ptr(); + if (bias_ptr) { + auto bias_dim = bias_ptr->dims(); + PADDLE_ENFORCE_EQ( + bias_dim.size(), + 1UL, + phi::errors::InvalidArgument( + "ShapeError: the dimension of bias must equal to 1." + "But received: the shape of bias is [%s],the dimension " + "of bias is [%d]", + bias_dim, + bias_dim.size())); + bool check = !((!config.is_runtime) && (phi::product(bias_dim) <= 0)); + if (check) { + PADDLE_ENFORCE_EQ(bias_dim[0], + C, + phi::errors::InvalidArgument( + "ShapeError: the shape of bias must equal to [%d]" + "But received: the shape of bias is [%d]", + C, + bias_dim[0])); + } + } + y->set_dims(x_dims); + saved_mean->set_dims({NxC}); + saved_variance->set_dims({NxC}); + y->share_lod(x); + y->set_dtype(x.dtype()); + y->set_layout(x.layout()); +} + void GraphSendRecvInferMeta(const MetaTensor& x, const MetaTensor& src_index, const MetaTensor& dst_index, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 4f561e0adf19d..b2fb30a4da2d6 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -52,6 +52,15 @@ void ArangeInferMeta(const MetaTensor& start, const MetaTensor& step, MetaTensor* out); +void InstanceNormInferMeta(const MetaTensor& x, + paddle::optional scale, + paddle::optional bias, + float epsilon, + MetaTensor* y, + MetaTensor* saved_mean, + MetaTensor* saved_variance, + MetaConfig config = MetaConfig()); + void GraphSendRecvInferMeta(const MetaTensor& x, const MetaTensor& src_index, const MetaTensor& dst_index, diff --git a/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc index 07b3c5a18fdb5..dcb4289ae8d75 100644 --- a/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc @@ -23,8 +23,22 @@ #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/extensions.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" + namespace phi { +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; +template +using EigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; + template void InstanceNormGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -136,6 +150,188 @@ void InstanceNormGradKernel(const Context& dev_ctx, .broadcast(bcast)); } +template +void InstanceNormDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + paddle::optional scale, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const DenseTensor& dy, + paddle::optional ddx, + paddle::optional ddscale, + paddle::optional ddbias, + float epsilon, + DenseTensor* dx, + DenseTensor* dscale, + DenseTensor* ddy) { + const auto* Scale = scale.get_ptr(); + const auto* ddScale = ddscale.get_ptr(); + const auto* ddX = ddx.get_ptr(); + const auto* ddBias = ddbias.get_ptr(); + phi::funcs::SetConstant set_constant; + const auto& x_dims = x.dims(); + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, 
&D); + const int sample_size = x.numel() / N / C; + const int NxC = N * C; + + const T* mean_data = saved_mean.data(); + const T* inv_var_data = saved_variance.data(); + DenseTensor mean_tensor; + DenseTensor inv_var_tensor; + ConstEigenArrayMap x_arr(x.data(), sample_size, NxC); + ConstEigenVectorArrayMap mean_arr(mean_data, NxC); + ConstEigenVectorArrayMap inv_var_arr(inv_var_data, NxC); + + DenseTensor mean_tile; + mean_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&mean_tile); + EigenArrayMap mean_tile_data(mean_tile.data(), sample_size, NxC); + DenseTensor inv_var_tile; + inv_var_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&inv_var_tile); + EigenArrayMap inv_var_tile_data(inv_var_tile.data(), sample_size, NxC); + + mean_tile_data = mean_arr.transpose().replicate(sample_size, 1); + inv_var_tile_data = inv_var_arr.transpose().replicate(sample_size, 1); + + DenseTensor Scale_data; + if (!Scale) { + Scale_data.Resize({C}); + dev_ctx.template Alloc(&Scale_data); + set_constant(dev_ctx, &Scale_data, static_cast(1)); + } + ConstEigenVectorArrayMap scale_arr( + Scale ? Scale->data() : Scale_data.data(), C); + + DenseTensor scale_tile; + scale_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&scale_tile); + EigenArrayMap scale_tile_data(scale_tile.data(), sample_size, NxC); + scale_tile_data = scale_arr.transpose().replicate(sample_size, N); + ConstEigenArrayMap dy_arr(dy.data(), sample_size, NxC); + ConstEigenArrayMap ddx_arr(ddX->data(), sample_size, NxC); + // math: dx = scale * ((x - mean) * inv_var / HxW * (np.mean(ddx, + // axis=(h,w)) * np.sum(dy, axis=(h,w)) - + // np.sum(dy * ddx, axis=(h,w)) + 3 * np.mean(dy * (x - mean), + // axis=(h,w)) * inv_var.pow(2) * + // np.sum(ddx * (x - mean), axis=(h,w))) + inv_var.pow(3) / HxW * + // np.sum(ddx * (x - mean)) * + // (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW * + // np.sum(dy, axis=(h,w)) * (x - mean) * + // (np.mean(ddx, axis=(h,w)) - ddx)) + ddr * (dy * inv_var - + // inv_var * np.mean(dy, axis=(h,w)) - inv_var.pow(3) * + // (x - mean) * np.mean(dy * (x - mean), axis=(h,w))) + + DenseTensor x_sub_mean_mul_invstd; + x_sub_mean_mul_invstd.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&x_sub_mean_mul_invstd); + EigenArrayMap x_sub_mean_mul_invstd_arr( + x_sub_mean_mul_invstd.data(), sample_size, NxC); + x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data; + + if (dx) { + dev_ctx.template Alloc(dx); + set_constant(dev_ctx, dx, static_cast(0)); + EigenArrayMap dx_arr(dx->data(), sample_size, NxC); + if (ddX) { + dx_arr += + x_sub_mean_mul_invstd_arr * inv_var_tile_data * inv_var_tile_data / + sample_size * + (ddx_arr.colwise().sum() * dy_arr.colwise().sum() / sample_size - + (dy_arr * ddx_arr).colwise().sum() + + 3. 
* (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() * + (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size); + dx_arr += (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size * inv_var_tile_data * inv_var_tile_data * + (dy_arr.colwise().sum() / sample_size - dy_arr); + dx_arr += (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size * inv_var_tile_data * inv_var_tile_data * + (ddx_arr.colwise().sum() / sample_size - ddx_arr); + dx_arr = scale_tile_data * dx_arr; + } + if (ddScale) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + DenseTensor ddscale_tile; + ddscale_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&ddscale_tile); + EigenArrayMap ddscale_tile_data( + ddscale_tile.data(), sample_size, NxC); + ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N); + dx_arr += (dy_arr * inv_var_tile_data - + dy_arr.colwise().sum() / sample_size * inv_var_tile_data - + x_sub_mean_mul_invstd_arr * inv_var_tile_data * + (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size) * + ddscale_tile_data; + } + } + if (dscale) { + // math: dscale = inv_var * (dy - np.mean(dy, axis=(h,w) - (x-mean) * + // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(h,w)))) * ddx + dev_ctx.template Alloc(dscale); + set_constant(dev_ctx, dscale, static_cast(0)); + EigenVectorArrayMap dscale_arr(dscale->data(), C); + if (ddX) { + DenseTensor first_grad; + first_grad.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&first_grad); + set_constant(dev_ctx, &first_grad, static_cast(0)); + EigenArrayMap first_grad_arr(first_grad.data(), sample_size, NxC); + first_grad_arr += + inv_var_tile_data * + (dy_arr - + dy_arr.colwise().sum().replicate(sample_size, 1) / sample_size - + x_sub_mean_mul_invstd_arr * + (dy_arr * x_sub_mean_mul_invstd_arr) + .colwise() + .sum() + .replicate(sample_size, 1) / + sample_size); + first_grad_arr = first_grad_arr * ddx_arr; + for (int nc = 0; nc < NxC; ++nc) { + int c = nc % C; + dscale_arr(c) += first_grad_arr.colwise().sum()(nc); + } + } + } + if (ddy) { + // math: ddy = (x - mean) * inv_var * ddscale + ddbias + + // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * + // np.mean(ddx * (x - mean), axis=(h,w))) + dev_ctx.template Alloc(ddy); + set_constant(dev_ctx, ddy, static_cast(0)); + EigenArrayMap ddy_arr(ddy->data(), sample_size, NxC); + if (ddX) { + ddy_arr += scale_tile_data * inv_var_tile_data * + (ddx_arr - ddx_arr.colwise().sum() / sample_size - + x_sub_mean_mul_invstd_arr * + (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size); + } + if (ddScale && ddBias) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + DenseTensor ddscale_tile; + ddscale_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&ddscale_tile); + EigenArrayMap ddscale_tile_data( + ddscale_tile.data(), sample_size, NxC); + ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N); + + ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); + DenseTensor ddbias_tile; + ddbias_tile.Resize({sample_size, NxC}); + dev_ctx.template Alloc(&ddbias_tile); + EigenArrayMap ddbias_tile_data( + ddbias_tile.data(), sample_size, NxC); + ddbias_tile_data = ddbias_arr.transpose().replicate(sample_size, N); + + ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; + ddy_arr += ddbias_tile_data; + } + } +} } // namespace phi PD_REGISTER_KERNEL(instance_norm_grad, @@ -144,3 +340,9 @@ PD_REGISTER_KERNEL(instance_norm_grad, phi::InstanceNormGradKernel, float, double) {} 
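In the same notation, with $\hat{x}$ the normalized input and an overline denoting the per-$(n,c)$ spatial mean, the ddy branch above evaluates

    \mathrm{ddY} = \hat{x}\,\mathrm{ddScale} + \mathrm{ddBias} + \gamma\,\mathrm{inv\_var}\left(\mathrm{ddX} - \overline{\mathrm{ddX}} - \hat{x}\,\overline{\mathrm{ddX}\cdot\hat{x}}\right)

that is, the centering-and-normalizing operator applied to ddX (scaled by $\gamma$), plus the direct $\hat{x}\,\mathrm{ddScale}$ and $\mathrm{ddBias}$ contributions.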
+PD_REGISTER_KERNEL(instance_norm_double_grad, + CPU, + ALL_LAYOUT, + phi::InstanceNormDoubleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/funcs/norm_utils.h b/paddle/phi/kernels/funcs/norm_utils.h new file mode 100644 index 0000000000000..2d0a879e41c78 --- /dev/null +++ b/paddle/phi/kernels/funcs/norm_utils.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/ddim.h" + +namespace phi { +namespace funcs { +inline void ExtractNCWHD(const phi::DDim &dims, + const DataLayout &data_layout, + int *N, + int *C, + int *H, + int *W, + int *D) { + *N = dims[0]; + if (dims.size() == 2) { + *C = dims[1]; + *H = 1; + *W = 1; + *D = 1; + } else { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *W = dims.size() > 3 + ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) + : 1; + *D = dims.size() > 4 + ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3]) + : 1; + } +} +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index ad3b8579ddf67..e808ef644a246 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -20,7 +20,7 @@ #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/operators/norm_utils.cu.h" -#include "paddle/fluid/operators/norm_utils.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/layout_utils.h" @@ -351,7 +351,7 @@ void BatchNormGradRawKernel(const Context &ctx, x_dims.size(), x_dims)); int N, C, H, W, D; - paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); // init output if (d_x) { diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 361e62e566035..e2aeec723628c 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -27,7 +27,7 @@ namespace cub = hipcub; #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/fluid/operators/norm_utils.cu.h" -#include "paddle/fluid/operators/norm_utils.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/layout_utils.h" @@ -179,7 +179,7 @@ void BatchNormKernel(const Context &ctx, ctx.template Alloc(y); int N, C, H, W, D; - paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); auto dtype = paddle::platform::CudnnDataType::type; diff --git a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu index 
15c9c30626593..387127de48dea 100644 --- a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu @@ -14,16 +14,15 @@ #include "paddle/phi/kernels/instance_norm_grad_kernel.h" -#include "paddle/fluid/operators/norm_utils.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" #include "paddle/phi/kernels/gpu/instance_norm_utils.h" namespace phi { - template static __global__ void GradComputeDX(const T *dy, const BatchNormParamType *scale, @@ -37,16 +36,13 @@ static __global__ void GradComputeDX(const T *dy, int end_idx = (blockIdx.x + 1) * sample_size; int ncid = blockIdx.x; int c = ncid % C; - BatchNormParamType mean_val = mean[ncid]; BatchNormParamType inv_var_val = variance[ncid]; - typedef cub::BlockReduce, BlockDim> BlockReduce; __shared__ typename BlockReduce::TempStorage dy_storage; __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; __shared__ BatchNormParamType dy_sum_val; __shared__ BatchNormParamType dy_x_sub_mean_sum_val; - BatchNormParamType dy_sum = static_cast>(0); BatchNormParamType dy_x_sub_mean_sum = static_cast>(0); @@ -60,13 +56,11 @@ static __global__ void GradComputeDX(const T *dy, dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage).Reduce(dy_x_sub_mean_sum, cub::Sum()); - if (threadIdx.x == 0) { dy_sum_val = dy_sum; dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; } __syncthreads(); - for (int i = beg_idx; i < end_idx; i += BlockDim) { dx[i] = (static_cast>(dy[i]) - @@ -77,6 +71,222 @@ static __global__ void GradComputeDX(const T *dy, } } +static __device__ __forceinline__ float real_sqrt(float x) { + return 1. / sqrtf(x); +} +static __device__ __forceinline__ double real_sqrt(double x) { + return 1. 
/ sqrt(x); +} + +template +__global__ void DoubleGradComputeDX(const T *x, + const T *mean, + const T *variance, + const T *ddx, + const T *dy, + const T *scale, + const T *ddscale, + int C, + int sample_size, + const double epsilon, + T *dx) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + + T mean_val = mean[ncid]; + T var_val = variance[ncid]; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage ddx_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; + __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; + __shared__ T dy_sum_val; + __shared__ T ddx_sum_val; + __shared__ T dy_mul_ddx_sum_val; + __shared__ T dy_mul_x_sub_mean_sum_val; + __shared__ T ddx_mul_x_sub_mean_sum_val; + + T dy_sum = 0; + T ddx_sum = 0; + T dy_mul_ddx_sum = 0; + T dy_mul_x_sub_mean_sum = 0; + T ddx_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + T ddx_i = ddx[i]; + T dy_i = dy[i]; + T tmp = x[i] - mean_val; + + dy_sum += dy_i; + ddx_sum += ddx_i; + dy_mul_ddx_sum += (ddx_i * dy_i); + + dy_mul_x_sub_mean_sum += (dy_i * tmp); + ddx_mul_x_sub_mean_sum += (ddx_i * tmp); + } + + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); + dy_mul_ddx_sum = + BlockReduce(dy_mul_ddx_storage).Reduce(dy_mul_ddx_sum, cub::Sum()); + dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) + .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); + ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) + .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + ddx_sum_val = ddx_sum; + dy_mul_ddx_sum_val = dy_mul_ddx_sum; + dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; + ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; + } + __syncthreads(); + + if (ddx != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dx[i] += + ((x[i] - mean_val) * var_val * var_val * var_val / sample_size * + (ddx_sum_val * dy_sum_val / sample_size - dy_mul_ddx_sum_val + + 3. 
* dy_mul_x_sub_mean_sum_val * var_val * + ddx_mul_x_sub_mean_sum_val * var_val / sample_size) + + ddx_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * + var_val * (dy_sum_val / sample_size - dy[i]) + + dy_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * + var_val * (ddx_sum_val / sample_size - ddx[i])) * + scale[c]; + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dx[i] += (dy[i] * var_val - dy_sum_val / sample_size * var_val - + (x[i] - mean_val) * var_val * dy_mul_x_sub_mean_sum_val * + var_val / sample_size) * + ddscale[c]; + } + } +} + +template +__global__ void DoubleGradComputeDDY(const T *x, + const T *mean, + const T *variance, + const T *ddscale, + const T *ddbias, + const T *ddx, + const T *scale, + int C, + int sample_size, + const double epsilon, + T *ddy) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + T mean_val = mean[ncid]; + T var_val = variance[ncid]; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ddx_storage; + __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; + __shared__ T ddx_sum_val; + __shared__ T ddx_mul_x_sub_mean_sum_val; + + T ddx_sum = 0; + T ddx_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + T ddx_i = ddx[i]; + ddx_sum += ddx_i; + ddx_mul_x_sub_mean_sum += (ddx_i * (x[i] - mean_val)); + } + ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); + ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) + .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); + if (threadIdx.x == 0) { + ddx_sum_val = ddx_sum; + ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; + } + __syncthreads(); + if (ddx != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] += scale[c] * var_val * + (ddx[i] - ddx_sum_val / sample_size - + (x[i] - mean_val) * var_val * ddx_mul_x_sub_mean_sum_val * + var_val / sample_size); + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] += (x[i] - mean_val) * var_val * ddscale[c]; + } + } + __syncthreads(); + if (ddbias != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] += ddbias[c]; + } + } +} + +template +__global__ void DoubleGradComputeDScale(const T *x, + const T *mean, + const T *variance, + const T *ddx, + const T *dy, + int C, + int sample_size, + const double epsilon, + T *dscale) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + T mean_val = mean[ncid]; + T var_val = variance[ncid]; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; + __shared__ typename BlockReduce::TempStorage dscale_tmp_storage; + __shared__ T dy_sum_val; + __shared__ T dy_mul_x_sub_mean_sum_val; + + T dy_sum = 0; + T dy_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + T dy_i = dy[i]; + dy_sum += dy_i; + dy_mul_x_sub_mean_sum += (dy_i * (x[i] - mean_val)); + } + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) + .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + dy_mul_x_sub_mean_sum_val = 
dy_mul_x_sub_mean_sum; + } + __syncthreads(); + if (ddx != nullptr) { + T dscale_tmp = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dscale_tmp += + ddx[i] * var_val * (dy[i] - dy_sum_val / sample_size - + dy_mul_x_sub_mean_sum_val * (x[i] - mean_val) * + var_val * var_val / sample_size); + } + dscale_tmp = BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum()); + if (threadIdx.x == 0) { + dscale[ncid] += dscale_tmp; + } + __syncthreads(); + } +} + template void InstanceNormGradKernel(const Context &dev_ctx, const DenseTensor &x, @@ -94,8 +304,7 @@ void InstanceNormGradKernel(const Context &dev_ctx, const auto &x_dims = x.dims(); int N, C, H, W, D; - paddle::operators::ExtractNCWHD( - x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); int NxC = N * C; DenseTensor x_tmp, d_y_tmp; @@ -303,12 +512,120 @@ void InstanceNormGradKernel(const Context &dev_ctx, paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); #endif } + +template +void InstanceNormDoubleGradKernel(const Context &dev_ctx, + const DenseTensor &x, + paddle::optional scale, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + const DenseTensor &dy, + paddle::optional ddx, + paddle::optional ddscale, + paddle::optional ddbias, + float epsilon_f, + DenseTensor *dx, + DenseTensor *dscale, + DenseTensor *ddy) { + const auto *Scale = scale.get_ptr(); + const auto *ddX = ddx.get_ptr(); + const auto *ddScale = ddscale.get_ptr(); + const auto *ddBias = ddbias.get_ptr(); + const double epsilon = static_cast(epsilon_f); + const T *x_data = x.data(); + const T *dy_data = dy.data(); + const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data()); + const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data()); + const T *ddbias_data = (ddScale == nullptr ? nullptr : ddBias->data()); + const T *mean_data = saved_mean.data(); + const T *variance_data = saved_variance.data(); + phi::funcs::SetConstant set_zero; + auto &x_dims = x.dims(); + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + const int n = x.numel(); + int sample_size = n / N / C; + + DenseTensor scale_tmp; + if (!Scale) { + scale_tmp.Resize({C}); + dev_ctx.template Alloc(&scale_tmp); + set_zero(dev_ctx, &scale_tmp, static_cast(1)); + } + const T *scale_data = Scale ? 
Scale->data() : scale_tmp.data(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = NxC; + const int grid1 = (C + block - 1) / block; + + if (dx) { + T *dx_data = dev_ctx.template Alloc(dx); + set_zero(dev_ctx, dx, static_cast(0)); + DoubleGradComputeDX<<>>( + x_data, + mean_data, + variance_data, + ddx_data, + dy_data, + scale_data, + ddscale_data, + C, + sample_size, + epsilon, + dx_data); + } + if (dscale) { + DenseTensor dscale_tmp; + dscale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&dscale_tmp); + set_zero(dev_ctx, &dscale_tmp, static_cast(0)); + T *dscale_tmp_data = dscale_tmp.data(); + + T *dscale_data = dev_ctx.template Alloc(dscale); + set_zero(dev_ctx, dscale, static_cast(0)); + DoubleGradComputeDScale<<>>( + x_data, + mean_data, + variance_data, + ddx_data, + dy_data, + C, + sample_size, + epsilon, + dscale_tmp_data); + add_param<<>>( + dscale_tmp.data(), dscale->data(), N, C); + } + if (ddy) { + T *ddy_data = dev_ctx.template Alloc(ddy); + set_zero(dev_ctx, ddy, static_cast(0)); + DoubleGradComputeDDY<<>>( + x_data, + mean_data, + variance_data, + ddscale_data, + ddbias_data, + ddx_data, + scale_data, + C, + sample_size, + epsilon, + ddy_data); + } +} } // namespace phi #ifdef PADDLE_WITH_HIP // MIOPEN do not support double PD_REGISTER_KERNEL( instance_norm_grad, GPU, ALL_LAYOUT, phi::InstanceNormGradKernel, float) {} +PD_REGISTER_KERNEL(instance_norm_double_grad, + GPU, + ALL_LAYOUT, + phi::InstanceNormDoubleGradKernel, + float) {} #else PD_REGISTER_KERNEL(instance_norm_grad, GPU, @@ -316,4 +633,10 @@ PD_REGISTER_KERNEL(instance_norm_grad, phi::InstanceNormGradKernel, float, double) {} +PD_REGISTER_KERNEL(instance_norm_double_grad, + GPU, + ALL_LAYOUT, + phi::InstanceNormDoubleGradKernel, + float, + double) {} #endif diff --git a/paddle/phi/kernels/gpu/instance_norm_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_kernel.cu index cf8f0fb78788c..81d9400750190 100644 --- a/paddle/phi/kernels/gpu/instance_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/instance_norm_kernel.cu @@ -14,11 +14,11 @@ #include "paddle/phi/kernels/instance_norm_kernel.h" -#include "paddle/fluid/operators/norm_utils.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" #include "paddle/phi/kernels/gpu/instance_norm_utils.h" namespace phi { @@ -51,8 +51,7 @@ void InstanceNormKernel(const Context &dev_ctx, "the size of X's dimensions is [%d]", x_dims.size())); int N, C, H, W, D; - paddle::operators::ExtractNCWHD( - x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); int NxC = N * C; DenseTensor x_tmp; x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D}); diff --git a/paddle/phi/kernels/instance_norm_grad_kernel.h b/paddle/phi/kernels/instance_norm_grad_kernel.h index 041302a7cfb67..7924c767ab61e 100644 --- a/paddle/phi/kernels/instance_norm_grad_kernel.h +++ b/paddle/phi/kernels/instance_norm_grad_kernel.h @@ -30,4 +30,19 @@ void InstanceNormGradKernel(const Context& dev_ctx, DenseTensor* scale_grad, DenseTensor* bias_grad); +template +void InstanceNormDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + paddle::optional scale, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const DenseTensor& dy, + paddle::optional ddx, + 
paddle::optional ddscale, + paddle::optional ddbias, + float epsilon, + DenseTensor* dx, + DenseTensor* dscale, + DenseTensor* ddy); + } // namespace phi diff --git a/paddle/phi/ops/compat/instance_norm_sig.cc b/paddle/phi/ops/compat/instance_norm_sig.cc index b65e84588db13..2b490078512b1 100644 --- a/paddle/phi/ops/compat/instance_norm_sig.cc +++ b/paddle/phi/ops/compat/instance_norm_sig.cc @@ -31,8 +31,26 @@ KernelSignature InstanceNormGradOpArgumentMapping( {"epsilon"}, {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}); } +KernelSignature InstanceNormDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("instance_norm_double_grad", + {"X", + "Scale", + "SavedMean", + "SavedVariance", + "DY", + "DDX", + "DDScale", + "DDBias"}, + {"epsilon"}, + {"DX", "DScale", "DDY"}); +} } // namespace phi +PD_REGISTER_BASE_KERNEL_NAME(instance_norm_grad_grad, + instance_norm_double_grad); PD_REGISTER_ARG_MAPPING_FN(instance_norm, phi::InstanceNormOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(instance_norm_grad, phi::InstanceNormGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(instance_norm_grad_grad, + phi::InstanceNormDoubleGradOpArgumentMapping); From 21f11d350cc348c5c2509d0935b7c2344c3d2f76 Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Fri, 27 May 2022 10:51:30 +0800 Subject: [PATCH 049/109] Support memory stats for CPU (#42945) * Support memory stats for CPU * Add UTs * Fix typos * Fix typos --- paddle/fluid/memory/CMakeLists.txt | 1 + .../memory/allocation/allocator_facade.cc | 5 +- .../fluid/memory/allocation/stat_allocator.h | 20 +- .../fluid/memory/detail/system_allocator.cc | 6 + paddle/fluid/memory/memory_stats_test.cc | 64 +++++++ paddle/fluid/memory/stats.cc | 92 ++++++---- paddle/fluid/memory/stats.h | 171 +++++++++++------- paddle/fluid/memory/stats_test.cc | 158 +++++++++++----- paddle/fluid/operators/conv_cudnn_helper.h | 6 +- paddle/fluid/platform/device/gpu/gpu_info.cc | 19 +- paddle/fluid/platform/profiler_helper.h | 6 +- paddle/fluid/pybind/pybind.cc | 5 +- python/paddle/device/cuda/__init__.py | 8 +- 13 files changed, 385 insertions(+), 176 deletions(-) create mode 100644 paddle/fluid/memory/memory_stats_test.cc diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 76bb8993cbefa..53e7993945586 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -13,6 +13,7 @@ cc_library(memcpy SRCS memcpy.cc DEPS place device_context) cc_library(stats SRCS stats.cc DEPS enforce) cc_library(memory DEPS malloc memcpy stats) +cc_test(memory_stats_test SRCS memory_stats_test.cc DEPS memory) cc_test(stats_test SRCS stats_test.cc DEPS stats) if (WITH_GPU) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 35ad27f4c62b5..99152607158eb 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -931,10 +931,7 @@ class AllocatorFacadePrivate { void WrapStatAllocator() { for (auto& pair : allocators_) { - // Now memory stats is only supported for GPU - if (platform::is_gpu_place(pair.first)) { - pair.second = std::make_shared(pair.second); - } + pair.second = std::make_shared(pair.second); } } diff --git a/paddle/fluid/memory/allocation/stat_allocator.h b/paddle/fluid/memory/allocation/stat_allocator.h index 71569366c2446..68209bbaabeca 100644 --- a/paddle/fluid/memory/allocation/stat_allocator.h +++ b/paddle/fluid/memory/allocation/stat_allocator.h @@ -30,16 
+30,28 @@ class StatAllocator : public Allocator { protected: void FreeImpl(phi::Allocation* allocation) override { - MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), - -allocation->size()); + if (platform::is_cpu_place(allocation->place())) { + HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + -allocation->size()); + } else { + DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + -allocation->size()); + } + underlying_allocator_->Free(allocation); } phi::Allocation* AllocateImpl(size_t size) override { phi::Allocator::AllocationPtr allocation = underlying_allocator_->Allocate(size); - MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), - allocation->size()); + + if (platform::is_cpu_place(allocation->place())) { + HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + allocation->size()); + } else { + DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + allocation->size()); + } return allocation.release(); } diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 37ac0b4483291..06038804e6efe 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -15,6 +15,8 @@ limitations under the License. */ #include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/memory/stats.h" + #ifdef _WIN32 #include #ifndef NOMINMAX @@ -92,6 +94,8 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) { } } + HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); + return p; } @@ -108,6 +112,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { #else free(p); #endif + + HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size); } bool CPUAllocator::UseGpu() const { return false; } diff --git a/paddle/fluid/memory/memory_stats_test.cc b/paddle/fluid/memory/memory_stats_test.cc new file mode 100644 index 0000000000000..b2fc602e401ed --- /dev/null +++ b/paddle/fluid/memory/memory_stats_test.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
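+//
+// Sanity checks for the host/device "Allocated" memory stats: each loop
+// iteration below allocates a single buffer (released at the end of the
+// iteration), so the stat's current value is expected to equal the size of
+// that live allocation and its peak value to equal the largest allocation
+// seen so far.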
+ +#include "paddle/fluid/memory/memory.h" +#include +#include +#include "gtest/gtest.h" + +namespace paddle { +namespace memory { + +TEST(stat_allocator_test, host_memory_stat_test) { + std::vector alloc_sizes{ + 5278, 9593, 8492, 5041, 3351, 4232, 3706, 5963, 5896, 5057, 7527, + 6235, 0, 7810, 940, 1239, 1945, 789, 2891, 7553, 8046, 2685, + 1332, 6547, 5238, 5345, 1133, 5475, 9137, 3111, 8478, 6350, 9395, + 4, 1185, 2186, 357, 9774, 6743, 6136, 7073, 7674, 5640, 3935, + 528, 6699, 9821, 8717, 2264, 4708, 9936, 3566, 1373, 6955, 3694, + 221, 309, 3617, 3793, 3334, 7281, 1302}; + + int64_t max_alloc_size = 0; + for (int64_t size : alloc_sizes) { + AllocationPtr allocation = Alloc(platform::CPUPlace(), size); + int64_t alloc_size = static_cast(allocation->size()); + max_alloc_size = std::max(max_alloc_size, alloc_size); + EXPECT_EQ(HostMemoryStatCurrentValue("Allocated", 0), alloc_size); + } + EXPECT_EQ(HostMemoryStatPeakValue("Allocated", 0), max_alloc_size); +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +TEST(stat_allocator_test, device_memory_stat_test) { + std::vector alloc_sizes{ + 5278, 9593, 8492, 5041, 3351, 4232, 3706, 5963, 5896, 5057, 7527, + 6235, 0, 7810, 940, 1239, 1945, 789, 2891, 7553, 8046, 2685, + 1332, 6547, 5238, 5345, 1133, 5475, 9137, 3111, 8478, 6350, 9395, + 4, 1185, 2186, 357, 9774, 6743, 6136, 7073, 7674, 5640, 3935, + 528, 6699, 9821, 8717, 2264, 4708, 9936, 3566, 1373, 6955, 3694, + 221, 309, 3617, 3793, 3334, 7281, 1302}; + + int64_t max_alloc_size = 0; + for (int64_t size : alloc_sizes) { + AllocationPtr allocation = Alloc(platform::CUDAPlace(), size); + int64_t alloc_size = static_cast(allocation->size()); + max_alloc_size = std::max(max_alloc_size, alloc_size); + EXPECT_EQ(DeviceMemoryStatCurrentValue("Allocated", 0), alloc_size); + } + EXPECT_EQ(DeviceMemoryStatPeakValue("Allocated", 0), max_alloc_size); +} +#endif + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/stats.cc b/paddle/fluid/memory/stats.cc index 31d776de40702..97197b495f5fc 100644 --- a/paddle/fluid/memory/stats.cc +++ b/paddle/fluid/memory/stats.cc @@ -38,7 +38,7 @@ class StatRegistry { } std::string GetStatKey(const std::string& stat_type, int dev_id) { - return "STAT_Device" + std::to_string(dev_id) + "_" + stat_type; + return stat_type + std::to_string(dev_id); } int64_t GetCurrentValue(const std::string& stat_type, int dev_id) { @@ -49,6 +49,10 @@ class StatRegistry { return GetStat(stat_type, dev_id)->GetPeakValue(); } + void Update(const std::string& stat_type, int dev_id, int64_t increment) { + GetStat(stat_type, dev_id)->Update(increment); + } + void Register(const std::string& stat_type, int dev_id, StatBase* stat) { std::lock_guard lock_guard(stat_map_lock_); stat_map_[GetStatKey(stat_type, dev_id)] = stat; @@ -59,10 +63,6 @@ class StatRegistry { stat_map_.erase(GetStatKey(stat_type, dev_id)); } - void Update(const std::string& stat_type, int dev_id, int64_t increment) { - stat_map_[GetStatKey(stat_type, dev_id)]->Update(increment); - } - private: StatRegistry() = default; @@ -72,43 +72,67 @@ class StatRegistry { SpinLock stat_map_lock_; }; -int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id) { - return StatRegistry::GetInstance()->GetCurrentValue(stat_type, dev_id); +int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) { + return StatRegistry::GetInstance()->GetCurrentValue("Device" + stat_type, + dev_id); } -int64_t StatGetPeakValue(const std::string& stat_type, int dev_id) { - return 
StatRegistry::GetInstance()->GetPeakValue(stat_type, dev_id); +int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id) { + return StatRegistry::GetInstance()->GetPeakValue("Device" + stat_type, + dev_id); } -void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment) { - StatRegistry::GetInstance()->Update(stat_type, dev_id, increment); +void DeviceMemoryStatUpdate(const std::string& stat_type, int dev_id, + int64_t increment) { + StatRegistry::GetInstance()->Update("Device" + stat_type, dev_id, increment); } -#define MEMORY_STAT_REGISTER_WITH_ID(item, id) \ - StatRegistry::GetInstance()->Register( \ - #item, id, Stat::GetInstance()); - -#define MEMORY_STAT_REGISTER(item) \ - MEMORY_STAT_REGISTER_WITH_ID(item, 0); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 1); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 2); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 3); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 4); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 5); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 6); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 7); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 8); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 9); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 10); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 11); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 12); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 13); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 14); \ - MEMORY_STAT_REGISTER_WITH_ID(item, 15) +int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id) { + return StatRegistry::GetInstance()->GetCurrentValue("Host" + stat_type, + dev_id); +} + +int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id) { + return StatRegistry::GetInstance()->GetPeakValue("Host" + stat_type, dev_id); +} + +void HostMemoryStatUpdate(const std::string& stat_type, int dev_id, + int64_t increment) { + StatRegistry::GetInstance()->Update("Host" + stat_type, dev_id, increment); +} + +#define DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, id) \ + StatRegistry::GetInstance()->Register( \ + "Device" #item, id, Stat::GetInstance()); + +#define DEVICE_MEMORY_STAT_REGISTER(item) \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 0); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 1); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 2); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 3); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 4); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 5); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 6); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 7); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 8); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 9); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 10); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 11); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 12); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 13); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 14); \ + DEVICE_MEMORY_STAT_REGISTER_WITH_ID(item, 15) + +#define HOST_MEMORY_STAT_REGISTER(item) \ + StatRegistry::GetInstance()->Register( \ + "Host" #item, 0, Stat::GetInstance()); int RegisterAllStats() { - MEMORY_STAT_REGISTER(Allocated); - MEMORY_STAT_REGISTER(Reserved); + DEVICE_MEMORY_STAT_REGISTER(Allocated); + DEVICE_MEMORY_STAT_REGISTER(Reserved); + + HOST_MEMORY_STAT_REGISTER(Allocated); + HOST_MEMORY_STAT_REGISTER(Reserved); return 0; } diff --git a/paddle/fluid/memory/stats.h b/paddle/fluid/memory/stats.h index b4850a8e9e919..bb6a3cca6644c 100644 --- a/paddle/fluid/memory/stats.h +++ b/paddle/fluid/memory/stats.h @@ -91,82 +91,113 @@ class Stat : public StatBase { 
std::atomic peak_value_{0}; }; -// StatGetCurrentValue, StatGetPeakValue and StatUpdate support to operate STAT -// values by a string, however, they has worse performance than the macro -// function MEMORY_STAT_CURRENT_VALUE, MEMORY_STAT_PEAK_VALUE, and -// MEMORY_STAT_UPDATE. Try to use the macro functions where ultra-low -// performance overhead is required. -int64_t StatGetCurrentValue(const std::string& stat_type, int dev_id); -int64_t StatGetPeakValue(const std::string& stat_type, int dev_id); -void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment); - -#define MEMORY_STAT_FUNC_SWITHCH_CASE(item, id) \ - case id: \ - stat = paddle::memory::Stat< \ - paddle::memory::ThreadLocalStatDevice##id##item>::GetInstance(); \ +// xxxMemoryStatCurrentValue, xxxMemoryStatPeakValue and xxxMemoryStatUpdate +// support to operate STAT values by a string, however, they has worse +// performance than the macro function xxx_MEMORY_STAT_CURRENT_VALUE, +// xxx_MEMORY_STAT_PEAK_VALUE, and xxx_MEMORY_STAT_UPDATE. Try to use the macro +// functions where ultra-low performance overhead is required. +int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id); +int64_t DeviceMemoryStatPeakValue(const std::string& stat_type, int dev_id); +void DeviceMemoryStatUpdate(const std::string& stat_type, int dev_id, + int64_t increment); + +int64_t HostMemoryStatCurrentValue(const std::string& stat_type, int dev_id); +int64_t HostMemoryStatPeakValue(const std::string& stat_type, int dev_id); +void HostMemoryStatUpdate(const std::string& stat_type, int dev_id, + int64_t increment); + +#define DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, id) \ + case id: \ + stat = paddle::memory::Stat< \ + paddle::memory::DeviceMemoryStat##item##id>::GetInstance(); \ break -#define MEMORY_STAT_FUNC(item, id, func, ...) \ - [&] { \ - paddle::memory::StatBase* stat = nullptr; \ - switch (id) { \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 1); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 2); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 3); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 4); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 5); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 6); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 7); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 8); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 9); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 10); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 11); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 12); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 13); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 14); \ - MEMORY_STAT_FUNC_SWITHCH_CASE(item, 15); \ - default: \ - PADDLE_THROW(paddle::platform::errors::OutOfRange( \ - "Only support device id between [0, 15] in memory stats," \ - "not support device id: %d", \ - id)); \ - break; \ - } \ - return stat->func(__VA_ARGS__); \ +#define DEVICE_MEMORY_STAT_FUNC(item, id, func, ...) 
\ + [&] { \ + paddle::memory::StatBase* stat = nullptr; \ + switch (id) { \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 1); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 2); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 3); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 4); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 5); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 6); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 7); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 8); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 9); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 10); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 11); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 12); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 13); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 14); \ + DEVICE_MEMORY_STAT_FUNC_SWITHCH_CASE(item, 15); \ + default: \ + PADDLE_THROW(paddle::platform::errors::OutOfRange( \ + "Only support device id between [0, 15] for device memory stats," \ + "not support device id: %d", \ + id)); \ + break; \ + } \ + return stat->func(__VA_ARGS__); \ }() -#define MEMORY_STAT_CURRENT_VALUE(item, id) \ - MEMORY_STAT_FUNC(item, id, GetCurrentValue) -#define MEMORY_STAT_PEAK_VALUE(item, id) \ - MEMORY_STAT_FUNC(item, id, GetPeakValue) -#define MEMORY_STAT_UPDATE(item, id, increment) \ - MEMORY_STAT_FUNC(item, id, Update, increment) - -#define MEMORY_STAT_DECLARE_WITH_ID(item, id) \ - struct ThreadLocalStatDevice##id##item : public ThreadLocalStatBase {}; - -#define MEMORY_STAT_DECLARE(item) \ - MEMORY_STAT_DECLARE_WITH_ID(item, 0); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 1); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 2); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 3); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 4); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 5); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 6); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 7); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 8); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 9); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 10); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 11); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 12); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 13); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 14); \ - MEMORY_STAT_DECLARE_WITH_ID(item, 15) +#define DEVICE_MEMORY_STAT_CURRENT_VALUE(item, id) \ + DEVICE_MEMORY_STAT_FUNC(item, id, GetCurrentValue) +#define DEVICE_MEMORY_STAT_PEAK_VALUE(item, id) \ + DEVICE_MEMORY_STAT_FUNC(item, id, GetPeakValue) +#define DEVICE_MEMORY_STAT_UPDATE(item, id, increment) \ + DEVICE_MEMORY_STAT_FUNC(item, id, Update, increment) + +#define HOST_MEMORY_STAT_FUNC(item, id, func, ...) 
\ + [&] { \ + PADDLE_ENFORCE_EQ(id, 0, paddle::platform::errors::OutOfRange( \ + "Only support device id 0 for host memory " \ + "stats, not support device id: %d", \ + id)); \ + return paddle::memory::Stat< \ + paddle::memory::HostMemoryStat##item##0>::GetInstance() \ + ->func(__VA_ARGS__); \ + }() + +#define HOST_MEMORY_STAT_CURRENT_VALUE(item, id) \ + HOST_MEMORY_STAT_FUNC(item, id, GetCurrentValue) +#define HOST_MEMORY_STAT_PEAK_VALUE(item, id) \ + HOST_MEMORY_STAT_FUNC(item, id, GetPeakValue) +#define HOST_MEMORY_STAT_UPDATE(item, id, increment) \ + HOST_MEMORY_STAT_FUNC(item, id, Update, increment) + +#define DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, id) \ + struct DeviceMemoryStat##item##id : public ThreadLocalStatBase {} + +#define DEVICE_MEMORY_STAT_DECLARE(item) \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 0); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 1); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 2); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 3); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 4); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 5); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 6); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 7); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 8); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 9); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 10); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 11); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 12); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 13); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 14); \ + DEVICE_MEMORY_STAT_DECLARE_WITH_ID(item, 15) + +// Only support id 0 for host memory stat +#define HOST_MEMORY_STAT_DECLARE(item) \ + struct HostMemoryStat##item##0 : public ThreadLocalStatBase{}; // To add a new STAT type, declare here and register in stats.cc -MEMORY_STAT_DECLARE(Allocated); -MEMORY_STAT_DECLARE(Reserved); +DEVICE_MEMORY_STAT_DECLARE(Allocated); +DEVICE_MEMORY_STAT_DECLARE(Reserved); + +HOST_MEMORY_STAT_DECLARE(Allocated); +HOST_MEMORY_STAT_DECLARE(Reserved); } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/stats_test.cc b/paddle/fluid/memory/stats_test.cc index 436c737916d9f..bcaba8e91080f 100644 --- a/paddle/fluid/memory/stats_test.cc +++ b/paddle/fluid/memory/stats_test.cc @@ -23,50 +23,77 @@ namespace paddle { namespace memory { -TEST(stats_test, MultiThreadReadWriteTest) { - std::string stat_type = "Allocated"; - size_t thread_num = 3; - size_t data_num = 10; - - std::condition_variable cv; - std::mutex mutex; - std::vector threads; - size_t ready_thread_num = 0; - - for (size_t i = 0; i < thread_num; ++i) { - threads.emplace_back( - [&stat_type, data_num, &cv, &mutex, &ready_thread_num]() { - for (size_t data = 0; data < data_num; ++data) { - StatUpdate(stat_type, 0, data); - } - /* lock guard*/ { - std::lock_guard lock_guard{mutex}; - ++ready_thread_num; - cv.notify_one(); - } - // Sleep here to not exit before the main thread checking stat - // results, because the thread-local stat data will be destroyed when - // the thread exit - std::this_thread::sleep_for(std::chrono::seconds(1)); - }); +class StatsTest : public ::testing::Test { + protected: + void SetStatType(const std::string& stat_type) { stat_type_ = stat_type; } + + void SetFunc( + std::function update_func, + std::function current_value_func, + std::function peak_value_func) { + update_func_ = update_func; + current_value_func_ = current_value_func; + peak_value_func_ = peak_value_func; + } + + void RunTests() { + MultiThreadReadWriteTest(); + PeakValueTest(); } - 
std::unique_lock unique_lock(mutex); - cv.wait(unique_lock, [&ready_thread_num, thread_num]() { - return ready_thread_num == thread_num; - }); + private: + void MultiThreadReadWriteTest() { + size_t thread_num = 3; + size_t data_num = 10; + + std::condition_variable cv; + std::mutex mutex; + std::vector threads; + size_t ready_thread_num = 0; + + for (size_t i = 0; i < thread_num; ++i) { + threads.emplace_back([&]() { + for (size_t data = 0; data < data_num; ++data) { + update_func_(stat_type_, 0, data); + } + /* lock guard*/ { + std::lock_guard lock_guard{mutex}; + ++ready_thread_num; + cv.notify_one(); + } + // Sleep here to not exit before the main thread checking stat + // results, because the thread-local stat data will be destroyed when + // the thread exit + std::this_thread::sleep_for(std::chrono::seconds(1)); + }); + } - EXPECT_EQ(StatGetCurrentValue(stat_type, 0), - int64_t((thread_num * data_num * (data_num - 1)) >> 1)); + std::unique_lock unique_lock(mutex); + cv.wait(unique_lock, [&ready_thread_num, thread_num]() { + return ready_thread_num == thread_num; + }); - for (size_t i = 0; i < thread_num; ++i) { - threads[i].join(); + EXPECT_EQ(current_value_func_(stat_type_, 0), + int64_t((thread_num * data_num * (data_num - 1)) >> 1)); + + for (size_t i = 0; i < thread_num; ++i) { + threads[i].join(); + } + } + + void PeakValueTest() { + int64_t peak_value = ((int64_t)1) << 63; + int64_t sum = 0; + for (int64_t data : datas_) { + update_func_(stat_type_, 0, data); + sum += data; + peak_value = std::max(peak_value, sum); + } + EXPECT_EQ(peak_value_func_(stat_type_, 0), peak_value); } -} -TEST(stats_test, PeakValueTest) { - std::string stat_type = "Allocated"; - std::vector datas = { + std::string stat_type_; + std::vector datas_{ 543149808935355, 634698327471328, 706215795436611, 577939367795333, 419479490054362, 21975227714595, 812939817942250, 984428837942082, 537304104446806, 685008544452453, 563352858161268, 690143831596330, @@ -93,14 +120,53 @@ TEST(stats_test, PeakValueTest) { 746465732805300, -74049761897414, -65640372433924, 852009039806484, 305079802044257, -48409757869238, 266031781660228, 327287322379820}; - int64_t peak_value = ((int64_t)1) << 63; - int64_t sum = 0; - for (int64_t data : datas) { - StatUpdate(stat_type, 0, data); - sum += data; - peak_value = std::max(peak_value, sum); - } - EXPECT_EQ(StatGetPeakValue(stat_type, 0), peak_value); + std::function update_func_; + std::function current_value_func_; + std::function peak_value_func_; +}; + +TEST_F(StatsTest, DeviceAllocatedTest) { + SetStatType("Allocated"); + SetFunc(DeviceMemoryStatUpdate, DeviceMemoryStatCurrentValue, + DeviceMemoryStatPeakValue); + RunTests(); +} + +TEST_F(StatsTest, DeviceReservedMacroTest) { + SetStatType("Reserved"); + SetFunc( + [](const std::string stat_type, int id, int64_t increment) { + return DEVICE_MEMORY_STAT_UPDATE(Reserved, id, increment); + }, + [](const std::string stat_type, int id) { + return DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, id); + }, + [](const std::string stat_type, int id) { + return DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, id); + }); + RunTests(); +} + +TEST_F(StatsTest, HostAllocatedMacroTest) { + SetStatType("Allocated"); + SetFunc( + [](const std::string stat_type, int id, int64_t increment) { + return HOST_MEMORY_STAT_UPDATE(Allocated, id, increment); + }, + [](const std::string stat_type, int id) { + return HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, id); + }, + [](const std::string stat_type, int id) { + return HOST_MEMORY_STAT_PEAK_VALUE(Allocated, id); + 
}); + RunTests(); +} + +TEST_F(StatsTest, HostReservedTest) { + SetStatType("Reserved"); + SetFunc(HostMemoryStatUpdate, HostMemoryStatCurrentValue, + HostMemoryStatPeakValue); + RunTests(); } } // namespace memory diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 419fb8a4ca703..3044aa6cf6c5a 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -72,8 +72,10 @@ static inline bool UseFixedWorkspace() { static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) { if (!use_fixed_workspace) { int device_id = platform::GetCurrentDeviceId(); - int64_t allocated = memory::StatGetCurrentValue("Allocated", device_id); - int64_t reserved = memory::StatGetCurrentValue("Reserved", device_id); + int64_t allocated = + memory::DeviceMemoryStatCurrentValue("Allocated", device_id); + int64_t reserved = + memory::DeviceMemoryStatCurrentValue("Reserved", device_id); int64_t availble = platform::GpuAvailableMemToAlloc(); VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated) << " MB, reserved=" << ToMegaBytes(reserved) diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 6da5d1244fbed..5410638ceb39a 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -149,8 +149,8 @@ class RecordedGpuMallocHelper { if (FLAGS_enable_gpu_memory_usage_log) { // A fake UPDATE to trigger the construction of memory stat instances, // make sure that they are destructed after RecordedGpuMallocHelper. - MEMORY_STAT_UPDATE(Reserved, dev_id, 0); - MEMORY_STAT_UPDATE(Allocated, dev_id, 0); + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id, 0); + DEVICE_MEMORY_STAT_UPDATE(Allocated, dev_id, 0); } } @@ -161,15 +161,18 @@ class RecordedGpuMallocHelper { if (FLAGS_enable_gpu_memory_usage_log) { if (FLAGS_enable_gpu_memory_usage_log_mb) { std::cout << "[Memory Usage (MB)] gpu " << dev_id_ << " : Reserved = " - << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) / 1048576.0 + << DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) / + 1048576.0 << ", Allocated = " - << MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) / 1048576.0 + << DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) / + 1048576.0 << std::endl; } else { std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : Reserved = " - << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) + << DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) << ", Allocated = " - << MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) << std::endl; + << DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) + << std::endl; } } } @@ -230,7 +233,7 @@ class RecordedGpuMallocHelper { if (result == gpuSuccess) { cur_size_.fetch_add(size); STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); - MEMORY_STAT_UPDATE(Reserved, dev_id_, size); + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size); #ifdef PADDLE_WITH_TESTING gpu_ptrs.insert(*ptr); @@ -269,7 +272,7 @@ class RecordedGpuMallocHelper { PADDLE_ENFORCE_GPU_SUCCESS(err); cur_size_.fetch_sub(size); STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); - MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); } else { platform::GpuGetLastError(); // clear the error flag when // cudaErrorCudartUnloading / diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 24c515f5b4956..f64e05504aa3f 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ 
b/paddle/fluid/platform/profiler_helper.h @@ -168,8 +168,10 @@ void PrintMemProfiler( if (num_gpus > 0) { std::cout << "GPU Memory Usage (MB):\n"; for (int dev_id = 0; dev_id < num_gpus; ++dev_id) { - int64_t allocated = memory::StatGetCurrentValue("Allocated", dev_id); - int64_t reserved = memory::StatGetCurrentValue("Reserved", dev_id); + int64_t allocated = + memory::DeviceMemoryStatCurrentValue("Allocated", dev_id); + int64_t reserved = + memory::DeviceMemoryStatCurrentValue("Reserved", dev_id); size_t available = 0, total = 0, actual_available = 0, actual_total = 0; RecordedGpuMemGetInfo(&available, &total, &actual_available, &actual_total, dev_id); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f6be9b66d5dbd..0e1271c1fe07f 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -3005,8 +3005,9 @@ All parameter, weight, gradient are variables in Paddle. } return stats_map; }); - m.def("memory_stat_get_current", memory::StatGetCurrentValue); - m.def("memory_stat_get_peak", memory::StatGetPeakValue); + m.def("device_memory_stat_current_value", + memory::DeviceMemoryStatCurrentValue); + m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue); m.def("run_cmd", [](const std::string &cmd, int time_out = -1, int sleep_inter = -1) -> const std::string { diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index b33dc1aaeb086..8cb4f5f765611 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -224,7 +224,7 @@ def max_memory_allocated(device=None): f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." ) device_id = extract_cuda_device_id(device, op_name=name) - return core.memory_stat_get_peak("Allocated", device_id) + return core.device_memory_stat_peak_value("Allocated", device_id) def max_memory_reserved(device=None): @@ -255,7 +255,7 @@ def max_memory_reserved(device=None): f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." ) device_id = extract_cuda_device_id(device, op_name=name) - return core.memory_stat_get_peak("Reserved", device_id) + return core.device_memory_stat_peak_value("Reserved", device_id) def memory_allocated(device=None): @@ -290,7 +290,7 @@ def memory_allocated(device=None): f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." ) device_id = extract_cuda_device_id(device, op_name=name) - return core.memory_stat_get_current("Allocated", device_id) + return core.device_memory_stat_current_value("Allocated", device_id) def memory_reserved(device=None): @@ -321,7 +321,7 @@ def memory_reserved(device=None): f"The API {name} is not supported in CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support to call this API." 
) device_id = extract_cuda_device_id(device, op_name=name) - return core.memory_stat_get_current("Reserved", device_id) + return core.device_memory_stat_current_value("Reserved", device_id) def _set_current_stream(stream): From 905d857ca8c41efca52bc817d9a99892fdf948b3 Mon Sep 17 00:00:00 2001 From: Baibaifan <39549453+Baibaifan@users.noreply.github.com> Date: Fri, 27 May 2022 11:17:37 +0800 Subject: [PATCH 050/109] fix_sharding_timeout (#43002) --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index d6d76e0437061..2918e8501c3d0 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1152,7 +1152,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 200) - set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 200) + set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 350) set_tests_properties(test_dygraph_group_sharded_api PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) From 668e235cef7d1ee20d3a721e430103e508121604 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Fri, 27 May 2022 13:54:39 +0800 Subject: [PATCH 051/109] change einsum_v2 as default and add new flags: FLAG_einsum_opt=1|0 (#43010) --- paddle/fluid/platform/flags.cc | 13 +++++++++++++ paddle/phi/kernels/impl/einsum_impl.h | 6 ++++-- python/paddle/fluid/tests/unittests/test_einsum.py | 3 +++ python/paddle/tensor/einsum.py | 2 +- 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 600a4cbcc3ed9..2fcc573456d42 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -848,3 +848,16 @@ PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait"); * Example: */ PADDLE_DEFINE_EXPORTED_bool(use_autotune, false, "Whether enable autotune."); + +/** + * Preformance related FLAG + * Name: einsum_opt + * Since Version: 2.3.0 + * Value Range: bool, default=false + * Example: + * Note: If True, EinsumOp will be optimimzed by innercache reuse, which + * uses more gpu memory. + */ +PADDLE_DEFINE_EXPORTED_bool( + einsum_opt, false, + "EinsumOp backward will be speedup at the expense of more gpu memory."); diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h index 5e4480426c0cc..bfbd6e0c51cfc 100644 --- a/paddle/phi/kernels/impl/einsum_impl.h +++ b/paddle/phi/kernels/impl/einsum_impl.h @@ -20,6 +20,8 @@ #include "paddle/phi/kernels/transpose_kernel.h" #include "paddle/utils/string/string_helper.h" +DECLARE_bool(einsum_opt); + namespace phi { // check the validation of the Einsum equation. 
@@ -456,7 +458,7 @@ DenseTensor PerformContraction( } // reduction DenseTensor trans_t; - if (use_cache && cache[operand_idx] != nullptr && + if (FLAGS_einsum_opt && use_cache && cache[operand_idx] != nullptr && cache[operand_idx]->IsInitialized()) { trans_t.ShareBufferWith(*(cache[operand_idx])); VLOG(5) << "Cache Used!"; @@ -465,7 +467,7 @@ DenseTensor PerformContraction( dev_ctx, t, perm, all_labels, ellipsis, label2type); trans_t = PerformTranspose( dev_ctx, reduct_t, perm, reordered_all_labels, ellipsis, label2type); - if (cache[operand_idx] != nullptr) + if (FLAGS_einsum_opt && cache[operand_idx] != nullptr) cache[operand_idx]->ShareBufferWith(trans_t); } auto mul_dims = GetShapeByType(all_labels, diff --git a/python/paddle/fluid/tests/unittests/test_einsum.py b/python/paddle/fluid/tests/unittests/test_einsum.py index 43b5ce96a3901..26aaf0f44f1d2 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum.py +++ b/python/paddle/fluid/tests/unittests/test_einsum.py @@ -18,6 +18,9 @@ import paddle from paddle.fluid import core +import os +os.environ['FLAGS_new_einsum'] = "0" + class TestErrors(unittest.TestCase): def setUp(self): diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index 4cdbebb055229..49cc426a00fd9 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -983,7 +983,7 @@ def einsum(equation, *operands): # [0.51476848, 0.23367381, 0.39229113]]]) """ import os - if int(os.environ.get('FLAGS_new_einsum', "0")): + if int(os.environ.get('FLAGS_new_einsum', "1")): return einsum_v2(equation, *operands) nop = len(operands) From 3d9fe71e3c043b715134f9991a85a6afb4cd6423 Mon Sep 17 00:00:00 2001 From: Haipeng Wang Date: Fri, 27 May 2022 14:52:51 +0800 Subject: [PATCH 052/109] experimental nvcc-lazy-module-loading (#43037) * experimental nvcc-lazy-module-loading * remove two empty last line from two files --- CMakeLists.txt | 1 + cmake/experimental.cmake | 17 +++++++ .../cuda_module_loading_lazy.cmake | 40 +++++++++++++++ tools/nvcc_lazy | 49 +++++++++++++++++++ 4 files changed, 107 insertions(+) create mode 100644 cmake/experimental.cmake create mode 100644 cmake/experiments/cuda_module_loading_lazy.cmake create mode 100755 tools/nvcc_lazy diff --git a/CMakeLists.txt b/CMakeLists.txt index 433081ee2256b..f3ed08d56e6d6 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,7 @@ option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" # Note(zhouwei): It use option above, so put here include(init) include(generic) # simplify cmake module +include(experimental) # experimental build options if (WITH_GPU AND WITH_XPU) message(FATAL_ERROR "Error when compile GPU and XPU at the same time") diff --git a/cmake/experimental.cmake b/cmake/experimental.cmake new file mode 100644 index 0000000000000..55e7fe263f9dc --- /dev/null +++ b/cmake/experimental.cmake @@ -0,0 +1,17 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
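+
+# Experimental build options are opt-in; for example, configure with
+# -DEXP_CUDA_MODULE_LOADING_LAZY=ON (inference-only builds on Linux with
+# CUDA < 11.6) and set CUDA_MODULE_LOADING=LAZY at runtime to enable lazy
+# CUDA module loading.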
+ +# this file contains experimental build options + +include(experiments/cuda_module_loading_lazy) diff --git a/cmake/experiments/cuda_module_loading_lazy.cmake b/cmake/experiments/cuda_module_loading_lazy.cmake new file mode 100644 index 0000000000000..ef6a51b594b9e --- /dev/null +++ b/cmake/experiments/cuda_module_loading_lazy.cmake @@ -0,0 +1,40 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# this file contains experimental build options for lazy cuda module loading +# cuda moduel lazy loading is supported by CUDA 11.6+ +# this experiment option makes Paddle supports lazy loading before CUDA 11.6. + +option(EXP_CUDA_MODULE_LOADING_LAZY "enable lazy cuda module loading" OFF) +if (${EXP_CUDA_MODULE_LOADING_LAZY}) + if (NOT ${ON_INFER} OR NOT ${LINUX}) + message("EXP_CUDA_MODULE_LOADING_LAZY only works with ON_INFER=ON on Linux platforms") + return() + endif () + if (NOT ${CUDA_FOUND}) + message("EXP_CUDA_MODULE_LOADING_LAZY only works with CUDA") + return() + endif () + if (${CUDA_VERSION} VERSION_GREATER_EQUAL "11.6") + message("cuda 11.6+ already support lazy module loading") + return() + endif () + + message("for cuda before 11.6, libcudart.so must be used for the lazy module loading trick to work, instead of libcudart_static.a") + set(CUDA_USE_STATIC_CUDA_RUNTIME OFF CACHE BOOL "" FORCE) + set(CMAKE_CUDA_FLAGS "--cudart shared") + enable_language(CUDA) + set(CUDA_NVCC_EXECUTABLE "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy" CACHE FILEPATH "" FORCE) + set(CMAKE_CUDA_COMPILER "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy" CACHE FILEPATH "" FORCE) +endif() diff --git a/tools/nvcc_lazy b/tools/nvcc_lazy new file mode 100755 index 0000000000000..9cb49b04ffaff --- /dev/null +++ b/tools/nvcc_lazy @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
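+
+# This wrapper emulates CUDA lazy module loading for toolkits older than 11.6
+# (newer toolkits support it natively, so the call is simply forwarded to nvcc
+# below). Approach: capture the compilation steps with `nvcc --dryrun`, run
+# them up to the cicc step that emits *.cudafe1.stub.c, patch that stub so
+# module registration (__cudaRegisterAll) is skipped during static
+# initialization when CUDA_MODULE_LOADING=LAZY and is instead triggered from
+# each kernel's __device_stub__ wrapper on first launch, then run the
+# remaining compilation steps.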
+ + +## CUDA_MODULE_LOADING=EAGER,DEFAULT,LAZY + +# check nvcc version, if nvcc >= 11.6, just run nvcc itself +CUDA_VERSION=$(nvcc --version | grep -oP '(?<=cuda_)\d*\.\d*') +CUDA_VERSION_MAJOR=${CUDA_VERSION%.*} +CUDA_VERSION_MINOR=${CUDA_VERSION#*.} +if (( CUDA_VERSION_MAJOR > 11 || (CUDA_VERSION_MAJOR == 11 && CUDA_VERSION_MINOR >= 6) )); then + nvcc "$@" + exit +fi + +BUILDDIR=$(mktemp -d /tmp/nvcc-lazy-build.XXXXXXXX) +echo "$@" > ${BUILDDIR}/args +BUILDSH=${BUILDDIR}/build.sh +/usr/local/cuda/bin/nvcc --dryrun --keep --keep-dir=${BUILDDIR} "$@" 2>&1 | sed -e 's/#\$ //;/^rm/d' > $BUILDSH +sed -i -e '/^\s*--/d' $BUILDSH +sed -ne '1,/^cicc.*cudafe1.stub.c/p' ${BUILDSH} > ${BUILDSH}.pre +sed -e '1,/^cicc.*cudafe1.stub.c/d' ${BUILDSH} > ${BUILDSH}.post + +sed -i -e '/LIBRARIES=/{s/\s//g;s/""/ /g}' ${BUILDSH}.pre + +/usr/bin/env bash ${BUILDSH}.pre +STUBF=$(find $BUILDDIR -name *.cudafe1.stub.c) +CUFILE=$(basename -s '.cudafe1.stub.c' $STUBF) +sed -i -e '/__sti____cudaRegisterAll.*__attribute__/a static void __try____cudaRegisterAll(int);' $STUBF +sed -i -e 's/__sti____cudaRegisterAll\(.*{\)/__do____cudaRegisterAll\1/' $STUBF +# sed -i -e "/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\"CUDA_MODULE_LOADING\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; fprintf(stderr,\"===> ${CUFILE} lazy-load? %d\\\\n\", l); __do____cudaRegisterAll();}" $STUBF +sed -i -e "/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\"CUDA_MODULE_LOADING\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; __do____cudaRegisterAll();}" $STUBF +sed -i -e '/__try____cudaRegisterAll\(.*{\)/a static void __sti____cudaRegisterAll(void){__try____cudaRegisterAll(0);}' $STUBF +sed -i -e 's/{\(__device_stub__\)/{__try____cudaRegisterAll(1);\1/' $STUBF +/usr/bin/env bash ${BUILDSH}.post +rm -rf $BUILDDIR From ba157929e5ff15b69b18aa19cc1ab71c8fdb64bf Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Fri, 27 May 2022 15:14:02 +0800 Subject: [PATCH 053/109] cast no need buffer (#42999) --- python/paddle/utils/code_gen/backward.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 81c211e640735..360425a30ccad 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -229,6 +229,7 @@ kernel : func : cast_grad data_type : out_grad + no_need_buffer : x - backward_api : ceil_grad forward : ceil(Tensor x) -> Tensor(out) From a76f2b33d287d5f7faec7b8fe08eb8d611dc7175 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Fri, 27 May 2022 16:02:59 +0800 Subject: [PATCH 054/109] Refine trunc uinttest logic (#43016) * refine trunc uinttest * refine unittest * refine ut * refine fp64 grad check --- python/paddle/fluid/tests/unittests/test_trunc_op.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_trunc_op.py b/python/paddle/fluid/tests/unittests/test_trunc_op.py index 5bb3e99ee302f..1a6790728b137 100644 --- a/python/paddle/fluid/tests/unittests/test_trunc_op.py +++ b/python/paddle/fluid/tests/unittests/test_trunc_op.py @@ -30,7 +30,7 @@ class TestTruncOp(OpTest): def setUp(self): self.op_type = "trunc" self.python_api = 
paddle.trunc - self.dtype = np.float64 + self.init_dtype_type() np.random.seed(2021) self.inputs = {'X': np.random.random((20, 20)).astype(self.dtype)} self.outputs = {'Out': (np.trunc(self.inputs['X']))} @@ -48,11 +48,19 @@ def test_check_grad(self): class TestFloatTruncOp(TestTruncOp): def init_dtype_type(self): self.dtype = np.float32 + self.__class__.exist_fp64_check_grad = True + + def test_check_grad(self): + pass class TestIntTruncOp(TestTruncOp): def init_dtype_type(self): self.dtype = np.int32 + self.__class__.exist_fp64_check_grad = True + + def test_check_grad(self): + pass class TestTruncAPI(unittest.TestCase): From 2d87300809ae75d76f5b0b457d8112cb88dc3e27 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 27 May 2022 18:07:09 +0800 Subject: [PATCH 055/109] [Dy2Stat]Replace paddle.jit.dy2stat with _jst (#42947) * [Dy2Stat]Replace paddle.jit.dy2stat with _jst * [Dy2Stat]Replace paddle.jit.dy2stat with _jst * refine code style * refine code style --- .../dygraph_to_static/assert_transformer.py | 2 +- .../dygraph_to_static/call_transformer.py | 2 +- .../dygraph_to_static/cast_transformer.py | 4 +- .../dygraph_to_static/ifelse_transformer.py | 2 +- .../dygraph_to_static/list_transformer.py | 4 +- .../dygraph_to_static/logical_transformer.py | 12 ++--- .../dygraph_to_static/loop_transformer.py | 2 +- .../dygraph_to_static/print_transformer.py | 3 +- .../dygraph_to_static/return_transformer.py | 4 +- .../tensor_shape_transformer.py | 10 ++-- .../fluid/dygraph/dygraph_to_static/utils.py | 7 +-- .../dygraph_to_static/variable_trans_func.py | 6 +-- .../dygraph_to_static/test_convert_call.py | 5 +- .../dygraph_to_static/test_origin_info.py | 8 +-- .../test_program_translator.py | 51 +++++++++---------- .../dygraph_to_static/test_tensor_shape.py | 6 ++- 16 files changed, 62 insertions(+), 66 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py index e2fcf4f2c2712..4d5076108cd31 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py @@ -37,7 +37,7 @@ def transform(self): def visit_Assert(self, node): convert_assert_node = gast.parse( - 'paddle.jit.dy2static.convert_assert({test}, {msg})'.format( + '_jst.convert_assert({test}, {msg})'.format( test=ast_to_source_code(node.test), msg=ast_to_source_code(node.msg) if node.msg else "")).body[0].value diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py index a80dfa11402c5..c16d1ff17f707 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py @@ -71,7 +71,7 @@ def visit_Call(self, node): if PDB_SET in func_str: return node - new_func_str = "paddle.jit.dy2static.convert_call({})".format(func_str) + new_func_str = "_jst.convert_call({})".format(func_str) new_func_ast = gast.parse(new_func_str).body[0].value node.func = new_func_ast diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py index ef2d062d2d018..50733e4d896e4 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py @@ -39,8 +39,8 @@ def visit_Call(self, node): func_str = ast_to_source_code(node.func).strip() if 
func_str in self._castable_type and len(node.args) > 0: args_str = ast_to_source_code(node.args[0]).strip() - new_func_str = "paddle.jit.dy2static.convert_var_dtype({}, '{}')".format( - args_str, func_str) + new_func_str = "_jst.convert_var_dtype({}, '{}')".format(args_str, + func_str) new_node = gast.parse(new_func_str).body[0].value return new_node diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py index 8fc5a691d212c..157822430d234 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py @@ -536,7 +536,7 @@ def create_name_nodes(name_ids): return_vars = create_name_nodes(return_name_ids) convert_ifelse_layer = gast.parse( - 'paddle.jit.dy2static.convert_ifelse(' + '_jst.convert_ifelse(' '{pred}, {true_fn}, {false_fn}, {true_args}, {false_args}, {return_vars})'. format( pred=ast_to_source_code(pred), diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py index e62def897d2eb..0951635162e5e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py @@ -129,7 +129,7 @@ def _transform_slice_to_tensor_write(self, node): elif slice_is_num(target_node): value_code = ast_to_source_code(node.value) i = "paddle.cast(" \ - "x=paddle.jit.dy2static.to_static_variable({})," \ + "x=_jst.to_static_variable({})," \ "dtype='int64')".format(ast_to_source_code(slice_node)) assign_code = "{} = paddle.tensor.array_write(x={}, i={}, array={})" \ .format(target_name, value_code, i, target_name) @@ -252,7 +252,7 @@ def _replace_pop(self, node): # 2. pop stmt for a list or dict if len(args_str) == 1 # 3. 
pop stmt for a dict if len(args_str) == 2 if len(args_str) <= 2: - new_pop_str = "paddle.jit.dy2static.convert_pop({}, {})"\ + new_pop_str = "_jst.convert_pop({}, {})"\ .format(target_str, ",".join(args_str)) new_pop_node = gast.parse(new_pop_str).body[0].value return new_pop_node diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py index e5c093f9a9255..bd573521f1b4e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py @@ -57,8 +57,7 @@ def visit_UnaryOp(self, node): self.generic_visit(node) if isinstance(node.op, gast.Not): arg = ast_to_source_code(node.operand) - new_node_str = "paddle.jit.dy2static.convert_logical_not({})".format( - arg) + new_node_str = "_jst.convert_logical_not({})".format(arg) # NOTE: gast.parse returns Module(body=[expr(value=...)]) new_node = gast.parse(new_node_str).body[0].value return new_node @@ -67,13 +66,12 @@ def visit_UnaryOp(self, node): def visit_Compare(self, node): self.generic_visit(node) left_str = ast_to_source_code(node.left).strip() - if left_str.startswith("paddle.jit.dy2static.convert_var_shape"): + if left_str.startswith("_jst.convert_var_shape"): # check left and comparators are all converted var shape compare_arg_strs = left_str for i, comparator in enumerate(node.comparators): comparator_str = ast_to_source_code(comparator).strip() - if not comparator_str.startswith( - "paddle.jit.dy2static.convert_var_shape"): + if not comparator_str.startswith("_jst.convert_var_shape"): return node op_str = cmpop_node_to_str(node.ops[i]) compare_arg_strs += (", '" + op_str + "', " + comparator_str) @@ -81,7 +79,7 @@ def visit_Compare(self, node): # Now all left and comparators are converted shape # Replace some comparsion operation because of difference between # Python and Paddle - new_node_str = "paddle.jit.dy2static.convert_shape_compare({})".format( + new_node_str = "_jst.convert_shape_compare({})".format( compare_arg_strs) new_node = gast.parse(new_node_str).body[0].value return new_node @@ -119,7 +117,7 @@ def _create_bool_op_node(self, nodes, api_type): nodes = [pre_logic_node] + [post_logic_node] args = [ast_to_source_code(child) for child in nodes] - new_node_str = "paddle.jit.dy2static.convert_logical_{}(lambda:{}, lambda:{})".format( + new_node_str = "_jst.convert_logical_{}(lambda:{}, lambda:{})".format( api_type, args[0], args[1]) # NOTE: gast.parse return Module(body=[expr(...)]) new_node = gast.parse(new_node_str).body[0].value diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py index 4e5a3f7b70851..8014a00bff983 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py @@ -89,7 +89,7 @@ def create_while_nodes(condition_name, body_name, loop_var_names): else: assign_loop_var_names.append(name) - while_func_name = "paddle.jit.dy2static.convert_while_loop" + while_func_name = "_jst.convert_while_loop" while_node_str = "[{}] = {}({}, {}, [{}])".format( ",".join(assign_loop_var_names), while_func_name, condition_name, body_name, ",".join(loop_var_names)) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py index 7960617369e3f..f045d01c99bab 100644 --- 
a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py @@ -50,6 +50,5 @@ def visit_Print(self, node): return gast.Expr(value=convert_print_node) def _create_print_node(self, print_args): - convert_print_func = gast.parse( - 'paddle.jit.dy2static.convert_print').body[0].value + convert_print_func = gast.parse('_jst.convert_print').body[0].value return gast.Call(func=convert_print_func, args=print_args, keywords=[]) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py index 0c7a8bf421a12..8ac659dbead99 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py @@ -336,7 +336,7 @@ def _replace_return_in_stmt_list(self, stmt_list, return_node, return_name, # Here assume that the parent node of return is gast.If if isinstance(parent_node_of_return, gast.If): # Prepend control flow boolean nodes such as '__return@1 = True' - node_str = "{} = paddle.jit.dy2static.create_bool_as_type({}, True)".format( + node_str = "{} = _jst.create_bool_as_type({}, True)".format( return_name, ast_to_source_code(parent_node_of_return.test).strip()) @@ -449,7 +449,7 @@ def _replace_after_node_to_if_in_stmt_list( # Here assume that the parent node of return is gast.If if isinstance(parent_node_of_return, gast.If): # Prepend control flow boolean nodes such as '__return@1 = False' - node_str = "{} = paddle.jit.dy2static.create_bool_as_type({}, False)".format( + node_str = "{} = _jst.create_bool_as_type({}, False)".format( return_name, ast_to_source_code(parent_node_of_return.test).strip()) assign_false_node = gast.parse(node_str).body[0] diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py index 7733226cc09f2..d5b23d2f53b1c 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py @@ -42,7 +42,7 @@ def create_convert_shape_node(var_shape_node, if slice_node is not None and slice_is_num(slice_node): args.append(ast_to_source_code(slice_node.slice).strip()) - convert_var_shape_func = "paddle.jit.dy2static.convert_var_shape({}, in_control_flow={})".format( + convert_var_shape_func = "_jst.convert_var_shape({}, in_control_flow={})".format( ",".join(args), in_control_flow) api_shape_node = gast.parse(convert_var_shape_func).body[0].value @@ -59,14 +59,14 @@ def create_convert_shape_node(var_shape_node, def create_choose_shape_node(attr_shape_name, api_shape_name, slice_node=None): - eval_exist_func = "paddle.jit.dy2static.eval_if_exist_else_none('{}', globals())".format( + eval_exist_func = "_jst.eval_if_exist_else_none('{}', globals())".format( api_shape_name) args = [attr_shape_name, eval_exist_func] if slice_node is not None and slice_is_num(slice_node): args.append(ast_to_source_code(slice_node.slice).strip()) - choose_shape_func = "paddle.jit.dy2static.choose_shape_attr_or_api({})".format( - ",".join(args)) + choose_shape_func = "_jst.choose_shape_attr_or_api({})".format(",".join( + args)) choose_shape_node = gast.parse(choose_shape_func).body[0].value if slice_node is not None and not slice_is_num(slice_node): return gast.Subscript( @@ -84,7 +84,7 @@ class ShapeAttributeTransformer(gast.NodeTransformer): def 
visit_Attribute(self, node): if node.attr == 'shape': args = ast_to_source_code(node.value).strip() - convert_var_shape_func = "paddle.jit.dy2static.convert_var_shape_simple({})".format( + convert_var_shape_func = "_jst.convert_var_shape_simple({})".format( args) api_shape_node = gast.parse(convert_var_shape_func).body[0].value return api_shape_node diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index d440e387da597..91c2c5dc65aab 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -185,6 +185,7 @@ def is_api_in_module(node, module_prefix): import paddle.fluid as fluid import paddle.fluid.dygraph as dygraph import paddle.fluid.layers as layers + import paddle.jit.dy2static as _jst from paddle.fluid.dygraph import to_variable from paddle import to_tensor @@ -521,8 +522,8 @@ def remove_if_exit(filepath): def _inject_import_statements(): import_statements = [ "import paddle", "from paddle import Tensor", - "import paddle.fluid as fluid", "from typing import *", - "import numpy as np" + "import paddle.fluid as fluid", "import paddle.jit.dy2static as _jst", + "from typing import *", "import numpy as np" ] return '\n'.join(import_statements) + '\n' @@ -1168,7 +1169,7 @@ def _build_var_len_assign_node(self): else: iter_var_name = ast_to_source_code(self.iter_node).strip() - convert_len_node_source_str = '{} = paddle.jit.dy2static.convert_len({})'.format( + convert_len_node_source_str = '{} = _jst.convert_len({})'.format( self.iter_var_len_name, iter_var_name) convert_len_node = gast.parse(convert_len_node_source_str).body[0] diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py index 2cd6c5e43f7e1..7ce5aede4995d 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py @@ -77,14 +77,12 @@ def data_layer_not_check(name, shape, dtype='float32', lod_level=0): def to_static_variable_gast_node(name): - func_code = "{} = paddle.jit.dy2static.to_static_variable({})".format(name, - name) + func_code = "{} = _jst.to_static_variable({})".format(name, name) return gast.parse(func_code).body[0] def create_static_variable_gast_node(name): - func_code = "{} = paddle.jit.dy2static\ - .data_layer_not_check(name='{}', shape=[-1], dtype='float32')".format( + func_code = "{} = _jst.data_layer_not_check(name='{}', shape=[-1], dtype='float32')".format( name, unique_name.generate(name)) return gast.parse(func_code).body[0] diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py index fb918f4ae00ed..2e2918facf896 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py @@ -24,6 +24,7 @@ from paddle.fluid.dygraph import ProgramTranslator from paddle.fluid.dygraph.dygraph_to_static.convert_call_func import CONVERSION_OPTIONS from test_program_translator import get_source_code +import paddle.jit.dy2static as _jst program_translator = ProgramTranslator() @@ -255,7 +256,7 @@ def _get_answer_code(self): return get_source_code(self.answer_func) def _get_transformed_code(self): - transformed_func = paddle.jit.dy2static.convert_call(self.func) + transformed_func 
= _jst.convert_call(self.func) return get_source_code(transformed_func) def test_code(self): @@ -275,7 +276,7 @@ def set_func(self): def set_answer_func(self): class StaticCode(): def func_convert_then_not_to_static(x): - y = paddle.jit.dy2static.convert_call(func_not_to_static)(x) + y = _jst.convert_call(func_not_to_static)(x) return y self.answer_func = StaticCode.func_convert_then_not_to_static diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py index e3d34184a38fc..8dac888993590 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py @@ -65,7 +65,7 @@ def set_test_func(self): self.func = simple_func def set_static_lineno(self): - self.static_abs_lineno_list = [6, 7, 8] + self.static_abs_lineno_list = [7, 8, 9] def set_dygraph_info(self): self.line_num = 3 @@ -149,7 +149,7 @@ def set_test_func(self): self.func = nested_func def set_static_lineno(self): - self.static_abs_lineno_list = [6, 8, 9, 10, 11] + self.static_abs_lineno_list = [7, 9, 10, 11, 12] def set_dygraph_info(self): self.line_num = 5 @@ -174,7 +174,7 @@ def set_test_func(self): self.func = decorated_func def set_static_lineno(self): - self.static_abs_lineno_list = [6, 7] + self.static_abs_lineno_list = [7, 8] def set_dygraph_info(self): self.line_num = 2 @@ -208,7 +208,7 @@ def set_test_func(self): self.func = decorated_func2 def set_static_lineno(self): - self.static_abs_lineno_list = [6, 7] + self.static_abs_lineno_list = [7, 8] def set_dygraph_info(self): self.line_num = 2 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py index b0ffbac88fb42..4e90c73baa944 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py @@ -27,6 +27,7 @@ from paddle.fluid.dygraph.jit import declarative from paddle.fluid.dygraph.nn import Linear from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code +import paddle.jit.dy2static as _jst from ifelse_simple_func import dyfunc_with_if_else @@ -76,40 +77,38 @@ def false_fn_0(x_v): x_v = x_v + 1 return x_v - x_v = paddle.jit.dy2static.convert_ifelse( + x_v = _jst.convert_ifelse( fluid.layers.mean(x_v)[0] > 5, true_fn_0, false_fn_0, (x_v, ), (x_v, ), (x_v, )) - __return_0 = paddle.jit.dy2static.create_bool_as_type(label is not None, - False) + __return_0 = _jst.create_bool_as_type(label is not None, False) def true_fn_1(__return_0, __return_value_0, label, x_v): loss = fluid.layers.cross_entropy(x_v, label) - __return_0 = paddle.jit.dy2static.create_bool_as_type( - label is not None, True) + __return_0 = _jst.create_bool_as_type(label is not None, True) __return_value_0 = loss return __return_0, __return_value_0 def false_fn_1(__return_0, __return_value_0): return __return_0, __return_value_0 - __return_0, __return_value_0 = (paddle.jit.dy2static.convert_ifelse( + __return_0, __return_value_0 = _jst.convert_ifelse( label is not None, true_fn_1, false_fn_1, (__return_0, __return_value_0, label, x_v), - (__return_0, __return_value_0), (__return_0, __return_value_0))) + (__return_0, __return_value_0), (__return_0, __return_value_0)) def true_fn_2(__return_0, __return_value_0, x_v): - __return_1 = 
paddle.jit.dy2static.create_bool_as_type( - paddle.jit.dy2static.convert_logical_not(__return_0), True) + __return_1 = _jst.create_bool_as_type( + _jst.convert_logical_not(__return_0), True) __return_value_0 = x_v return __return_value_0 def false_fn_2(__return_value_0): return __return_value_0 - __return_value_0 = paddle.jit.dy2static.convert_ifelse( - paddle.jit.dy2static.convert_logical_not(__return_0), true_fn_2, - false_fn_2, (__return_0, __return_value_0, - x_v), (__return_value_0, ), (__return_value_0, )) + __return_value_0 = _jst.convert_ifelse( + _jst.convert_logical_not(__return_0), true_fn_2, false_fn_2, + (__return_0, __return_value_0, + x_v), (__return_value_0, ), (__return_value_0, )) return __return_value_0 @@ -128,40 +127,38 @@ def false_fn_3(x_v): x_v = x_v + 1 return x_v - x_v = paddle.jit.dy2static.convert_ifelse( + x_v = _jst.convert_ifelse( fluid.layers.mean(x_v)[0] > 5, true_fn_3, false_fn_3, (x_v, ), (x_v, ), (x_v, )) - __return_2 = paddle.jit.dy2static.create_bool_as_type(label is not None, - False) + __return_2 = _jst.create_bool_as_type(label is not None, False) def true_fn_4(__return_2, __return_value_1, label, x_v): loss = fluid.layers.cross_entropy(x_v, label) - __return_2 = paddle.jit.dy2static.create_bool_as_type( - label is not None, True) + __return_2 = _jst.create_bool_as_type(label is not None, True) __return_value_1 = loss return __return_2, __return_value_1 def false_fn_4(__return_2, __return_value_1): return __return_2, __return_value_1 - __return_2, __return_value_1 = paddle.jit.dy2static.convert_ifelse( - label is not None, true_fn_4, false_fn_4, ( - __return_2, __return_value_1, label, x_v), + __return_2, __return_value_1 = _jst.convert_ifelse( + label is not None, true_fn_4, false_fn_4, + (__return_2, __return_value_1, label, x_v), (__return_2, __return_value_1), (__return_2, __return_value_1)) def true_fn_5(__return_2, __return_value_1, x_v): - __return_3 = paddle.jit.dy2static.create_bool_as_type( - paddle.jit.dy2static.convert_logical_not(__return_2), True) + __return_3 = _jst.create_bool_as_type( + _jst.convert_logical_not(__return_2), True) __return_value_1 = x_v return __return_value_1 def false_fn_5(__return_value_1): return __return_value_1 - __return_value_1 = paddle.jit.dy2static.convert_ifelse( - paddle.jit.dy2static.convert_logical_not(__return_2), true_fn_5, - false_fn_5, (__return_2, __return_value_1, - x_v), (__return_value_1, ), (__return_value_1, )) + __return_value_1 = _jst.convert_ifelse( + _jst.convert_logical_not(__return_2), true_fn_5, false_fn_5, + (__return_2, __return_value_1, + x_v), (__return_value_1, ), (__return_value_1, )) return __return_value_1 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py index d05be03bbfb19..5cf9d7749c358 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py @@ -597,9 +597,11 @@ def test(self): class TestPaddleShape(unittest.TestCase): def test_paddle_shape(self): func = paddle.jit.to_static(dyfunc_len_paddle_shape) - self.assertEqual('paddle.shape(x)' in func.code, True) + func_code = func.code.replace("\n", "").replace(" ", "") + self.assertEqual('paddle.shape(x)' in func_code, True) func = paddle.jit.to_static(dyfunc_dict_assign_shape) - self.assertEqual("__static_convert_var_shape_suffix" in func.code, True) + func_code = func.code.replace("\n", "").replace(" ", 
"") + self.assertEqual("__static_convert_var_shape_suffix" in func_code, True) if __name__ == '__main__': From 6d78524c27732fdc4f3505815d392d8f24b2dca8 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Fri, 27 May 2022 20:47:18 +0800 Subject: [PATCH 056/109] [Phi] Change optional tensor from `optional` to `optional` (#42939) * refactor the optional tensor * remove optiona in InferMeta * fix bug * fix optional> * fix bug * fix rmsprop * fix amp of eager_gen * polish code * fix deleted code * fix merge conflict * polish code * remove is_nullopt_ * fix merge conflict * fix merge conflict --- .../final_state_generator/eager_gen.py | 21 +-- paddle/fluid/eager/eager_amp_auto_cast.h | 14 +- paddle/fluid/eager/utils.cc | 2 +- paddle/fluid/eager/utils.h | 2 +- paddle/fluid/framework/infershape_utils.cc | 16 +- paddle/fluid/framework/infershape_utils.h | 12 +- paddle/fluid/framework/operator.cc | 11 +- paddle/fluid/imperative/prepared_operator.h | 8 +- .../operators/fused/fused_dropout_test.h | 4 +- paddle/fluid/operators/inplace_abn_op.cc | 6 +- paddle/fluid/operators/inplace_abn_op.cu | 6 +- .../operators/optimizers/dgc_momentum_op.h | 9 +- paddle/fluid/pybind/eager_utils.cc | 4 +- paddle/fluid/pybind/eager_utils.h | 2 +- paddle/phi/api/lib/api_custom_impl.cc | 165 ++++-------------- paddle/phi/api/lib/api_custom_impl.h | 12 +- paddle/phi/api/lib/api_gen_utils.cc | 24 +-- paddle/phi/api/lib/api_gen_utils.h | 14 +- paddle/phi/api/lib/data_transform.cc | 17 +- paddle/phi/api/lib/data_transform.h | 7 +- paddle/phi/api/lib/kernel_dispatch.h | 4 +- paddle/phi/core/infermeta_utils.cc | 14 +- paddle/phi/core/infermeta_utils.h | 25 +-- paddle/phi/core/kernel_context.h | 16 +- paddle/phi/core/kernel_registry.h | 10 +- paddle/phi/core/kernel_utils.h | 6 +- paddle/phi/core/meta_tensor.h | 16 +- paddle/phi/infermeta/backward.cc | 35 ++-- paddle/phi/infermeta/backward.h | 33 ++-- paddle/phi/infermeta/binary.cc | 16 +- paddle/phi/infermeta/binary.h | 6 +- paddle/phi/infermeta/multiary.cc | 125 +++++++------ paddle/phi/infermeta/multiary.h | 67 ++++--- paddle/phi/infermeta/ternary.cc | 68 ++++---- paddle/phi/infermeta/ternary.h | 18 +- paddle/phi/kernels/activation_grad_kernel.h | 2 +- paddle/phi/kernels/adam_kernel.h | 4 +- paddle/phi/kernels/adamw_kernel.h | 4 +- paddle/phi/kernels/assign_kernel.cc | 2 +- paddle/phi/kernels/assign_kernel.h | 2 +- paddle/phi/kernels/batch_norm_grad_kernel.h | 16 +- .../kernels/bilinear_tensor_product_kernel.h | 2 +- paddle/phi/kernels/bincount_kernel.h | 2 +- paddle/phi/kernels/conv_grad_grad_kernel.h | 8 +- paddle/phi/kernels/cpu/adam_kernel.cc | 4 +- paddle/phi/kernels/cpu/adamw_kernel.cc | 4 +- .../phi/kernels/cpu/batch_norm_grad_kernel.cc | 16 +- paddle/phi/kernels/cpu/bincount_kernel.cc | 4 +- .../phi/kernels/cpu/conv_grad_grad_kernel.cc | 4 +- paddle/phi/kernels/cpu/dropout_kernel.cc | 2 +- .../cpu/elementwise_add_grad_kernel.cc | 4 +- .../cpu/elementwise_subtract_grad_kernel.cc | 4 +- .../phi/kernels/cpu/graph_reindex_kernel.cc | 4 +- .../cpu/graph_sample_neighbors_kernel.cc | 4 +- .../cpu/graph_send_recv_grad_kernel.cc | 4 +- .../kernels/cpu/hierarchical_sigmoid_grad.h | 6 +- .../cpu/hierarchical_sigmoid_grad_kernel.cc | 6 +- .../cpu/hierarchical_sigmoid_kernel.cc | 6 +- .../kernels/cpu/instance_norm_grad_kernel.cc | 10 +- .../phi/kernels/cpu/instance_norm_kernel.cc | 4 +- .../kernels/cpu/interpolate_grad_kernel.cc | 54 +++--- paddle/phi/kernels/cpu/interpolate_kernel.cc | 54 +++--- paddle/phi/kernels/cpu/label_smooth_kernel.cc | 2 +- 
.../phi/kernels/cpu/layer_norm_grad_kernel.cc | 4 +- paddle/phi/kernels/cpu/layer_norm_kernel.cc | 4 +- .../phi/kernels/cpu/nll_loss_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/nll_loss_kernel.cc | 2 +- .../phi/kernels/cpu/psroi_pool_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/psroi_pool_kernel.cc | 2 +- paddle/phi/kernels/cpu/rnn_grad_kernel.cc | 4 +- paddle/phi/kernels/cpu/rnn_kernel.cc | 2 +- .../phi/kernels/cpu/roi_align_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/roi_align_kernel.cc | 2 +- .../phi/kernels/cpu/roi_pool_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/roi_pool_kernel.cc | 2 +- paddle/phi/kernels/cpu/sgd_kernel.cc | 6 +- .../kernels/cpu/yolov3_loss_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/yolov3_loss_kernel.cc | 2 +- .../phi/kernels/deformable_conv_grad_kernel.h | 2 +- paddle/phi/kernels/deformable_conv_kernel.h | 2 +- paddle/phi/kernels/dropout_kernel.h | 2 +- .../phi/kernels/elementwise_add_grad_kernel.h | 4 +- .../kernels/elementwise_divide_grad_kernel.h | 4 +- .../elementwise_multiply_grad_kernel.h | 10 +- .../elementwise_subtract_grad_kernel.h | 4 +- paddle/phi/kernels/expand_as_kernel.h | 2 +- paddle/phi/kernels/funcs/pooling.cu | 2 +- paddle/phi/kernels/funcs/segment_pooling.cc | 2 +- paddle/phi/kernels/funcs/segment_pooling.cu | 2 +- paddle/phi/kernels/funcs/segment_pooling.h | 2 +- paddle/phi/kernels/gpu/adam_kernel.cu | 4 +- paddle/phi/kernels/gpu/adamw_kernel.cu | 4 +- .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 16 +- paddle/phi/kernels/gpu/bincount_kernel.cu | 4 +- paddle/phi/kernels/gpu/dropout_kernel.cu | 2 +- .../gpu/elementwise_add_grad_kernel.cu | 4 +- .../gpu/elementwise_subtract_grad_kernel.cu | 4 +- .../phi/kernels/gpu/graph_reindex_kernel.cu | 4 +- .../gpu/graph_sample_neighbors_kernel.cu | 4 +- .../gpu/graph_send_recv_grad_kernel.cu | 4 +- .../kernels/gpu/instance_norm_grad_kernel.cu | 10 +- .../phi/kernels/gpu/instance_norm_kernel.cu | 4 +- .../kernels/gpu/interpolate_grad_kernel.cu | 54 +++--- paddle/phi/kernels/gpu/interpolate_kernel.cu | 54 +++--- paddle/phi/kernels/gpu/label_smooth_kernel.cu | 2 +- .../phi/kernels/gpu/layer_norm_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/layer_norm_kernel.cu | 4 +- .../phi/kernels/gpu/nll_loss_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/nll_loss_kernel.cu | 2 +- .../phi/kernels/gpu/psroi_pool_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/psroi_pool_kernel.cu | 2 +- paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc | 2 +- paddle/phi/kernels/gpu/rnn_kernel.cu.cc | 2 +- .../phi/kernels/gpu/roi_align_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/roi_align_kernel.cu | 2 +- .../phi/kernels/gpu/roi_pool_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/roi_pool_kernel.cu | 2 +- paddle/phi/kernels/gpu/sgd_kernel.cu | 6 +- .../kernels/gpudnn/conv_grad_grad_kernel.cu | 12 +- paddle/phi/kernels/graph_reindex_kernel.h | 4 +- .../kernels/graph_sample_neighbors_kernel.h | 4 +- .../phi/kernels/graph_send_recv_grad_kernel.h | 4 +- .../hierarchical_sigmoid_grad_kernel.h | 6 +- .../phi/kernels/hierarchical_sigmoid_kernel.h | 6 +- .../phi/kernels/impl/activation_grad_impl.h | 2 +- .../bilinear_tensor_product_kernel_impl.h | 2 +- .../kernels/impl/conv_grad_grad_kernel_impl.h | 4 +- .../impl/deformable_conv_grad_kernel_impl.h | 2 +- .../impl/deformable_conv_kernel_impl.h | 2 +- .../impl/elementwise_grad_kernel_impl.h | 22 +-- .../phi/kernels/impl/expand_as_kernel_impl.h | 2 +- .../kernels/impl/matmul_grad_kernel_impl.h | 14 +- .../phi/kernels/impl/momentum_kernel_impl.h | 8 +- paddle/phi/kernels/impl/rmsprop_kernel_impl.h | 57 ++++-- 
.../impl/segment_pool_grad_kernel_impl.h | 2 +- .../kernels/impl/warpctc_grad_kernel_impl.h | 2 +- paddle/phi/kernels/impl/warpctc_kernel_impl.h | 4 +- .../phi/kernels/instance_norm_grad_kernel.h | 10 +- paddle/phi/kernels/instance_norm_kernel.h | 4 +- paddle/phi/kernels/interpolate_grad_kernel.h | 6 +- paddle/phi/kernels/interpolate_kernel.h | 34 ++-- paddle/phi/kernels/label_smooth_kernel.h | 2 +- paddle/phi/kernels/layer_norm_grad_kernel.h | 4 +- paddle/phi/kernels/layer_norm_kernel.h | 4 +- paddle/phi/kernels/matmul_grad_kernel.h | 14 +- paddle/phi/kernels/momentum_kernel.h | 4 +- paddle/phi/kernels/nll_loss_grad_kernel.h | 2 +- paddle/phi/kernels/nll_loss_kernel.cc | 2 +- paddle/phi/kernels/nll_loss_kernel.h | 2 +- paddle/phi/kernels/psroi_pool_grad_kernel.h | 2 +- paddle/phi/kernels/psroi_pool_kernel.h | 2 +- paddle/phi/kernels/rmsprop_kernel.h | 4 +- paddle/phi/kernels/rnn_grad_kernel.h | 2 +- paddle/phi/kernels/rnn_kernel.h | 2 +- paddle/phi/kernels/roi_align_grad_kernel.h | 2 +- paddle/phi/kernels/roi_align_kernel.h | 2 +- paddle/phi/kernels/roi_pool_grad_kernel.h | 2 +- paddle/phi/kernels/roi_pool_kernel.h | 2 +- paddle/phi/kernels/segment_pool_grad_kernel.h | 2 +- .../phi/kernels/selected_rows/adam_kernel.h | 4 +- .../phi/kernels/selected_rows/adamw_kernel.h | 4 +- .../kernels/selected_rows/assign_kernel.cc | 2 +- .../kernels/selected_rows/cpu/adam_kernel.cc | 4 +- .../kernels/selected_rows/cpu/adamw_kernel.cc | 4 +- .../kernels/selected_rows/gpu/adam_kernel.cu | 4 +- .../kernels/selected_rows/gpu/adamw_kernel.cu | 4 +- .../hierarchical_sigmoid_grad_kernel.cc | 6 +- .../hierarchical_sigmoid_grad_kernel.h | 6 +- paddle/phi/kernels/sgd_kernel.h | 6 +- paddle/phi/kernels/warpctc_grad_kernel.h | 2 +- paddle/phi/kernels/warpctc_kernel.h | 4 +- paddle/phi/kernels/yolov3_loss_grad_kernel.h | 2 +- paddle/phi/kernels/yolov3_loss_kernel.h | 2 +- python/paddle/utils/code_gen/api_base.py | 27 +-- python/paddle/utils/code_gen/type_mapping.py | 2 +- .../utils/code_gen/wrapped_infermeta_gen.py | 5 +- 176 files changed, 785 insertions(+), 911 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 403216813dd36..d8b909c3bacc1 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -345,14 +345,14 @@ class {} : public egr::GradNodeBase {{ CREATE_PLAIN_OPTIONAL_TENSOR_TEMPLATE = \ """ - paddle::optional {}_optional = paddle::none; - if({}.initialized()) {}_optional = paddle::make_optional({}); + paddle::optional {}_optional; + if({}.initialized()) {}_optional = paddle::make_optional({}); """ CREATE_RECOVER_OPTIONAL_TENSOR_TEMPLATE = \ """ - paddle::optional {}_optional = paddle::none; - if( {}.impl() ) {}_optional = paddle::make_optional({}); + paddle::optional {}_optional; + if( {}.impl() ) {}_optional = paddle::make_optional({}); """ CHECK_BACKWARD_INPLACE_TEMPLATE = \ @@ -713,7 +713,7 @@ def GenerateNodeCreationCodes(self): if is_fwd_input: if is_optional: - set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()));" + set_tensor_wrappers = f"{indent}if({name}) grad_node->SetTensorWrapper{name}(*{name});" else: set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name});" set_input_tensor_wrappers_list.append(set_tensor_wrappers) @@ -724,7 +724,7 @@ def 
GenerateNodeCreationCodes(self): ), AssertMessage(name, forward_outputs_position_map.keys()) if is_optional: - set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()));" + set_tensor_wrappers = f"{indent}if({name}) grad_node->SetTensorWrapper{name}(*{name});" else: set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name});" set_output_tensor_wrappers_list.append(set_tensor_wrappers) @@ -888,15 +888,12 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): is_optional = (name in optional_inputs) if IsPlainTensorType(ttype): if is_optional: - arg_str = f"const paddle::optional {name}" + arg_str = f"const paddle::optional& {name}" amp_tensors_vector_optional_list.append( - f"if ({name}.get_ptr() != nullptr) amp_tensors_vector.push_back({{ *({name}.get_ptr()) }});\n" + f"if ({name}) amp_tensors_vector.push_back({{ *{name} }});\n" ) amp_autocast_optional_list.append( - f"auto NEW_{name}_temp_tensor = ({name}.get_ptr() != nullptr) ? egr::EagerAmpAutoCast(\"{name}\", *({name}.get_ptr()), amp_dst_dtype, op_name) : paddle::experimental::Tensor();\n" - ) - amp_autocast_optional_list.append( - f"auto NEW_{name} = ({name}.get_ptr() != nullptr) ? paddle::make_optional(NEW_{name}_temp_tensor) : {name};\n" + f"auto NEW_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name);\n" ) else: if is_inplaced and forward_inplace_map and name in forward_inplace_map.keys( diff --git a/paddle/fluid/eager/eager_amp_auto_cast.h b/paddle/fluid/eager/eager_amp_auto_cast.h index ee9da41881b2d..c3fe3551ccb28 100644 --- a/paddle/fluid/eager/eager_amp_auto_cast.h +++ b/paddle/fluid/eager/eager_amp_auto_cast.h @@ -60,7 +60,8 @@ inline std::vector EagerAmpAutoCasts( inline paddle::experimental::Tensor EagerAmpAutoCast( const std::string& input_name, const paddle::experimental::Tensor& input, - const paddle::experimental::DataType& dst_dtype, std::string op_name) { + const paddle::experimental::DataType& dst_dtype, + const std::string& op_name) { VLOG(6) << "AMP AmpAutoCasts:" << " input(" << input_name << ") dst_dtype(" << paddle::framework::DataType2String(dst_dtype) << ")."; @@ -87,4 +88,15 @@ inline paddle::experimental::Tensor EagerAmpAutoCast( return input; } +inline paddle::optional EagerAmpAutoCast( + const std::string& input_name, + const paddle::optional& input, + const paddle::experimental::DataType& dst_dtype, + const std::string& op_name) { + if (input) { + return EagerAmpAutoCast(input_name, *input, dst_dtype, op_name); + } + return paddle::none; +} + } // namespace egr diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index d22f4316d5604..9ccd91ca65733 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -73,7 +73,7 @@ AutogradMeta* EagerUtils::nullable_autograd_meta( } AutogradMeta* EagerUtils::nullable_autograd_meta( - paddle::optional target) { + const paddle::optional& target) { if (target.get_ptr() != nullptr) { return EagerUtils::nullable_autograd_meta(*(target.get_ptr())); } diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index 7f5864ec887ca..63baebca53c37 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -125,7 +125,7 @@ class EagerUtils { static AutogradMeta* nullable_autograd_meta( const paddle::experimental::Tensor& target); static AutogradMeta* nullable_autograd_meta( - paddle::optional target); + const paddle::optional& target); static std::vector nullable_autograd_meta( const std::vector& targets); 
static std::vector nullable_autograd_meta( diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 2a8ffbf431ecd..d7901a83b8502 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -349,14 +349,6 @@ const phi::MetaTensor& CompatInferMetaContext::InputAt(size_t idx) const { return compat_inputs_.at(idx); } -paddle::optional -CompatInferMetaContext::OptionalInputAt(size_t idx) const { - const auto& input = compat_inputs_.at(idx); - return input.initialized() - ? paddle::optional{input} - : paddle::optional{paddle::none}; -} - std::vector CompatInferMetaContext::InputsBetween( size_t start, size_t end) const { std::vector result; @@ -370,7 +362,7 @@ std::vector CompatInferMetaContext::InputsBetween( return result; } -paddle::optional> +paddle::optional> CompatInferMetaContext::OptionalInputsBetween(size_t start, size_t end) const { const auto& first = compat_inputs_.at(start); @@ -383,10 +375,10 @@ CompatInferMetaContext::OptionalInputsBetween(size_t start, size_t end) const { result.emplace_back(in.initialized() ? &in : nullptr); } - return paddle::optional>(result); + return paddle::optional>( + std::move(result)); } - return paddle::optional>( - paddle::none); + return paddle::none; } phi::MetaTensor* CompatInferMetaContext::MutableOutputAt(size_t idx) { diff --git a/paddle/fluid/framework/infershape_utils.h b/paddle/fluid/framework/infershape_utils.h index 855e873b30951..04ac1ff59f7ee 100644 --- a/paddle/fluid/framework/infershape_utils.h +++ b/paddle/fluid/framework/infershape_utils.h @@ -59,6 +59,12 @@ class CompatMetaTensor : public phi::MetaTensor { bool initialized() const override { return initialized_; }; + operator unspecified_bool_type() const override { + return initialized_ ? 
unspecified_bool_true : 0; + } + + bool operator!() const override { return !initialized_; } + private: const LoD& GetRuntimeLoD() const { auto* var = BOOST_GET_CONST(Variable*, var_); @@ -107,13 +113,11 @@ class CompatInferMetaContext : public phi::InferMetaContext { outputs); const phi::MetaTensor& InputAt(size_t idx) const override; - paddle::optional OptionalInputAt( - size_t idx) const override; std::vector InputsBetween(size_t start, size_t end) const override; - paddle::optional> - OptionalInputsBetween(size_t start, size_t end) const override; + paddle::optional> OptionalInputsBetween( + size_t start, size_t end) const override; phi::MetaTensor* MutableOutputAt(size_t idx) override; std::vector MutableOutputBetween(size_t start, diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index d8eab0e9a7297..afd1bf338c45e 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2370,15 +2370,12 @@ void OperatorWithKernel::BuildPhiKernelContext( // deal with optional here if ((it == ctx.inputs.end() || it->second.size() == 0) && (input_defs[i].type_index == - std::type_index( - typeid(paddle::optional)) || + std::type_index(typeid(paddle::optional)) || input_defs[i].type_index == - std::type_index( - typeid(paddle::optional)) || + std::type_index(typeid(paddle::optional)) || input_defs[i].type_index == - std::type_index( - typeid(paddle::optional< - const std::vector>)))) { + std::type_index(typeid( + paddle::optional>)))) { pt_kernel_context->EmplaceBackInputWithoutSetRange(nullptr); auto end_idx = start_idx + 1; pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 129f75e75de1e..ccc8d64517f95 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -279,16 +279,14 @@ void BuildDygraphPhiKernelContext(const phi::KernelSignature& kernel_signature, if (it == ins.end()) { if (LIKELY(input_defs[i].type_index == - std::type_index( - typeid(paddle::optional)))) { + std::type_index(typeid(paddle::optional)))) { kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); auto end_idx = start_idx + 1; kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); continue; } else if (input_defs[i].type_index == - std::type_index( - typeid(paddle::optional< - const std::vector>))) { + std::type_index(typeid( + paddle::optional>))) { kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); auto end_idx = start_idx + 1; kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index a9b72a9cdf397..8527610247b05 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -138,7 +138,7 @@ void LayerNorm(const std::vector> &scale, const platform::CUDADeviceContext &ctx) { framework::Scope scope; auto place = ctx.GetPlace(); - paddle::optional scale_opt = paddle::none; + paddle::optional scale_opt; if (scale.size() > 0) { auto var_scale = scope.Var("Scale"); auto tensor_scale = var_scale->GetMutable(); @@ -147,7 +147,7 @@ void LayerNorm(const std::vector> &scale, scale_opt = *tensor_scale; } - paddle::optional bias_opt = paddle::none; + paddle::optional bias_opt; if (bias.size() > 0) { auto var_bias = scope.Var("Bias"); auto tensor_bias = var_bias->GetMutable(); diff --git 
a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index 344b104b5948c..d420d0319bfe4 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -292,9 +292,9 @@ class InplaceABNGradKernel : public framework::OpKernel { auto* mean = ctx.Input("ReserveSpace"); auto* variance = ctx.Input("ReserveSpace"); - paddle::optional space_opt = paddle::none; - paddle::optional mean_opt = paddle::none; - paddle::optional variance_opt = paddle::none; + paddle::optional space_opt; + paddle::optional mean_opt; + paddle::optional variance_opt; if (reserve_space != nullptr) { space_opt = *reserve_space; diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu index 6c16210ced022..6476023fcd20e 100644 --- a/paddle/fluid/operators/inplace_abn_op.cu +++ b/paddle/fluid/operators/inplace_abn_op.cu @@ -120,9 +120,9 @@ class InplaceABNGradKernel auto* mean = ctx.Input("ReserveSpace"); auto* variance = ctx.Input("ReserveSpace"); - paddle::optional space_opt = paddle::none; - paddle::optional mean_opt = paddle::none; - paddle::optional variance_opt = paddle::none; + paddle::optional space_opt; + paddle::optional mean_opt; + paddle::optional variance_opt; if (reserve_space != nullptr) { space_opt = *reserve_space; diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.h b/paddle/fluid/operators/optimizers/dgc_momentum_op.h index fc954e60a8c3e..9d6ecf414e664 100644 --- a/paddle/fluid/operators/optimizers/dgc_momentum_op.h +++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.h @@ -72,8 +72,7 @@ class DGCMomentumKernel : public framework::OpKernel { auto* velocity_out = context.Output("VelocityOut"); auto* master_param_out = context.Output("MasterParamOut"); - paddle::optional master_param_opt = - paddle::none; + paddle::optional master_param_opt(paddle::none); float mu = context.Attr("mu"); bool use_nesterov = context.Attr("use_nesterov"); std::string regularization_method = @@ -117,8 +116,7 @@ class DGCMomentumKernel : public framework::OpKernel { auto* param_out = context.Output("ParamOut"); auto* master_param_out = context.Output("MasterParamOut"); - paddle::optional master_param_opt = - paddle::none; + paddle::optional master_param_opt(paddle::none); if (multi_precision) { auto* master_param = context.Input("MasterParam"); master_param_opt = *master_param; @@ -149,8 +147,7 @@ class DGCMomentumKernel : public framework::OpKernel { auto* param_out = context.Output("ParamOut"); auto* master_param_out = context.Output("MasterParamOut"); - paddle::optional master_param_opt = - paddle::none; + paddle::optional master_param_opt(paddle::none); if (multi_precision) { auto* master_param = context.Input("MasterParam"); master_param_opt = *master_param; diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 4707f757d8bfb..efa0fe2cb582e 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -765,7 +765,7 @@ PyObject* ToPyObject(const std::unordered_map& value) { // For Final State Dygraph, // We directly use paddle::optional(Tensor) as dispensable Tensor -paddle::optional GetOptionalTensorFromArgs( +paddle::optional GetOptionalTensorFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable) { PyObject* obj = PyTuple_GET_ITEM(args, arg_idx); @@ -784,7 +784,7 @@ paddle::optional GetOptionalTensorFromArgs( } if (PyObject_IsInstance(obj, reinterpret_cast(p_tensor_type))) { 
- return paddle::make_optional( + return paddle::make_optional( reinterpret_cast(obj)->tensor); } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index c8e1cd4ad0b75..7f94f6c90e5a0 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -185,7 +185,7 @@ paddle::Place CastPyArg2Place(PyObject* obj, const std::string& op_type, paddle::DataType CastPyArg2DataType(PyObject* obj, const std::string& op_type, ssize_t arg_pos); -paddle::optional GetOptionalTensorFromArgs( +paddle::optional GetOptionalTensorFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 8a845c331cc60..b6431fcbe690e 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -41,8 +41,8 @@ std::tuple adam_impl( const Tensor& moment2, const Tensor& beta1_pow, const Tensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, @@ -87,11 +87,8 @@ std::tuple adam_impl( auto input_moment2 = PrepareData(moment2, kernel.InputAt(4), {}); auto input_beta1_pow = PrepareData(beta1_pow, kernel.InputAt(5), {}); auto input_beta2_pow = PrepareData(beta2_pow, kernel.InputAt(6), {}); - paddle::optional input_master_param(paddle::none); - auto input_master_param_ptr = - PrepareData(master_param, kernel.InputAt(7), {}); - paddle::optional input_skip_update(paddle::none); - auto input_skip_update_ptr = PrepareData(skip_update, kernel.InputAt(8), {}); + auto input_master_param = PrepareData(master_param, kernel.InputAt(7), {}); + auto input_skip_update = PrepareData(skip_update, kernel.InputAt(8), {}); std::tuple api_output; auto kernel_out_0 = input_param.get(); @@ -100,40 +97,13 @@ std::tuple adam_impl( auto kernel_out_3 = input_beta1_pow.get(); auto kernel_out_4 = input_beta2_pow.get(); phi::DenseTensor* kernel_out_5 = nullptr; - if (input_master_param_ptr) { - input_master_param = - paddle::make_optional(*input_master_param_ptr); - kernel_out_5 = - paddle::make_optional(*input_master_param_ptr) - .get_ptr(); + if (input_master_param) { + kernel_out_5 = input_master_param.get_ptr(); } - if (input_skip_update_ptr) { - input_skip_update = - paddle::make_optional(*input_skip_update_ptr); - } - - paddle::optional input_meta_ref_master_param( - paddle::none); - phi::DenseTensor dt; - phi::MetaTensor input_meta_tmp_master_param(dt); - if (input_master_param_ptr) { - input_meta_tmp_master_param.set_dtype(input_master_param_ptr->dtype()); - input_meta_tmp_master_param.set_dims(input_master_param_ptr->dims()); - input_meta_tmp_master_param.set_layout(input_master_param_ptr->layout()); - input_meta_ref_master_param = input_meta_tmp_master_param; - } + auto input_meta_ref_master_param = MakeMetaTensor(input_master_param); - paddle::optional input_meta_ref_skip_update( - paddle::none); - phi::DenseTensor dt1; - phi::MetaTensor input_meta_tmp_skip_update(dt1); - if (input_skip_update_ptr) { - input_meta_tmp_skip_update.set_dtype(input_skip_update_ptr->dtype()); - input_meta_tmp_skip_update.set_dims(input_skip_update_ptr->dims()); - input_meta_tmp_skip_update.set_layout(input_skip_update_ptr->layout()); - input_meta_ref_skip_update = input_meta_tmp_skip_update; - } + 
auto input_meta_ref_skip_update = MakeMetaTensor(input_skip_update); phi::MetaTensor meta_out_0(kernel_out_0); phi::MetaTensor meta_out_1(kernel_out_1); @@ -176,8 +146,8 @@ std::tuple adam_impl( const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, - paddle::optional, - paddle::optional, + const paddle::optional&, + const paddle::optional&, const Scalar&, const Scalar&, const Scalar&, @@ -250,8 +220,8 @@ std::tuple adam_impl( const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, - paddle::optional, - paddle::optional, + const paddle::optional&, + const paddle::optional&, const Scalar&, const Scalar&, const Scalar&, @@ -304,8 +274,8 @@ std::tuple adamw_impl( const Tensor& moment2, const Tensor& beta1_pow, const Tensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, @@ -350,11 +320,8 @@ std::tuple adamw_impl( auto input_moment2 = PrepareData(moment2, kernel.InputAt(4), {}); auto input_beta1_pow = PrepareData(beta1_pow, kernel.InputAt(5), {}); auto input_beta2_pow = PrepareData(beta2_pow, kernel.InputAt(6), {}); - paddle::optional input_master_param(paddle::none); - auto input_master_param_ptr = - PrepareData(master_param, kernel.InputAt(7), {}); - paddle::optional input_skip_update(paddle::none); - auto input_skip_update_ptr = PrepareData(skip_update, kernel.InputAt(8), {}); + auto input_master_param = PrepareData(master_param, kernel.InputAt(7), {}); + auto input_skip_update = PrepareData(skip_update, kernel.InputAt(8), {}); std::tuple api_output; auto kernel_out_0 = input_param.get(); @@ -363,40 +330,13 @@ std::tuple adamw_impl( auto kernel_out_3 = input_beta1_pow.get(); auto kernel_out_4 = input_beta2_pow.get(); phi::DenseTensor* kernel_out_5 = nullptr; - if (input_master_param_ptr) { - input_master_param = - paddle::make_optional(*input_master_param_ptr); - kernel_out_5 = - paddle::make_optional(*input_master_param_ptr) - .get_ptr(); + if (input_master_param) { + kernel_out_5 = input_master_param.get_ptr(); } - if (input_skip_update_ptr) { - input_skip_update = - paddle::make_optional(*input_skip_update_ptr); - } - - paddle::optional input_meta_ref_master_param( - paddle::none); - phi::DenseTensor dt; - phi::MetaTensor input_meta_tmp_master_param(dt); - if (input_master_param_ptr) { - input_meta_tmp_master_param.set_dtype(input_master_param_ptr->dtype()); - input_meta_tmp_master_param.set_dims(input_master_param_ptr->dims()); - input_meta_tmp_master_param.set_layout(input_master_param_ptr->layout()); - input_meta_ref_master_param = input_meta_tmp_master_param; - } + auto input_meta_ref_master_param = MakeMetaTensor(input_master_param); - paddle::optional input_meta_ref_skip_update( - paddle::none); - phi::DenseTensor dt1; - phi::MetaTensor input_meta_tmp_skip_update(dt1); - if (input_skip_update_ptr) { - input_meta_tmp_skip_update.set_dtype(input_skip_update_ptr->dtype()); - input_meta_tmp_skip_update.set_dims(input_skip_update_ptr->dims()); - input_meta_tmp_skip_update.set_layout(input_skip_update_ptr->layout()); - input_meta_ref_skip_update = input_meta_tmp_skip_update; - } + auto input_meta_ref_skip_update = MakeMetaTensor(input_skip_update); phi::MetaTensor meta_out_0(kernel_out_0); phi::MetaTensor meta_out_1(kernel_out_1); @@ -439,8 +379,8 @@ std::tuple adamw_impl( const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, - paddle::optional, - 
paddle::optional, + const paddle::optional&, + const paddle::optional&, const Scalar&, const Scalar&, const Scalar&, @@ -760,7 +700,7 @@ std::tuple momentum_impl( const Tensor& grad, const Tensor& velocity, const Tensor& learning_rate, - paddle::optional master_param, + const paddle::optional& master_param, float mu, bool use_nesterov, const std::string& regularization_method, @@ -801,32 +741,18 @@ std::tuple momentum_impl( auto input_grad = PrepareData(grad, kernel.InputAt(1), {}); auto input_velocity = PrepareData(velocity, kernel.InputAt(2), {}); auto input_learning_rate = PrepareData(learning_rate, kernel.InputAt(3), {}); - paddle::optional input_master_param(paddle::none); - auto input_master_param_ptr = - PrepareData(master_param, kernel.InputAt(4), {}); + auto input_master_param = PrepareData(master_param, kernel.InputAt(4), {}); std::tuple api_output; auto kernel_out_0 = input_param.get(); auto kernel_out_1 = input_velocity.get(); phi::DenseTensor* kernel_out_2 = nullptr; - if (input_master_param_ptr) { - input_master_param = - paddle::make_optional(*input_master_param_ptr); - kernel_out_2 = - paddle::make_optional(*input_master_param_ptr) - .get_ptr(); + if (input_master_param) { + kernel_out_2 = input_master_param.get_ptr(); } - paddle::optional input_meta_ref_master_param( - paddle::none); - phi::DenseTensor dt; - phi::MetaTensor input_meta_tmp_master_param(dt); - if (input_master_param_ptr) { - input_meta_tmp_master_param.set_dtype(input_master_param_ptr->dtype()); - input_meta_tmp_master_param.set_dims(input_master_param_ptr->dims()); - input_meta_tmp_master_param.set_layout(input_master_param_ptr->layout()); - input_meta_ref_master_param = input_meta_tmp_master_param; - } + auto input_meta_ref_master_param = MakeMetaTensor(input_master_param); + phi::MetaTensor meta_out_0(kernel_out_0); phi::MetaTensor meta_out_1(kernel_out_1); if (kernel_out_2) { @@ -867,7 +793,7 @@ std::tuple momentum_impl( const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, - paddle::optional, + const paddle::optional&, float, bool, const std::string&, @@ -902,7 +828,7 @@ std::tuple sgd_impl( const Tensor& param, const Tensor& learning_rate, const Tensor& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision) { DataType kernel_data_type = ParseDataType(param); auto kernel_key_set = ParseKernelKeyByInputArgs(param, learning_rate, grad); @@ -940,17 +866,8 @@ std::tuple sgd_impl( if (phi::DenseTensor::classof(param_tensor.get())) { auto in_param = PrepareData(param, kernel.InputAt(0), {}); - auto in_master_param = PrepareData(master_param, kernel.InputAt(3), {}); - - paddle::optional in_master_param_opt = - master_param - ? paddle::make_optional(*in_master_param) - : paddle::none; - auto master_param_meta = MakeMetaTensor(in_master_param_opt); - paddle::optional master_param_meta_opt = - master_param - ? 
paddle::make_optional(*master_param_meta) - : paddle::none; + auto in_master_param_opt = PrepareData(master_param, kernel.InputAt(3), {}); + auto master_param_meta_opt = MakeMetaTensor(in_master_param_opt); phi::DenseTensor* kernel_out_0 = SetKernelOutput(kernel_key.backend(), &std::get<0>(out)); @@ -974,7 +891,7 @@ std::tuple sgd_impl( const phi::DenseTensor&, const phi::DenseTensor&, const phi::DenseTensor&, - paddle::optional, + const paddle::optional&, bool, phi::DenseTensor*, phi::DenseTensor*); @@ -1003,7 +920,7 @@ std::tuple sgd_impl( const phi::DenseTensor&, const phi::DenseTensor&, const phi::SelectedRows&, - paddle::optional, + const paddle::optional&, bool, phi::DenseTensor*, phi::DenseTensor*); @@ -1020,16 +937,8 @@ std::tuple sgd_impl( } else { auto in_param = TensorToSelectedRows(param); auto in_grad = TensorToSelectedRows(grad); - auto in_master_param = TensorToSelectedRows(master_param); - auto in_master_param_opt = - master_param - ? paddle::make_optional(*in_master_param) - : paddle::none; + auto in_master_param_opt = TensorToSelectedRows(master_param); auto master_param_meta = MakeMetaTensor(in_master_param_opt); - paddle::optional master_param_meta_opt = - master_param - ? paddle::make_optional(*master_param_meta) - : paddle::none; phi::SelectedRows* kernel_out_0 = SetSelectedRowsKernelOutput(kernel_key.backend(), &std::get<0>(out)); @@ -1041,7 +950,7 @@ std::tuple sgd_impl( SgdInferMeta(MakeMetaTensor(*in_param), MakeMetaTensor(*in_learning_rate), MakeMetaTensor(*in_grad), - master_param_meta_opt, + master_param_meta, multi_precision, &meta_out_0, &meta_out_1); @@ -1051,7 +960,7 @@ std::tuple sgd_impl( const phi::SelectedRows&, const phi::DenseTensor&, const phi::SelectedRows&, - paddle::optional, + const paddle::optional&, bool, phi::SelectedRows*, phi::SelectedRows*); diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index d88a134654caf..f8ccbb36c5ca7 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -39,8 +39,8 @@ std::tuple adam_impl( const Tensor& moment2, const Tensor& beta1_pow, const Tensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, @@ -57,8 +57,8 @@ std::tuple adamw_impl( const Tensor& moment2, const Tensor& beta1_pow, const Tensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, @@ -107,7 +107,7 @@ std::tuple momentum_impl( const Tensor& grad, const Tensor& velocity, const Tensor& learning_rate, - paddle::optional master_param, + const paddle::optional& master_param, float mu, bool use_nesterov, const std::string& regularization_method, @@ -119,7 +119,7 @@ std::tuple sgd_impl( const Tensor& param, const Tensor& learning_rate, const Tensor& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision); ////////////////// Backward(grad) api impls ////////////////////// diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index 2111829b8d60b..633bb1a32a133 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -23,10 +23,10 @@ std::shared_ptr TensorToDenseTensor(const Tensor& tensor) { return 
std::static_pointer_cast(tensor.impl()); } -std::shared_ptr TensorToDenseTensor( - const paddle::optional& tensor) { +paddle::optional TensorToDenseTensor( + const paddle::optional& tensor) { if (tensor) { - return std::static_pointer_cast(tensor->impl()); + return {*std::static_pointer_cast(tensor->impl())}; } return nullptr; } @@ -48,10 +48,10 @@ std::shared_ptr TensorToSelectedRows(const Tensor& tensor) { return std::static_pointer_cast(tensor.impl()); } -std::shared_ptr TensorToSelectedRows( - const paddle::optional& tensor) { +paddle::optional TensorToSelectedRows( + const paddle::optional& tensor) { if (tensor) { - return std::static_pointer_cast(tensor->impl()); + return {*std::static_pointer_cast(tensor->impl())}; } return nullptr; } @@ -66,12 +66,12 @@ phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) { return phi::MetaTensor(tensor); } -paddle::optional MakeMetaTensor( - const paddle::optional& tensor) { +phi::MetaTensor MakeMetaTensor( + const paddle::optional& tensor) { if (tensor) { return {phi::MetaTensor(*tensor)}; } - return {paddle::none}; + return phi::MetaTensor(); } std::vector MakeMetaTensor( @@ -98,12 +98,12 @@ phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor) { return phi::MetaTensor(tensor); } -paddle::optional MakeMetaTensor( - const paddle::optional& tensor) { +phi::MetaTensor MakeMetaTensor( + const paddle::optional& tensor) { if (tensor) { return {phi::MetaTensor(*tensor)}; } - return {paddle::none}; + return phi::MetaTensor(); } phi::MetaTensor MakeMetaTensor(const phi::StringTensor& tensor) { diff --git a/paddle/phi/api/lib/api_gen_utils.h b/paddle/phi/api/lib/api_gen_utils.h index 097178ae0d928..83656a7b528a6 100644 --- a/paddle/phi/api/lib/api_gen_utils.h +++ b/paddle/phi/api/lib/api_gen_utils.h @@ -32,7 +32,7 @@ enum class TensorType { DENSE_TENSOR, SPARSE_CSR, SPARSE_COO, STRING_TENSOR }; std::shared_ptr TensorToDenseTensor(const Tensor& tensor); -std::shared_ptr TensorToDenseTensor( +paddle::optional TensorToDenseTensor( const paddle::optional& tensor); std::unique_ptr> TensorToDenseTensor( @@ -40,8 +40,8 @@ std::unique_ptr> TensorToDenseTensor( std::shared_ptr TensorToSelectedRows(const Tensor& tensor); -std::shared_ptr TensorToSelectedRows( - const paddle::optional& tensor); +paddle::optional TensorToSelectedRows( + const paddle::optional& tensor); std::shared_ptr TensorToStringTensor(const Tensor& tensor); @@ -49,8 +49,8 @@ std::shared_ptr TensorToStringTensor(const Tensor& tensor); phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor); -paddle::optional MakeMetaTensor( - const paddle::optional& tensor); +phi::MetaTensor MakeMetaTensor( + const paddle::optional& tensor); std::vector MakeMetaTensor( const std::vector& tensors); @@ -60,8 +60,8 @@ std::vector MakeMetaTensor( phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor); -paddle::optional MakeMetaTensor( - const paddle::optional& tensor); +phi::MetaTensor MakeMetaTensor( + const paddle::optional& tensor); phi::MetaTensor MakeMetaTensor(const phi::StringTensor& tensor); diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index b00311061c9d0..598559cc4dffb 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -249,25 +249,14 @@ std::shared_ptr PrepareData( return nullptr; } -std::shared_ptr PrepareData( +paddle::optional PrepareData( const paddle::optional& input, const phi::TensorArgDef& target_args_def, const TransformFlag& transform_flag) { if (input) { - return 
PrepareData(*input, target_args_def, transform_flag); - } - return {nullptr}; -} - -std::shared_ptr PrepareData( - const paddle::optional input, - const phi::TensorArgDef& target_args_def, - const TransformFlag& transform_flag) { - if (input.get_ptr() != nullptr) { - return PrepareData(*(input.get_ptr()), target_args_def, transform_flag); + return {*PrepareData(*input, target_args_def, transform_flag)}; } - - return {nullptr}; + return paddle::none; } std::unique_ptr> PrepareData( diff --git a/paddle/phi/api/lib/data_transform.h b/paddle/phi/api/lib/data_transform.h index f5537961d0ac7..4d70078ef3444 100644 --- a/paddle/phi/api/lib/data_transform.h +++ b/paddle/phi/api/lib/data_transform.h @@ -66,7 +66,7 @@ std::shared_ptr PrepareData( const phi::TensorArgDef& target_args_def, const TransformFlag& transform_flag); -std::shared_ptr PrepareData( +paddle::optional PrepareData( const paddle::optional& input, const phi::TensorArgDef& target_args_def, const TransformFlag& transform_flag); @@ -76,10 +76,5 @@ std::unique_ptr> PrepareData( const phi::TensorArgDef& target_args_def, const TransformFlag& transform_flag); -std::shared_ptr PrepareData( - const paddle::optional input, - const phi::TensorArgDef& target_args_def, - const TransformFlag& transform_flag); - } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/kernel_dispatch.h b/paddle/phi/api/lib/kernel_dispatch.h index 29254a0486d00..1091e0556da8b 100644 --- a/paddle/phi/api/lib/kernel_dispatch.h +++ b/paddle/phi/api/lib/kernel_dispatch.h @@ -125,8 +125,8 @@ struct KernelKeyParser : ArgsIterator { key_set.dtype = tensor.dtype(); } - void operator()(const paddle::optional x) { - if (x.get_ptr() != nullptr) { + void operator()(const paddle::optional& x) { + if (x) { const phi::TensorBase& tensor = *(x.get_ptr()->impl()); AssignKernelKeySet(tensor); } diff --git a/paddle/phi/core/infermeta_utils.cc b/paddle/phi/core/infermeta_utils.cc index 1d61f55f9dcd2..9d2b85435c7f3 100644 --- a/paddle/phi/core/infermeta_utils.cc +++ b/paddle/phi/core/infermeta_utils.cc @@ -65,14 +65,6 @@ const MetaTensor& InferMetaContext::InputAt(size_t idx) const { return inputs_.at(idx); } -paddle::optional InferMetaContext::OptionalInputAt( - size_t idx) const { - const auto& input = inputs_.at(idx); - return input.initialized() - ? paddle::optional{input} - : paddle::optional{paddle::none}; -} - std::vector InferMetaContext::InputsBetween( size_t start, size_t end) const { std::vector result; @@ -86,7 +78,7 @@ std::vector InferMetaContext::InputsBetween( return result; } -paddle::optional> +paddle::optional> InferMetaContext::OptionalInputsBetween(size_t start, size_t end) const { const auto& first = inputs_.at(start); @@ -99,9 +91,9 @@ InferMetaContext::OptionalInputsBetween(size_t start, size_t end) const { result.emplace_back(in.initialized() ? 
&in : nullptr); } - return paddle::optional>(result); + return paddle::optional>(std::move(result)); } - return paddle::optional>(paddle::none); + return paddle::none; } MetaTensor* InferMetaContext::MutableOutputAt(size_t idx) { diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index b974f2c868a8a..d27d8bc7624be 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -50,11 +50,10 @@ class InferMetaContext { paddle::small_vector outputs); virtual const MetaTensor& InputAt(size_t idx) const; - virtual paddle::optional OptionalInputAt(size_t idx) const; virtual std::vector InputsBetween(size_t start, size_t end) const; - virtual paddle::optional> + virtual paddle::optional> OptionalInputsBetween(size_t start, size_t end) const; virtual MetaTensor* MutableOutputAt(size_t idx); @@ -151,24 +150,6 @@ struct InferMetaFnImpl { } }; - template - struct InferMetaFnCallHelper, Tail...> { - template - static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) { - static_assert(attr_idx == 0, - "InferMeta's Input should appear before Attributes."); - static_assert(out_idx == 0, - "InferMeta's Input should appear before Outputs."); - const std::pair range = ctx->InputRangeAt(in_idx); - auto arg = ctx->OptionalInputAt(range.first); - - InferMetaFnCallHelper< - Tail...>::template Call(ctx, - pargs..., - arg); - } - }; - template struct InferMetaFnCallHelper&, Tail...> { template @@ -189,7 +170,7 @@ struct InferMetaFnImpl { template struct InferMetaFnCallHelper< - paddle::optional>, + const paddle::optional>&, Tail...> { template static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) { @@ -198,7 +179,7 @@ struct InferMetaFnImpl { static_assert(out_idx == 0, "InferMeta's Input should appear before Outputs."); const std::pair range = ctx->InputRangeAt(in_idx); - paddle::optional> arg = + paddle::optional> arg = ctx->OptionalInputsBetween(range.first, range.second); InferMetaFnCallHelper< Tail...>::template Call(ctx, diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index 8b43239d352b3..0f155f445ec9b 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -81,11 +81,11 @@ class KernelContext { } template - paddle::optional OptionalInputAt(size_t idx) const { - const auto& input = inputs_.at(idx); - return input ? paddle::optional{static_cast< - const TensorType&>(*input)} - : paddle::optional{paddle::none}; + paddle::optional OptionalInputAt(size_t idx) const { + const auto* input = inputs_.at(idx); + return input ? 
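The OptionalInputsBetween change just above keeps the same contents but now moves the collected vector into the optional and returns a bare none when the whole group is absent. A compile-and-run sketch of that shape with std::optional and a placeholder Meta type (not the real MetaTensor):

#include <optional>
#include <vector>

struct Meta { bool init = false; };

std::optional<std::vector<const Meta*>> OptionalGroup(
    const std::vector<Meta>& all, size_t start, size_t end) {
  // Approximation of the real check: the group counts as "provided" when its
  // first slot is initialized.
  if (start < end && all.at(start).init) {
    std::vector<const Meta*> result;
    result.reserve(end - start);
    for (size_t i = start; i < end; ++i) {
      // keep every slot; uninitialized entries become null pointers
      result.push_back(all[i].init ? &all[i] : nullptr);
    }
    return std::optional<std::vector<const Meta*>>(std::move(result));
  }
  return std::nullopt;  // group not provided at all
}

int main() {
  std::vector<Meta> ins = {{true}, {true}, {false}};
  auto group = OptionalGroup(ins, 0, ins.size());
  return (group && group->size() == 3 && (*group)[2] == nullptr) ? 0 : 1;
}

Moving the vector avoids one copy compared with constructing the optional from an lvalue, which appears to be the point of the std::move added here.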
paddle::make_optional( + *(static_cast(input))) + : paddle::none; } template @@ -99,7 +99,7 @@ class KernelContext { } template - paddle::optional> OptionalInputsBetween( + paddle::optional> OptionalInputsBetween( size_t start, size_t end) { const auto& first = inputs_.at(start); @@ -109,9 +109,9 @@ class KernelContext { auto* t = static_cast(inputs_.at(i)); v.emplace_back(t); } - return paddle::optional>(v); + return paddle::optional>(std::move(v)); } - return paddle::optional>(paddle::none); + return paddle::none; } template diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 36ab9c081cc37..41e1e2b53a9e9 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -76,20 +76,20 @@ struct KernelArgsParseFunctor { default_key.dtype(), arg_type); } else if (arg_type == std::type_index(typeid( - paddle::optional))) { + const paddle::optional&))) { args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); - } else if (arg_type == std::type_index(typeid( - paddle::optional< - const std::vector>))) { + } else if (arg_type == + std::type_index(typeid(const paddle::optional< + std::vector>&))) { args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); } else if (arg_type == std::type_index(typeid( - paddle::optional))) { + const paddle::optional&))) { args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index f548d1da2d4e7..d4765d1c4c3b4 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -85,7 +85,7 @@ namespace phi { #define PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(tensor_type) \ template \ - struct KernelCallHelper, Tail...> { \ + struct KernelCallHelper&, Tail...> { \ template \ struct KernelCallHelper< \ - paddle::optional>, \ + const paddle::optional>&, \ Tail...> { \ template & range = ctx->InputRangeAt(in_idx); \ - paddle::optional> arg = \ + paddle::optional> arg = \ ctx->OptionalInputsBetween(range.first, range.second); \ KernelCallHelper:: \ template Compute( \ diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h index 29afe0d0292d1..d277f32d8ea9a 100644 --- a/paddle/phi/core/meta_tensor.h +++ b/paddle/phi/core/meta_tensor.h @@ -39,7 +39,9 @@ struct MetaConfig { class MetaTensor { public: - MetaTensor() = default; + typedef void (*unspecified_bool_type)(); + + MetaTensor() : tensor_(nullptr) {} // supporting implicit construction is easier to use MetaTensor(TensorBase* tensor) : tensor_(tensor) {} // NOLINT @@ -68,12 +70,22 @@ class MetaTensor { virtual bool initialized() const; + virtual operator unspecified_bool_type() const { + return tensor_ == nullptr ? 
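The KernelContext::OptionalInputAt rewrite that ends just above turns a stored, possibly null pointer into an optional value rather than an optional reference. A self-contained sketch of the same accessor shape (std::optional and placeholder types; Dense here is not phi::DenseTensor):

#include <optional>
#include <vector>

struct Base { virtual ~Base() = default; };
struct Dense : Base { int rows = 0; };

class Context {
 public:
  explicit Context(std::vector<const Base*> inputs) : inputs_(std::move(inputs)) {}

  template <typename T>
  std::optional<T> OptionalInputAt(size_t idx) const {
    const Base* input = inputs_.at(idx);
    // Copy the concrete tensor into the optional, or report "not provided".
    return input ? std::make_optional<T>(*static_cast<const T*>(input))
                 : std::nullopt;
  }

 private:
  std::vector<const Base*> inputs_;
};

int main() {
  Dense d;
  d.rows = 4;
  Context ctx({&d, nullptr});
  std::optional<Dense> a = ctx.OptionalInputAt<Dense>(0);  // engaged copy
  std::optional<Dense> b = ctx.OptionalInputAt<Dense>(1);  // std::nullopt
  return (a && a->rows == 4 && !b) ? 0 : 1;
}

Returning by value means the caller owns an independent copy; the removed overload instead exposed a reference into the context.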
0 : unspecified_bool_true; + } + + virtual bool operator!() const { return tensor_ == nullptr; } + + protected: + static void unspecified_bool_true() {} + private: // Because the lod in compiletime and runtime is different, // so `LoD` cannot in public methods const LoD& lod() const; TensorBase* tensor() const; - TensorBase* tensor_; + + TensorBase* tensor_ = nullptr; }; } // namespace phi diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 6b13a28c70837..78f8ff9e00ce5 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -188,7 +188,7 @@ void CrossEntropyWithSoftmaxGradInferMeta(const MetaTensor& label, void DeformableConvGradInferMeta(const MetaTensor& x, const MetaTensor& offset, const MetaTensor& filter, - paddle::optional mask, + const MetaTensor& mask, const MetaTensor& out_grad, const std::vector& strides, const std::vector& paddings, @@ -202,7 +202,7 @@ void DeformableConvGradInferMeta(const MetaTensor& x, MetaTensor* mask_grad) { GeneralTernaryGradInferMeta(x, offset, filter, dx, offset_grad, filter_grad); if (mask) { - UnchangedInferMeta(mask.get(), mask_grad); + UnchangedInferMeta(mask, mask_grad); } } @@ -314,7 +314,7 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, void InstanceNormGradInferMeta(const MetaTensor& x, const MetaTensor& y_grad, - paddle::optional scale, + const MetaTensor& scale, const MetaTensor& saved_mean, const MetaTensor& saved_variance, float epsilon, @@ -338,19 +338,18 @@ void InstanceNormGradInferMeta(const MetaTensor& x, bias_grad->set_dims({C}); } } -void InstanceNormDoubleGradInferMeta( - const MetaTensor& x, - paddle::optional scale, - const MetaTensor& saved_mean, - const MetaTensor& saved_variance, - const MetaTensor& dy, - paddle::optional ddx, - paddle::optional ddscale, - paddle::optional ddbias, - float epsilon, - MetaTensor* dx, - MetaTensor* dscale, - MetaTensor* ddy) { +void InstanceNormDoubleGradInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const MetaTensor& saved_mean, + const MetaTensor& saved_variance, + const MetaTensor& dy, + const MetaTensor& ddx, + const MetaTensor& ddscale, + const MetaTensor& ddbias, + float epsilon, + MetaTensor* dx, + MetaTensor* dscale, + MetaTensor* ddy) { PADDLE_ENFORCE_NE( dx, nullptr, @@ -436,7 +435,7 @@ void MultiplexGradInferMeta(const MetaTensor& ids, void NllLossGradInferMeta(const MetaTensor& x, const MetaTensor& label, - paddle::optional weight, + const MetaTensor& weight, const MetaTensor& total_weight, const MetaTensor& out_grad, int64_t ignore_index, @@ -549,7 +548,7 @@ void PoolGradInferMeta(const MetaTensor& x, void PsroiPoolGradInferMeta(const MetaTensor& x, const MetaTensor& rois, - paddle::optional rois_num, + const MetaTensor& rois_num, const MetaTensor& dout, int pooled_height, int pooled_width, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 855b25d7ed4f8..b52734eb5b10c 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -87,7 +87,7 @@ void CrossEntropyWithSoftmaxGradInferMeta(const MetaTensor& label, void DeformableConvGradInferMeta(const MetaTensor& x, const MetaTensor& offset, const MetaTensor& filter, - paddle::optional mask, + const MetaTensor& mask, const MetaTensor& out_grad, const std::vector& strides, const std::vector& paddings, @@ -146,7 +146,7 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, void InstanceNormGradInferMeta(const MetaTensor& x, const MetaTensor& y_grad, - paddle::optional scale, + const 
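The MetaTensor hunk that closes just above adds the classic "safe bool" idiom: a conversion to a private member-function-pointer type plus operator!, so a MetaTensor wrapping a null TensorBase* reads as false. That is what allows the InferMeta bodies below to test optional inputs with plain if (mask) or if (scale) on a const MetaTensor&. A standalone sketch of the idiom with a placeholder Handle class:

#include <iostream>

class Handle {
 public:
  typedef void (*unspecified_bool_type)();

  Handle() : ptr_(nullptr) {}
  explicit Handle(int* p) : ptr_(p) {}

  // Converts to a function pointer that is null exactly when no target is
  // attached, so the object can be used directly in boolean contexts without
  // permitting accidental arithmetic the way an operator int would.
  operator unspecified_bool_type() const {
    return ptr_ == nullptr ? nullptr : unspecified_bool_true;
  }
  bool operator!() const { return ptr_ == nullptr; }

 private:
  static void unspecified_bool_true() {}
  int* ptr_;
};

void InferSomething(const Handle& optional_input) {
  if (optional_input) {
    std::cout << "input provided\n";
  } else {
    std::cout << "input absent\n";
  }
}

int main() {
  int x = 0;
  InferSomething(Handle{});    // prints: input absent
  InferSomething(Handle{&x});  // prints: input provided
}

An explicit operator bool is the more common C++11 spelling; the function-pointer form used in the patch is the older idiom and behaves the same inside an if condition.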
MetaTensor& scale, const MetaTensor& saved_mean, const MetaTensor& saved_variance, float epsilon, @@ -154,19 +154,18 @@ void InstanceNormGradInferMeta(const MetaTensor& x, MetaTensor* scale_grad, MetaTensor* bias_grad); -void InstanceNormDoubleGradInferMeta( - const MetaTensor& x, - paddle::optional scale, - const MetaTensor& saved_mean, - const MetaTensor& saved_variance, - const MetaTensor& dy, - paddle::optional ddx, - paddle::optional ddscale, - paddle::optional ddbias, - float epsilon, - MetaTensor* dx, - MetaTensor* dscale, - MetaTensor* ddy); +void InstanceNormDoubleGradInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const MetaTensor& saved_mean, + const MetaTensor& saved_variance, + const MetaTensor& dy, + const MetaTensor& ddx, + const MetaTensor& ddscale, + const MetaTensor& ddbias, + float epsilon, + MetaTensor* dx, + MetaTensor* dscale, + MetaTensor* ddy); void KernelWithXShapeInferMeta(const MetaTensor& xshape, MetaTensor* dx); @@ -194,7 +193,7 @@ void MultiplexGradInferMeta(const MetaTensor& ids, void NllLossGradInferMeta(const MetaTensor& input, const MetaTensor& label, - paddle::optional weight, + const MetaTensor& weight, const MetaTensor& total_weight, const MetaTensor& out_grad, int64_t ignore_index, @@ -209,7 +208,7 @@ void PixelUnshuffleGradInferMeta(const MetaTensor& out_grad, void PsroiPoolGradInferMeta(const MetaTensor& x, const MetaTensor& rois, - paddle::optional rois_num, + const MetaTensor& rois_num, const MetaTensor& dout, int pooled_height, int pooled_width, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 837a43905e723..76b6fcdd52efc 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -201,7 +201,7 @@ void BCELossInferMeta(const MetaTensor& input, } void BincountInferMeta(const MetaTensor& x, - const paddle::optional weights, + const MetaTensor& weights, int minlength, MetaTensor* out) { auto input_dim = x.dims(); @@ -220,8 +220,10 @@ void BincountInferMeta(const MetaTensor& x, "But the dimension of Input(X) is [%d]", input_dim.size())); - if (weights.is_initialized()) { - auto weights_dim = weights->dims(); + VLOG(1) << "####### CHECK weights"; + if (weights) { + auto weights_dim = weights.dims(); + VLOG(1) << "##### weights_dim " << weights_dim; PADDLE_ENFORCE_EQ(weights_dim.size(), 1, phi::errors::InvalidArgument( @@ -241,8 +243,8 @@ void BincountInferMeta(const MetaTensor& x, input_dim)); } out->set_dims(phi::make_ddim({-1})); - if (weights.is_initialized()) { - out->set_dtype(weights->dtype()); + if (weights) { + out->set_dtype(weights.dtype()); } else { out->set_dtype(x.dtype()); } @@ -864,7 +866,7 @@ void DistInferMeta(const MetaTensor& x, } void DropoutInferMeta(const MetaTensor& x, - paddle::optional seed_tensor, + const MetaTensor& seed_tensor, float p, bool is_test, const std::string& mode, @@ -982,7 +984,7 @@ void ElementwiseRawInferMeta(const MetaTensor& x, } void ExpandAsInferMeta(const MetaTensor& x, - paddle::optional y, + const MetaTensor& y, const std::vector& target_shape, MetaTensor* out) { #define MAX_RANK_SUPPORTED 6 diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 192fa214c905f..0c86e5389c4b4 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -56,7 +56,7 @@ void BCELossInferMeta(const MetaTensor& input, MetaConfig config = MetaConfig()); void BincountInferMeta(const MetaTensor& x, - const paddle::optional weights, + const MetaTensor& weights, int minlength, MetaTensor* out); @@ -136,7 +136,7 @@ 
void DistInferMeta(const MetaTensor& x, void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); void DropoutInferMeta(const MetaTensor& x, - paddle::optional seed_tensor, + const MetaTensor& seed_tensor, float p, bool is_test, const std::string& mode, @@ -155,7 +155,7 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta, MetaTensor* out); void ExpandAsInferMeta(const MetaTensor& x, - paddle::optional y, + const MetaTensor& y, const std::vector& target_shape, MetaTensor* out); diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 48c40673ab819..63f0d0c1eeb28 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -100,8 +100,8 @@ void AdamInferMeta(const MetaTensor& param, const MetaTensor& moment2, const MetaTensor& beta1_pow, const MetaTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const MetaTensor& master_param, + const MetaTensor& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, @@ -238,8 +238,8 @@ void AdamwInferMeta(const MetaTensor& param, const MetaTensor& moment2, const MetaTensor& beta1_pow, const MetaTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const MetaTensor& master_param, + const MetaTensor& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, @@ -580,7 +580,7 @@ void BatchNormInferInferMeta(const MetaTensor& x, void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, - paddle::optional bias, + const MetaTensor& bias, MetaTensor* out, MetaConfig config) { auto x_dims = x.dims(); @@ -619,8 +619,8 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, "The second dimension of input(Y) must be equal to " "the third dimension of the input(Weight).")); - if (bias.get_ptr()) { - auto bias_dims = bias->dims(); + if (bias) { + auto bias_dims = bias.dims(); PADDLE_ENFORCE_EQ(bias_dims.size(), 2UL, errors::InvalidArgument( @@ -772,7 +772,7 @@ inline int ConvOutputSize( void DeformableConvInferMeta(const MetaTensor& x, const MetaTensor& offset, const MetaTensor& filter, - paddle::optional mask, + const MetaTensor& mask, const std::vector& strides, const std::vector& paddings, const std::vector& dilations, @@ -918,7 +918,7 @@ void DeformableConvInferMeta(const MetaTensor& x, deformable_groups)); if (mask) { - auto mask_dims = mask->dims(); + auto mask_dims = mask.dims(); PADDLE_ENFORCE_EQ(output_shape[2], mask_dims[2], phi::errors::InvalidArgument( @@ -958,9 +958,9 @@ void DeformableConvInferMeta(const MetaTensor& x, void HierarchicalSigmoidInferMeta(const MetaTensor& x, const MetaTensor& w, const MetaTensor& label, - paddle::optional path, - paddle::optional code, - paddle::optional bias, + const MetaTensor& path, + const MetaTensor& code, + const MetaTensor& bias, int num_classes, bool remote_prefetch, int trainer_id, @@ -991,9 +991,9 @@ void HierarchicalSigmoidInferMeta(const MetaTensor& x, static void Interpolate1DInferShapeCheck( const MetaTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const MetaTensor& out_size, + const paddle::optional>& size_tensor, + const MetaTensor& scale_tensor, const std::string& data_layout_str, int out_d, int out_h, @@ -1048,7 +1048,7 @@ static void Interpolate1DInferShapeCheck( int out_w_tmp; if (scale_tensor) { - auto scale_tensor_dim = scale_tensor->dims(); + auto scale_tensor_dim = scale_tensor.dims(); PADDLE_ENFORCE_EQ( 
scale_tensor_dim.size(), 1, @@ -1086,7 +1086,7 @@ static void Interpolate1DInferShapeCheck( } if (out_size && config.is_runtime) { - auto out_size_dim = out_size->dims(); + auto out_size_dim = out_size.dims(); PADDLE_ENFORCE_EQ( out_size_dim.size(), 1, @@ -1118,9 +1118,9 @@ static void Interpolate1DInferShapeCheck( static void Interpolate2DInferShapeCheck( const MetaTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const MetaTensor& out_size, + const paddle::optional>& size_tensor, + const MetaTensor& scale_tensor, const std::string& data_layout_str, int out_d, int out_h, @@ -1178,7 +1178,7 @@ static void Interpolate2DInferShapeCheck( int out_h_tmp, out_w_tmp; if (scale_tensor) { - auto scale_tensor_dim = scale_tensor->dims(); + auto scale_tensor_dim = scale_tensor.dims(); PADDLE_ENFORCE_EQ( scale_tensor_dim.size(), 1, @@ -1231,7 +1231,7 @@ static void Interpolate2DInferShapeCheck( } if (out_size && config.is_runtime) { - auto out_size_dim = out_size->dims(); + auto out_size_dim = out_size.dims(); PADDLE_ENFORCE_EQ( out_size_dim.size(), 1, @@ -1263,9 +1263,9 @@ static void Interpolate2DInferShapeCheck( static void Interpolate3DInferShapeCheck( const MetaTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const MetaTensor& out_size, + const paddle::optional>& size_tensor, + const MetaTensor& scale_tensor, const std::string& data_layout_str, int out_d, int out_h, @@ -1321,7 +1321,7 @@ static void Interpolate3DInferShapeCheck( int out_d_tmp, out_h_tmp, out_w_tmp; if (scale_tensor) { - auto scale_tensor_dim = scale_tensor->dims(); + auto scale_tensor_dim = scale_tensor.dims(); PADDLE_ENFORCE_EQ( scale_tensor_dim.size(), 1, @@ -1389,7 +1389,7 @@ static void Interpolate3DInferShapeCheck( } if (out_size && config.is_runtime) { - auto out_size_dim = out_size->dims(); + auto out_size_dim = out_size.dims(); PADDLE_ENFORCE_EQ( out_size_dim.size(), 1, @@ -1419,9 +1419,9 @@ static void Interpolate3DInferShapeCheck( void InterpolateInferMeta( const MetaTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const MetaTensor& out_size, + const paddle::optional>& size_tensor, + const MetaTensor& scale_tensor, const std::string& data_layout_str, int out_d, int out_h, @@ -1546,7 +1546,7 @@ void MomentumInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& velocity, const MetaTensor& learning_rate, - paddle::optional master_param, + const MetaTensor& master_param, float mu, bool use_nesterov, const std::string& regularization_method, @@ -1709,7 +1709,7 @@ void MultiplexInferMeta(const std::vector& ins, void PsroiPoolInferMeta(const MetaTensor& x, const MetaTensor& rois, - paddle::optional rois_num, + const MetaTensor& rois_num, int pooled_height, int pooled_width, int output_channels, @@ -1732,8 +1732,8 @@ void PsroiPoolInferMeta(const MetaTensor& x, errors::InvalidArgument( "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " "given as [(x1, y1, x2, y2), ...]")); - if (rois_num.get_ptr()) { - auto rois_num_dims = rois_num->dims(); + if (rois_num) { + auto rois_num_dims = rois_num.dims(); PADDLE_ENFORCE_EQ( rois_num_dims.size(), 1, @@ -1787,7 +1787,7 @@ void RmspropInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& moment, const MetaTensor& learning_rate, - paddle::optional mean_grad, + const MetaTensor& mean_grad, float epsilon, float decay, float momentum, @@ -1837,14 +1837,14 @@ void 
RmspropInferMeta(const MetaTensor& param, mean_square_out->set_dtype(mean_square.dtype()); if (centered) { mean_grad_out->set_dims(param_dim); - mean_grad_out->set_dtype(mean_grad.get_ptr()->dtype()); + mean_grad_out->set_dtype(mean_grad.dtype()); } } void RnnInferMeta(const MetaTensor& x, const std::vector& pre_state, const std::vector& weight_list, - paddle::optional sequence_length, + const MetaTensor& sequence_length, float dropout_prob, bool is_bidirec, int input_size, @@ -1867,7 +1867,7 @@ void RnnInferMeta(const MetaTensor& x, in_dims.size())); if (sequence_length) { - auto seq_dims = sequence_length->dims(); + auto seq_dims = sequence_length.dims(); PADDLE_ENFORCE_EQ( in_dims[1], seq_dims[0], @@ -1929,7 +1929,7 @@ void RnnInferMeta(const MetaTensor& x, void SgdInferMeta(const MetaTensor& param, const MetaTensor& learning_rate, const MetaTensor& grad, - paddle::optional master_param, + const MetaTensor& master_param, bool multi_precision, MetaTensor* param_out, MetaTensor* master_param_out) { @@ -2006,8 +2006,8 @@ void UnchangedMultiInferMeta(const std::vector& x, void WarpctcInferMeta(const MetaTensor& logits, const MetaTensor& label, - const paddle::optional logits_length, - const paddle::optional labels_length, + const MetaTensor& logits_length, + const MetaTensor& labels_length, int blank, bool norm_by_times, MetaTensor* warpctc_grad, @@ -2015,7 +2015,7 @@ void WarpctcInferMeta(const MetaTensor& logits, auto logits_dims = logits.dims(); int sequence_width = 0; - if (logits_length.is_initialized()) { + if (logits_length) { sequence_width = logits_dims[2]; } else { sequence_width = @@ -2069,8 +2069,8 @@ void WhereInferMeta(const MetaTensor& condition, void GraphReindexInferMeta(const MetaTensor& x, const MetaTensor& neighbors, const MetaTensor& count, - paddle::optional hashtable_value, - paddle::optional hashtable_index, + const MetaTensor& hashtable_value, + const MetaTensor& hashtable_index, bool flag_buffer_hashtable, MetaTensor* reindex_src, MetaTensor* reindex_dst, @@ -2100,8 +2100,8 @@ void GraphReindexInferMeta(const MetaTensor& x, GraphReindexShapeCheck(neighbors.dims(), "Neighbors"); GraphReindexShapeCheck(count.dims(), "Count"); if (flag_buffer_hashtable) { - GraphReindexShapeCheck(hashtable_value->dims(), "HashTable_Value"); - GraphReindexShapeCheck(hashtable_index->dims(), "HashTable_Index"); + GraphReindexShapeCheck(hashtable_value.dims(), "HashTable_Value"); + GraphReindexShapeCheck(hashtable_index.dims(), "HashTable_Index"); } reindex_src->set_dims({-1}); @@ -2112,18 +2112,17 @@ void GraphReindexInferMeta(const MetaTensor& x, out_nodes->set_dtype(x.dtype()); } -void GraphSampleNeighborsInferMeta( - const MetaTensor& row, - const MetaTensor& col_ptr, - const MetaTensor& x, - paddle::optional eids, - paddle::optional perm_buffer, - int sample_size, - bool return_eids, - bool flag_perm_buffer, - MetaTensor* out, - MetaTensor* out_count, - MetaTensor* out_eids) { +void GraphSampleNeighborsInferMeta(const MetaTensor& row, + const MetaTensor& col_ptr, + const MetaTensor& x, + const MetaTensor& eids, + const MetaTensor& perm_buffer, + int sample_size, + bool return_eids, + bool flag_perm_buffer, + MetaTensor* out, + MetaTensor* out_count, + MetaTensor* out_eids) { // GSN: GraphSampleNeighbors auto GSNShapeCheck = [](const phi::DDim& dims, std::string tensor_name) { if (dims.size() == 2) { @@ -2149,12 +2148,12 @@ void GraphSampleNeighborsInferMeta( GSNShapeCheck(col_ptr.dims(), "Col_Ptr"); GSNShapeCheck(x.dims(), "X"); if (return_eids) { - GSNShapeCheck(eids->dims(), 
"Eids"); + GSNShapeCheck(eids.dims(), "Eids"); out_eids->set_dims({-1}); out_eids->set_dtype(row.dtype()); } if (flag_perm_buffer) { - GSNShapeCheck(perm_buffer->dims(), "Perm_Buffer"); + GSNShapeCheck(perm_buffer.dims(), "Perm_Buffer"); } out->set_dims({-1}); @@ -2166,7 +2165,7 @@ void GraphSampleNeighborsInferMeta( void Yolov3LossInferMeta(const MetaTensor& x, const MetaTensor& gt_box, const MetaTensor& gt_label, - const paddle::optional gt_score, + const MetaTensor& gt_score, const std::vector& anchors, const std::vector& anchor_mask, int class_num, @@ -2271,8 +2270,8 @@ void Yolov3LossInferMeta(const MetaTensor& x, "But received class_num(%s) < 0", class_num)); - if (gt_score.get_ptr()) { - auto dim_gtscore = gt_score->dims(); + if (gt_score) { + auto dim_gtscore = gt_score.dims(); PADDLE_ENFORCE_EQ( dim_gtscore.size(), 2, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 65b5819b602ba..54c6fccceb9c1 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -76,8 +76,8 @@ void AdamInferMeta(const MetaTensor& param, const MetaTensor& moment2, const MetaTensor& beta1_pow, const MetaTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const MetaTensor& master_param, + const MetaTensor& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, @@ -99,8 +99,8 @@ void AdamwInferMeta(const MetaTensor& param, const MetaTensor& moment2, const MetaTensor& beta1_pow, const MetaTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const MetaTensor& master_param, + const MetaTensor& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, @@ -170,7 +170,7 @@ void BatchNormInferInferMeta(const MetaTensor& x, void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, - paddle::optional bias, + const MetaTensor& bias, MetaTensor* out, MetaConfig config = MetaConfig()); @@ -185,7 +185,7 @@ void ConcatInferMeta(const std::vector& x, void DeformableConvInferMeta(const MetaTensor& x, const MetaTensor& offset, const MetaTensor& filter, - paddle::optional mask, + const MetaTensor& mask, const std::vector& strides, const std::vector& paddings, const std::vector& dilations, @@ -198,9 +198,9 @@ void DeformableConvInferMeta(const MetaTensor& x, void HierarchicalSigmoidInferMeta(const MetaTensor& x, const MetaTensor& w, const MetaTensor& label, - paddle::optional path, - paddle::optional code, - paddle::optional bias, + const MetaTensor& path, + const MetaTensor& code, + const MetaTensor& bias, int num_classes, bool remote_prefetch, int trainer_id, @@ -214,9 +214,9 @@ void HierarchicalSigmoidInferMeta(const MetaTensor& x, void InterpolateInferMeta( const MetaTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const MetaTensor& out_size, + const paddle::optional>& size_tensor, + const MetaTensor& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -241,7 +241,7 @@ void MomentumInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& velocity, const MetaTensor& learning_rate, - paddle::optional master_param, + const MetaTensor& master_param, float mu, bool use_nesterov, const std::string& regularization_method, @@ -261,7 +261,7 @@ void MultiplexInferMeta(const std::vector& ins, void PsroiPoolInferMeta(const MetaTensor& x, const MetaTensor& rois, - paddle::optional rois_num, + const MetaTensor& rois_num, int 
pooled_height, int pooled_width, int output_channels, @@ -273,7 +273,7 @@ void RmspropInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& moment, const MetaTensor& learning_rate, - paddle::optional mean_grad, + const MetaTensor& mean_grad, float epsilon, float decay, float momentum, @@ -286,7 +286,7 @@ void RmspropInferMeta(const MetaTensor& param, void RnnInferMeta(const MetaTensor& x, const std::vector& pre_state, const std::vector& weight_list, - paddle::optional sequence_length, + const MetaTensor& sequence_length, float dropout_prob, bool is_bidirec, int input_size, @@ -303,7 +303,7 @@ void RnnInferMeta(const MetaTensor& x, void SgdInferMeta(const MetaTensor& param, const MetaTensor& learning_rate, const MetaTensor& grad, - paddle::optional master_param, + const MetaTensor& master_param, bool multi_precision, MetaTensor* param_out, MetaTensor* master_param_out); @@ -317,8 +317,8 @@ void UnchangedMultiInferMeta(const std::vector& x, void WarpctcInferMeta(const MetaTensor& logits, const MetaTensor& label, - const paddle::optional logits_length, - const paddle::optional labels_length, + const MetaTensor& logits_length, + const MetaTensor& labels_length, int blank, bool norm_by_times, MetaTensor* warpctc_grad, @@ -332,30 +332,29 @@ void WhereInferMeta(const MetaTensor& condition, void GraphReindexInferMeta(const MetaTensor& x, const MetaTensor& neighbors, const MetaTensor& count, - paddle::optional hashtable_value, - paddle::optional hashtable_index, + const MetaTensor& hashtable_value, + const MetaTensor& hashtable_index, bool flag_buffer_hashtable, MetaTensor* reindex_src, MetaTensor* reindex_dst, MetaTensor* out_nodes); -void GraphSampleNeighborsInferMeta( - const MetaTensor& row, - const MetaTensor& col_ptr, - const MetaTensor& x, - paddle::optional eids, - paddle::optional perm_buffer, - int sample_size, - bool return_eids, - bool flag_perm_buffer, - MetaTensor* out, - MetaTensor* out_count, - MetaTensor* out_eids); +void GraphSampleNeighborsInferMeta(const MetaTensor& row, + const MetaTensor& col_ptr, + const MetaTensor& x, + const MetaTensor& eids, + const MetaTensor& perm_buffer, + int sample_size, + bool return_eids, + bool flag_perm_buffer, + MetaTensor* out, + MetaTensor* out_count, + MetaTensor* out_eids); void Yolov3LossInferMeta(const MetaTensor& x, const MetaTensor& gt_box, const MetaTensor& gt_label, - const paddle::optional gt_score, + const MetaTensor& gt_score, const std::vector& anchors, const std::vector& anchor_mask, int class_num, diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index e3f946b247f09..58ae6b2058f9b 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -192,8 +192,8 @@ void ArangeInferMeta(const MetaTensor& start, } void InstanceNormInferMeta(const MetaTensor& x, - paddle::optional scale, - paddle::optional bias, + const MetaTensor& scale, + const MetaTensor& bias, float epsilon, MetaTensor* y, MetaTensor* saved_mean, @@ -242,9 +242,8 @@ void InstanceNormInferMeta(const MetaTensor& x, auto N = x_dims[0]; auto C = x_dims[1]; auto NxC = N * C; - const auto scale_ptr = scale.get_ptr(); - if (scale_ptr) { - auto scale_dim = scale_ptr->dims(); + if (scale) { + auto scale_dim = scale.dims(); PADDLE_ENFORCE_EQ( scale_dim.size(), 1UL, @@ -265,9 +264,8 @@ void InstanceNormInferMeta(const MetaTensor& x, scale_dim[0])); } } - const auto bias_ptr = bias.get_ptr(); - if (bias_ptr) { - auto bias_dim = bias_ptr->dims(); + if (bias) { + auto bias_dim = bias.dims(); 
PADDLE_ENFORCE_EQ( bias_dim.size(), 1UL, @@ -365,8 +363,8 @@ void GraphSendRecvInferMeta(const MetaTensor& x, } void LayerNormInferMeta(const MetaTensor& x, - paddle::optional scale, - paddle::optional bias, + const MetaTensor& scale, + const MetaTensor& bias, float epsilon, int begin_norm_axis, bool is_test, @@ -388,19 +386,19 @@ void LayerNormInferMeta(const MetaTensor& x, auto matrix_dim = phi::flatten_to_2d(x_dim, begin_norm_axis); int left = static_cast(matrix_dim[0]); int right = static_cast(matrix_dim[1]); - if (scale.get_ptr() != nullptr) { - PADDLE_ENFORCE_EQ(scale->dims().size(), + if (scale) { + PADDLE_ENFORCE_EQ(scale.dims().size(), 1, phi::errors::InvalidArgument( "The dimensions of Input(Scale) must be 1, but " "received dimensions of" "Input(Scale) is [%d]", - scale->dims().size())); + scale.dims().size())); } - if (config.is_runtime && scale.get_ptr() != nullptr) { + if (config.is_runtime && scale) { PADDLE_ENFORCE_EQ( - scale->dims()[0], + scale.dims()[0], right, phi::errors::InvalidArgument( "The first dimension value of Input(Scale) must equal to be the" @@ -408,21 +406,21 @@ void LayerNormInferMeta(const MetaTensor& x, "But received the first dimension value of Input(Scale) is" "[%d], the second dimension value of the flattened 2D matrix of" " Input(Scale) is [%d].", - scale->dims()[0], + scale.dims()[0], right)); } - if (bias.get_ptr() != nullptr) { - PADDLE_ENFORCE_EQ(bias->dims().size(), + if (bias) { + PADDLE_ENFORCE_EQ(bias.dims().size(), 1, phi::errors::InvalidArgument( "The dimensions of Input(Bias) must be 1, but " "received dimensions of" "Input(Bias) is [%d]", - bias->dims().size())); + bias.dims().size())); } - if (config.is_runtime && bias.get_ptr() != nullptr) { + if (config.is_runtime && bias) { PADDLE_ENFORCE_EQ( - bias->dims()[0], + bias.dims()[0], right, phi::errors::InvalidArgument( "The first dimension value of Input(Bias) must equal to be the" @@ -430,7 +428,7 @@ void LayerNormInferMeta(const MetaTensor& x, "But received the first dimension value of Input(Bias) is" "[%d], the second dimension value of the flattened 2D matrix of" " Input(Bias) is [%d].", - bias->dims()[0], + bias.dims()[0], right)); } @@ -445,19 +443,19 @@ void LayerNormInferMeta(const MetaTensor& x, } void LayerNormGradInferMeta(const MetaTensor& x, - paddle::optional y, - paddle::optional z, + const MetaTensor& y, + const MetaTensor& z, MetaTensor* dx, MetaTensor* dy, MetaTensor* dz) { if (dx) { dx->share_meta(x); } - if (dy && (y.get_ptr() != nullptr)) { - dy->share_meta(*y.get_ptr()); + if (dy && y) { + dy->share_meta(y); } - if (dz && (z.get_ptr() != nullptr)) { - dz->share_meta(*z.get_ptr()); + if (dz && z) { + dz->share_meta(z); } } @@ -517,7 +515,7 @@ void LinspaceInferMeta(const MetaTensor& start, void NllLossRawInferMeta(const MetaTensor& input, const MetaTensor& label, - paddle::optional weight, + const MetaTensor& weight, int64_t ignore_index, const std::string& reduction, MetaTensor* out, @@ -542,8 +540,8 @@ void NllLossRawInferMeta(const MetaTensor& input, " batch_size is [%s].", x_dims[0], label_dims[0])); - if (weight.get_ptr() != nullptr) { - auto w_dims = weight->dims(); + if (weight) { + auto w_dims = weight.dims(); PADDLE_ENFORCE_EQ( w_dims.size(), 1, @@ -607,7 +605,7 @@ void PutAlongAxisInferMeta(const MetaTensor& x, void RoiAlignInferMeta(const MetaTensor& x, const MetaTensor& boxes, - paddle::optional boxes_num, + const MetaTensor& boxes_num, int pooled_height, int pooled_width, float spatial_scale, @@ -619,7 +617,7 @@ void RoiAlignInferMeta(const MetaTensor& x, 
auto boxes_dims = boxes.dims(); if (boxes_num) { - auto boxes_num_dims = boxes_num->dims(); + auto boxes_num_dims = boxes_num.dims(); PADDLE_ENFORCE_EQ( boxes_num_dims.size(), 1, @@ -684,7 +682,7 @@ void RoiAlignInferMeta(const MetaTensor& x, void RoiPoolInferMeta(const MetaTensor& x, const MetaTensor& boxes, - paddle::optional boxes_num, + const MetaTensor& boxes_num, int pooled_height, int pooled_width, float spatial_scale, @@ -694,7 +692,7 @@ void RoiPoolInferMeta(const MetaTensor& x, auto boxes_dims = boxes.dims(); if (boxes_num) { - auto boxes_num_dims = boxes_num->dims(); + auto boxes_num_dims = boxes_num.dims(); PADDLE_ENFORCE_EQ( boxes_num_dims.size(), 1, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index b2fb30a4da2d6..760011ad829fc 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -53,8 +53,8 @@ void ArangeInferMeta(const MetaTensor& start, MetaTensor* out); void InstanceNormInferMeta(const MetaTensor& x, - paddle::optional scale, - paddle::optional bias, + const MetaTensor& scale, + const MetaTensor& bias, float epsilon, MetaTensor* y, MetaTensor* saved_mean, @@ -70,8 +70,8 @@ void GraphSendRecvInferMeta(const MetaTensor& x, MetaTensor* dst_count); void LayerNormInferMeta(const MetaTensor& x, - paddle::optional scale, - paddle::optional bias, + const MetaTensor& scale, + const MetaTensor& bias, float epsilon, int begin_norm_axis, bool is_test, @@ -81,8 +81,8 @@ void LayerNormInferMeta(const MetaTensor& x, MetaConfig config = MetaConfig()); void LayerNormGradInferMeta(const MetaTensor& x, - paddle::optional y, - paddle::optional z, + const MetaTensor& y, + const MetaTensor& z, MetaTensor* dx, MetaTensor* dy, MetaTensor* dz); @@ -105,7 +105,7 @@ void LinspaceInferMeta(const MetaTensor& start, void NllLossRawInferMeta(const MetaTensor& input, const MetaTensor& label, - paddle::optional weight, + const MetaTensor& weight, int64_t ignore_index, const std::string& reduction, MetaTensor* out, @@ -121,7 +121,7 @@ void PutAlongAxisInferMeta(const MetaTensor& x, void RoiAlignInferMeta(const MetaTensor& x, const MetaTensor& boxes, - paddle::optional boxes_num, + const MetaTensor& boxes_num, int pooled_height, int pooled_width, float spatial_scale, @@ -132,7 +132,7 @@ void RoiAlignInferMeta(const MetaTensor& x, void RoiPoolInferMeta(const MetaTensor& x, const MetaTensor& boxes, - paddle::optional boxes_num, + const MetaTensor& boxes_num, int pooled_height, int pooled_width, float spatial_scale, diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index 5d7af6cca947a..8e63a0fd22ade 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -137,7 +137,7 @@ void SigmoidTripleGradKernel(const Context& dev_ctx, const DenseTensor& dout, const DenseTensor& ddx, const DenseTensor& d_dout_new, - paddle::optional d_ddout, + const paddle::optional& d_ddout, DenseTensor* d_out_new, DenseTensor* d_dout, DenseTensor* d_ddx); diff --git a/paddle/phi/kernels/adam_kernel.h b/paddle/phi/kernels/adam_kernel.h index f144d40d2b666..0bdf05f8e5123 100644 --- a/paddle/phi/kernels/adam_kernel.h +++ b/paddle/phi/kernels/adam_kernel.h @@ -28,8 +28,8 @@ void AdamDenseKernel(const Context& dev_ctx, const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const 
Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/adamw_kernel.h b/paddle/phi/kernels/adamw_kernel.h index d7b072adda4a2..5cbb38143ff6f 100644 --- a/paddle/phi/kernels/adamw_kernel.h +++ b/paddle/phi/kernels/adamw_kernel.h @@ -28,8 +28,8 @@ void AdamwDenseKernel(const Context& dev_ctx, const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index 720ebb5b78c9a..5ed9d72a503a5 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -31,7 +31,7 @@ void AssignKernel(const Context& dev_ctx, template void AssignRawKernel(const Context& dev_ctx, - paddle::optional x, + const paddle::optional& x, DenseTensor* out) { if (x) { if (!x->IsInitialized()) { diff --git a/paddle/phi/kernels/assign_kernel.h b/paddle/phi/kernels/assign_kernel.h index 6881ac9f0ee22..0294dc950deb1 100644 --- a/paddle/phi/kernels/assign_kernel.h +++ b/paddle/phi/kernels/assign_kernel.h @@ -31,7 +31,7 @@ void AssignKernel(const Context& dev_ctx, // this looks weird template void AssignRawKernel(const Context& dev_ctx, - paddle::optional x, + const paddle::optional& x, DenseTensor* out); template diff --git a/paddle/phi/kernels/batch_norm_grad_kernel.h b/paddle/phi/kernels/batch_norm_grad_kernel.h index 2cb3b16a022b1..3de2f69f452db 100644 --- a/paddle/phi/kernels/batch_norm_grad_kernel.h +++ b/paddle/phi/kernels/batch_norm_grad_kernel.h @@ -24,11 +24,11 @@ void BatchNormGradRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& scale, const DenseTensor& bias, - paddle::optional mean, - paddle::optional variance, + const paddle::optional& mean, + const paddle::optional& variance, const DenseTensor& saved_mean, const DenseTensor& saved_variance, - paddle::optional reserve_space, + const paddle::optional& reserve_space, const DenseTensor& y_grad, float momentum, float epsilon, @@ -47,11 +47,11 @@ void BatchNormGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& scale, const DenseTensor& bias, - paddle::optional mean, - paddle::optional variance, + const paddle::optional& mean, + const paddle::optional& variance, const DenseTensor& saved_mean, const DenseTensor& saved_variance, - paddle::optional reserve_space, + const paddle::optional& reserve_space, const DenseTensor& y_grad, float momentum, float epsilon, @@ -68,8 +68,8 @@ template void BatchNormDoubleGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& scale, - paddle::optional mean, - paddle::optional variance, + const paddle::optional& mean, + const paddle::optional& variance, const DenseTensor& saved_mean, const DenseTensor& saved_variance, const DenseTensor& y_grad, diff --git a/paddle/phi/kernels/bilinear_tensor_product_kernel.h b/paddle/phi/kernels/bilinear_tensor_product_kernel.h index b34e8946ddd58..bd01ed94868e2 100644 --- a/paddle/phi/kernels/bilinear_tensor_product_kernel.h +++ b/paddle/phi/kernels/bilinear_tensor_product_kernel.h @@ -24,7 +24,7 @@ void BilinearTensorProductKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const DenseTensor& weight, - paddle::optional bias, + const paddle::optional& bias, DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/bincount_kernel.h 
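The kernel declarations in this stretch of the patch all make the same change: an optional tensor parameter is no longer an optional holding a const reference, it is a const reference to an optional that holds the tensor by value. A hedged, self-contained sketch of the new calling convention using std::optional (which does not allow reference payloads at all) and placeholder names, not the real kernel API:

#include <optional>
#include <vector>

struct DenseLike { std::vector<float> data; };

// New style: the optional wrapper is taken by const reference, so an engaged
// argument is not copied at the call boundary and an absent one is std::nullopt.
float SumWithMaster(const DenseLike& param,
                    const std::optional<DenseLike>& master_param) {
  float s = 0.f;
  for (float v : param.data) s += v;
  if (master_param) {                      // only touch it when provided
    for (float v : master_param->data) s += v;
  }
  return s;
}

int main() {
  DenseLike p{{1.f, 2.f}};
  DenseLike m{{3.f}};
  float a = SumWithMaster(p, m);             // wraps m in a temporary optional (one copy)
  float b = SumWithMaster(p, std::nullopt);  // no master parameter
  return (a == 6.f && b == 3.f) ? 0 : 1;
}

Where the owning optional is built in the patched code (PrepareData, OptionalInputAt) is visible in the earlier hunks; the kernels themselves only ever see it by const reference.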
b/paddle/phi/kernels/bincount_kernel.h index 3ba69d365480f..e110b6e014b4d 100644 --- a/paddle/phi/kernels/bincount_kernel.h +++ b/paddle/phi/kernels/bincount_kernel.h @@ -21,7 +21,7 @@ namespace phi { template void BincountKernel(const Context& dev_ctx, const DenseTensor& x, - const paddle::optional weights, + const paddle::optional& weights, int minlength, DenseTensor* out); diff --git a/paddle/phi/kernels/conv_grad_grad_kernel.h b/paddle/phi/kernels/conv_grad_grad_kernel.h index 0a359d778a681..799c8721c3cff 100644 --- a/paddle/phi/kernels/conv_grad_grad_kernel.h +++ b/paddle/phi/kernels/conv_grad_grad_kernel.h @@ -23,8 +23,8 @@ void ConvGradGradKernel(const Context& dev_ctx, const DenseTensor& input, const DenseTensor& filter, const DenseTensor& out_grad, - paddle::optional input_grad_grad, - paddle::optional filter_grad_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const std::vector& strides, const std::vector& paddings, const std::string& paddding_algorithm, @@ -40,8 +40,8 @@ void ConvGradGradKernel(const Context& dev_ctx, template void Conv3DGradGradKernel(const Context& dev_ctx, - paddle::optional input_grad_grad, - paddle::optional filter_grad_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const DenseTensor& out_grad, const DenseTensor& input, const DenseTensor& filter, diff --git a/paddle/phi/kernels/cpu/adam_kernel.cc b/paddle/phi/kernels/cpu/adam_kernel.cc index 1e0f5c4df9fd6..339d690310f45 100644 --- a/paddle/phi/kernels/cpu/adam_kernel.cc +++ b/paddle/phi/kernels/cpu/adam_kernel.cc @@ -36,8 +36,8 @@ void AdamDenseKernel(const Context& dev_ctx, const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/cpu/adamw_kernel.cc b/paddle/phi/kernels/cpu/adamw_kernel.cc index f2c98fded4d4f..93092133291af 100644 --- a/paddle/phi/kernels/cpu/adamw_kernel.cc +++ b/paddle/phi/kernels/cpu/adamw_kernel.cc @@ -35,8 +35,8 @@ void AdamwDenseKernel(const Context& dev_ctx, const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc index bf01c24f4ffa3..366a08e59fee3 100644 --- a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc @@ -41,11 +41,11 @@ void BatchNormGradRawKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& scale, const DenseTensor& bias, - paddle::optional mean, - paddle::optional variance, + const paddle::optional& mean, + const paddle::optional& variance, const DenseTensor& saved_mean, const DenseTensor& saved_variance, - paddle::optional reserve_space, + const paddle::optional& reserve_space, const DenseTensor& y_grad, float momentum, float epsilon, @@ -300,11 +300,11 @@ void BatchNormGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& scale, const DenseTensor& bias, - paddle::optional mean, - paddle::optional variance, + const paddle::optional& mean, + const paddle::optional& variance, 
const DenseTensor& saved_mean, const DenseTensor& saved_variance, - paddle::optional reserve_space, + const paddle::optional& reserve_space, const DenseTensor& y_grad, float momentum, float epsilon, @@ -343,8 +343,8 @@ template void BatchNormDoubleGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& scale, - paddle::optional mean, - paddle::optional variance, + const paddle::optional& mean, + const paddle::optional& variance, const DenseTensor& saved_mean, const DenseTensor& saved_variance, const DenseTensor& y_grad, diff --git a/paddle/phi/kernels/cpu/bincount_kernel.cc b/paddle/phi/kernels/cpu/bincount_kernel.cc index c9dc44c1e04eb..8163953c1e00e 100644 --- a/paddle/phi/kernels/cpu/bincount_kernel.cc +++ b/paddle/phi/kernels/cpu/bincount_kernel.cc @@ -23,7 +23,7 @@ namespace phi { template void BincountInner(const Context& dev_ctx, const DenseTensor& x, - const paddle::optional weights, + const paddle::optional& weights, int minlength, DenseTensor* out) { const DenseTensor* input = &x; @@ -85,7 +85,7 @@ void BincountInner(const Context& dev_ctx, template void BincountKernel(const Context& dev_ctx, const DenseTensor& x, - const paddle::optional weights, + const paddle::optional& weights, int minlength, DenseTensor* out) { if (x.dtype() == DataType::INT32) { diff --git a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc index 4966c998dd37d..c52f2614150d8 100644 --- a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc @@ -21,8 +21,8 @@ namespace phi { template void Conv3DGradGradKernel(const Context& ctx, - paddle::optional input_grad_grad, - paddle::optional filter_grad_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const DenseTensor& out_grad, const DenseTensor& input, const DenseTensor& filter, diff --git a/paddle/phi/kernels/cpu/dropout_kernel.cc b/paddle/phi/kernels/cpu/dropout_kernel.cc index c00aedef8c67d..fa12e505e4209 100644 --- a/paddle/phi/kernels/cpu/dropout_kernel.cc +++ b/paddle/phi/kernels/cpu/dropout_kernel.cc @@ -23,7 +23,7 @@ namespace phi { template void DropoutRawKernel(const Context& dev_ctx, const DenseTensor& x, - paddle::optional seed_tensor, + const paddle::optional& seed_tensor, float p, bool is_test, const std::string& mode, diff --git a/paddle/phi/kernels/cpu/elementwise_add_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_add_grad_kernel.cc index f8a89b997b413..434866b840cc3 100644 --- a/paddle/phi/kernels/cpu/elementwise_add_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_add_grad_kernel.cc @@ -63,8 +63,8 @@ template void AddDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, int axis, DenseTensor* ddout) { phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); diff --git a/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc index b86ead04dbc5f..03bb47aaa97b3 100644 --- a/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc @@ -39,8 +39,8 @@ template void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, int axis, DenseTensor* ddout) { 
phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); diff --git a/paddle/phi/kernels/cpu/graph_reindex_kernel.cc b/paddle/phi/kernels/cpu/graph_reindex_kernel.cc index d6454b4796430..c0a88f3222717 100644 --- a/paddle/phi/kernels/cpu/graph_reindex_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_reindex_kernel.cc @@ -27,8 +27,8 @@ void GraphReindexKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& neighbors, const DenseTensor& count, - paddle::optional hashtable_value, - paddle::optional hashtable_index, + const paddle::optional& hashtable_value, + const paddle::optional& hashtable_index, bool flag_buffer_hashtable, DenseTensor* reindex_src, DenseTensor* reindex_dst, diff --git a/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc b/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc index b4321a85ab2ee..70aac053417b8 100644 --- a/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc @@ -167,8 +167,8 @@ void GraphSampleNeighborsKernel( const DenseTensor& row, const DenseTensor& col_ptr, const DenseTensor& x, - paddle::optional eids, - paddle::optional perm_buffer, + const paddle::optional& eids, + const paddle::optional& perm_buffer, int sample_size, bool return_eids, bool flag_perm_buffer, diff --git a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc index 95eeb64afea20..6ea65d005c1ad 100644 --- a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc @@ -121,8 +121,8 @@ void GraphSendRecvGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& src_index, const DenseTensor& dst_index, - paddle::optional out, - paddle::optional dst_count, + const paddle::optional& out, + const paddle::optional& dst_count, const DenseTensor& out_grad, const std::string& pool_type, DenseTensor* x_grad) { diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h index cc67f8e7f210c..9b38095f25f75 100644 --- a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h +++ b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h @@ -31,9 +31,9 @@ void HierarchicalSigmoidGradKernelImpl( const DenseTensor& x, const DenseTensor& w, const DenseTensor& label, - paddle::optional path, - paddle::optional code, - paddle::optional bias, + const paddle::optional& path, + const paddle::optional& code, + const paddle::optional& bias, const DenseTensor& pre_out, const DenseTensor& out_grad, int num_classes, diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc index 9edc9f87d4b1f..eee4525293f3f 100644 --- a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc @@ -25,9 +25,9 @@ void HierarchicalSigmoidGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& w, const DenseTensor& label, - paddle::optional path, - paddle::optional code, - paddle::optional bias, + const paddle::optional& path, + const paddle::optional& code, + const paddle::optional& bias, const DenseTensor& pre_out, const DenseTensor& out_grad, int num_classes, diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc b/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc index 4c4f1aa125a33..7c3421e88d449 100644 --- a/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc +++ 
b/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc @@ -32,9 +32,9 @@ void HierarchicalSigmoidKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& w, const DenseTensor& label, - paddle::optional path, - paddle::optional code, - paddle::optional bias, + const paddle::optional& path, + const paddle::optional& code, + const paddle::optional& bias, int num_classes, bool remote_prefetch, int trainer_id, diff --git a/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc index dcb4289ae8d75..340d2907a7909 100644 --- a/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/instance_norm_grad_kernel.cc @@ -43,7 +43,7 @@ template void InstanceNormGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& d_y, - paddle::optional scale, + const paddle::optional& scale, const DenseTensor& saved_mean, const DenseTensor& saved_variance, float epsilon, @@ -153,13 +153,13 @@ void InstanceNormGradKernel(const Context& dev_ctx, template void InstanceNormDoubleGradKernel(const Context& dev_ctx, const DenseTensor& x, - paddle::optional scale, + const paddle::optional& scale, const DenseTensor& saved_mean, const DenseTensor& saved_variance, const DenseTensor& dy, - paddle::optional ddx, - paddle::optional ddscale, - paddle::optional ddbias, + const paddle::optional& ddx, + const paddle::optional& ddscale, + const paddle::optional& ddbias, float epsilon, DenseTensor* dx, DenseTensor* dscale, diff --git a/paddle/phi/kernels/cpu/instance_norm_kernel.cc b/paddle/phi/kernels/cpu/instance_norm_kernel.cc index f89ecba901c04..5eac473effa0e 100644 --- a/paddle/phi/kernels/cpu/instance_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/instance_norm_kernel.cc @@ -30,8 +30,8 @@ namespace phi { template void InstanceNormKernel(const Context& dev_ctx, const DenseTensor& x, - paddle::optional scale, - paddle::optional bias, + const paddle::optional& scale, + const paddle::optional& bias, float epsilon_f, DenseTensor* y, DenseTensor* saved_mean, diff --git a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc index 550439a5251db..d4e13aa3b24fe 100644 --- a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc @@ -361,9 +361,9 @@ template static void Interpolate1DCPUBwd( const Context& dev_ctx, const DenseTensor& input, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& output_grad, const std::string& data_layout_str, int out_w, @@ -459,9 +459,9 @@ template static void Interpolate2DCPUBwd( const Context& dev_ctx, const DenseTensor& input, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& output_grad, const std::string& data_layout_str, int out_h, @@ -619,9 +619,9 @@ template static void Interpolate3DCPUBwd( const Context& dev_ctx, const DenseTensor& input, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& output_grad, const std::string& data_layout_str, int out_d, @@ -800,9 +800,9 @@ template 
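A typical consumer-side pattern for the optional scale and bias inputs that the instance-norm and layer-norm kernels in this area now take as const paddle::optional<DenseTensor>&: use the value when it is present, otherwise fall back to a neutral default. The sketch below shows only that control flow, with std::optional and plain vectors standing in for the Paddle types (the broadcasting rule here is made up for illustration):

#include <optional>
#include <vector>

std::vector<float> ApplyScale(const std::vector<float>& x,
                              const std::optional<std::vector<float>>& scale) {
  // A missing scale behaves like a scale of all ones.
  std::vector<float> y(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    const float s = scale ? (*scale)[i % scale->size()] : 1.f;
    y[i] = x[i] * s;
  }
  return y;
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f};
  auto with_default = ApplyScale(x, std::nullopt);                  // identity
  auto with_scale   = ApplyScale(x, std::vector<float>{2.f, 2.f});  // scaled
  return (with_default[3] == 4.f && with_scale[0] == 2.f) ? 0 : 1;
}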
void InterpolateGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& output_grad, const std::string& data_layout, int out_d, @@ -867,9 +867,9 @@ template void BilinearInterpGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, @@ -901,9 +901,9 @@ template void NearestInterpGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, @@ -935,9 +935,9 @@ template void TrilinearInterpGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, @@ -969,9 +969,9 @@ template void LinearInterpGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, @@ -1003,9 +1003,9 @@ template void BicubicInterpGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, diff --git a/paddle/phi/kernels/cpu/interpolate_kernel.cc b/paddle/phi/kernels/cpu/interpolate_kernel.cc index da9a54748f06f..5259a770568e4 100644 --- a/paddle/phi/kernels/cpu/interpolate_kernel.cc +++ b/paddle/phi/kernels/cpu/interpolate_kernel.cc @@ -504,9 +504,9 @@ template static void Interpolate1DCPUFwd( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout_str, int out_w, const std::vector& scale, @@ -603,9 +603,9 @@ template static void Interpolate2DCPUFwd( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout_str, int out_h, int out_w, @@ -770,9 +770,9 @@ template static void Interpolate3DCPUFwd( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& 
out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout_str, int out_d, int out_h, @@ -966,9 +966,9 @@ template void InterpolateKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -1029,9 +1029,9 @@ template void BilinearInterpKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -1061,9 +1061,9 @@ template void NearestInterpKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -1093,9 +1093,9 @@ template void TrilinearInterpKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -1125,9 +1125,9 @@ template void LinearInterpKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -1157,9 +1157,9 @@ template void BicubicInterpKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, diff --git a/paddle/phi/kernels/cpu/label_smooth_kernel.cc b/paddle/phi/kernels/cpu/label_smooth_kernel.cc index c76fb826cdfcc..af9548e8186bc 100644 --- a/paddle/phi/kernels/cpu/label_smooth_kernel.cc +++ b/paddle/phi/kernels/cpu/label_smooth_kernel.cc @@ -22,7 +22,7 @@ namespace phi { template void LabelSmoothKernel(const Context& ctx, const DenseTensor& label, - paddle::optional prior_dist, + const paddle::optional& prior_dist, float epsilon, DenseTensor* out) { auto label_dim = label.dims()[label.dims().size() - 1]; diff --git a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc index 7c1b33f047b61..a30f54fd4b60e 100644 --- a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc @@ -32,8 +32,8 @@ namespace phi { template void LayerNormGradKernel(const Context& dev_ctx, const DenseTensor& x, - paddle::optional scale_opt, - paddle::optional bias_opt, + const paddle::optional& scale_opt, + const paddle::optional& bias_opt, const DenseTensor& mean, const DenseTensor& variance, const DenseTensor& out_grad, diff --git a/paddle/phi/kernels/cpu/layer_norm_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_kernel.cc 
index 5b09d68c7ca08..52722468e16bd 100644 --- a/paddle/phi/kernels/cpu/layer_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/layer_norm_kernel.cc @@ -30,8 +30,8 @@ namespace phi { template void LayerNormKernel(const Context& dev_ctx, const DenseTensor& x, - paddle::optional scale_opt, - paddle::optional bias_opt, + const paddle::optional& scale_opt, + const paddle::optional& bias_opt, float epsilon, int begin_norm_axis, bool is_test, diff --git a/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc index 5b859b6ec270e..dd2b09ee39acb 100644 --- a/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/nll_loss_grad_kernel.cc @@ -121,7 +121,7 @@ template void NllLossGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& labels, - paddle::optional weight, + const paddle::optional& weight, const DenseTensor& total_weight, const DenseTensor& d_out, int64_t ignore_index, diff --git a/paddle/phi/kernels/cpu/nll_loss_kernel.cc b/paddle/phi/kernels/cpu/nll_loss_kernel.cc index 334b0082bde57..92cb6a1ad17de 100644 --- a/paddle/phi/kernels/cpu/nll_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/nll_loss_kernel.cc @@ -154,7 +154,7 @@ template void NllLossRawKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& labels, - paddle::optional weight, + const paddle::optional& weight, int64_t ignore_index, const std::string& reduction, DenseTensor* out, diff --git a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc index 715e6b008ed77..b68c3ad545d33 100644 --- a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc @@ -24,7 +24,7 @@ template void PsroiPoolGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& rois, - paddle::optional rois_num, + const paddle::optional& rois_num, const DenseTensor& dout, int pooled_height, int pooled_width, diff --git a/paddle/phi/kernels/cpu/psroi_pool_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc index 06cd03395d965..4f7925ad00f5a 100644 --- a/paddle/phi/kernels/cpu/psroi_pool_kernel.cc +++ b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc @@ -23,7 +23,7 @@ template void PsroiPoolKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& rois, - paddle::optional rois_num, + const paddle::optional& rois_num, int pooled_height, int pooled_width, int output_channels, diff --git a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc index 9b5e5cb5443b1..4dd1894320af7 100644 --- a/paddle/phi/kernels/cpu/rnn_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_grad_kernel.cc @@ -969,7 +969,7 @@ void RnnGradFunc(const CPUContext& dev_ctx, const DenseTensor& x, const std::vector& pre_state, const std::vector& weight_list, - paddle::optional sequence_length, + const paddle::optional& sequence_length, const DenseTensor& out, const DenseTensor& dropout_state, const DenseTensor& reserve, @@ -1244,7 +1244,7 @@ void RnnGradKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& pre_state, const std::vector& weight_list, - paddle::optional sequence_length, + const paddle::optional& sequence_length, const DenseTensor& out, const DenseTensor& dropout_state, const DenseTensor& reserve, diff --git a/paddle/phi/kernels/cpu/rnn_kernel.cc b/paddle/phi/kernels/cpu/rnn_kernel.cc index ae2c7a72635f7..80c521918ed07 100644 --- a/paddle/phi/kernels/cpu/rnn_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_kernel.cc @@ -819,7 +819,7 @@ void 
RnnKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& pre_state, const std::vector& weight_list, - paddle::optional sequence_length, + const paddle::optional& sequence_length, float dropout_prob, bool is_bidirec, int input_size, diff --git a/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc index a91b8b6c1fcd3..ea01121509f1a 100644 --- a/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc @@ -73,7 +73,7 @@ template void RoiAlignGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, const DenseTensor& out_grad, int pooled_height, int pooled_width, diff --git a/paddle/phi/kernels/cpu/roi_align_kernel.cc b/paddle/phi/kernels/cpu/roi_align_kernel.cc index 4752a9b3a48fd..cd779b72e7a84 100644 --- a/paddle/phi/kernels/cpu/roi_align_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_align_kernel.cc @@ -182,7 +182,7 @@ template void RoiAlignKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, int pooled_height, int pooled_width, float spatial_scale, diff --git a/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc index 0eaa873590eb0..f2fcfa5648d3f 100644 --- a/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc @@ -25,7 +25,7 @@ template void RoiPoolGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, const DenseTensor& arg_max, const DenseTensor& out_grad, int pooled_height, diff --git a/paddle/phi/kernels/cpu/roi_pool_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_kernel.cc index 02020354cd357..e088e9a2831cb 100644 --- a/paddle/phi/kernels/cpu/roi_pool_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_pool_kernel.cc @@ -24,7 +24,7 @@ template void RoiPoolKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, int pooled_height, int pooled_width, float spatial_scale, diff --git a/paddle/phi/kernels/cpu/sgd_kernel.cc b/paddle/phi/kernels/cpu/sgd_kernel.cc index c7b4074c70aaa..214fd82bef358 100644 --- a/paddle/phi/kernels/cpu/sgd_kernel.cc +++ b/paddle/phi/kernels/cpu/sgd_kernel.cc @@ -118,7 +118,7 @@ void SGDDenseKernel(const Context& dev_ctx, const DenseTensor& param, const DenseTensor& learning_rate, const DenseTensor& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision, DenseTensor* param_out, DenseTensor* master_param_out) { @@ -132,7 +132,7 @@ void SGDDenseParamSparseGradKernel( const DenseTensor& param, const DenseTensor& learning_rate, const SelectedRows& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision, DenseTensor* param_out, DenseTensor* master_param_out) { @@ -146,7 +146,7 @@ void SGDSparseParamSparseGradKernel( const SelectedRows& param, const DenseTensor& learning_rate, const SelectedRows& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision, SelectedRows* param_out, SelectedRows* master_param_out) { diff --git a/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc index acd9a99cef4de..383009229f9a1 100644 --- 
a/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/yolov3_loss_grad_kernel.cc @@ -121,7 +121,7 @@ void Yolov3LossGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& gt_box, const DenseTensor& gt_label, - paddle::optional gt_score, + const paddle::optional& gt_score, const DenseTensor& loss_grad, const DenseTensor& objectness_mask, const DenseTensor& gt_match_mask, diff --git a/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc b/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc index 6df910eea02a9..8a190ab25a7b2 100644 --- a/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/yolov3_loss_kernel.cc @@ -182,7 +182,7 @@ void Yolov3LossKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& gt_box, const DenseTensor& gt_label, - paddle::optional gt_score, + const paddle::optional& gt_score, const std::vector& anchors, const std::vector& anchor_mask, int class_num, diff --git a/paddle/phi/kernels/deformable_conv_grad_kernel.h b/paddle/phi/kernels/deformable_conv_grad_kernel.h index 85786cec4c3e5..04fe7904a4509 100644 --- a/paddle/phi/kernels/deformable_conv_grad_kernel.h +++ b/paddle/phi/kernels/deformable_conv_grad_kernel.h @@ -23,7 +23,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& offset, const DenseTensor& filter, - paddle::optional mask, + const paddle::optional& mask, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, diff --git a/paddle/phi/kernels/deformable_conv_kernel.h b/paddle/phi/kernels/deformable_conv_kernel.h index fbbe5f62c6a29..7b66e506b8928 100644 --- a/paddle/phi/kernels/deformable_conv_kernel.h +++ b/paddle/phi/kernels/deformable_conv_kernel.h @@ -24,7 +24,7 @@ void DeformableConvKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& offset, const DenseTensor& filter, - paddle::optional mask, + const paddle::optional& mask, const std::vector& strides, const std::vector& paddings, const std::vector& dilations, diff --git a/paddle/phi/kernels/dropout_kernel.h b/paddle/phi/kernels/dropout_kernel.h index dc9f89e08e17a..6febcd78e1107 100644 --- a/paddle/phi/kernels/dropout_kernel.h +++ b/paddle/phi/kernels/dropout_kernel.h @@ -22,7 +22,7 @@ namespace phi { template void DropoutRawKernel(const Context& dev_ctx, const DenseTensor& x, - paddle::optional seed_tensor, + const paddle::optional& seed_tensor, float p, bool is_test, const std::string& mode, diff --git a/paddle/phi/kernels/elementwise_add_grad_kernel.h b/paddle/phi/kernels/elementwise_add_grad_kernel.h index 9b754cfefe365..8fc31c8878b6a 100644 --- a/paddle/phi/kernels/elementwise_add_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_add_grad_kernel.h @@ -32,8 +32,8 @@ template void AddDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, int axis, DenseTensor* ddout); diff --git a/paddle/phi/kernels/elementwise_divide_grad_kernel.h b/paddle/phi/kernels/elementwise_divide_grad_kernel.h index 6d29dae99a131..c764f05c3983f 100644 --- a/paddle/phi/kernels/elementwise_divide_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_divide_grad_kernel.h @@ -34,8 +34,8 @@ void DivideDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& out, const DenseTensor& dx, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, int axis, 
DenseTensor* dy, DenseTensor* dout, diff --git a/paddle/phi/kernels/elementwise_multiply_grad_kernel.h b/paddle/phi/kernels/elementwise_multiply_grad_kernel.h index 517948a50d1b1..9cbd5040666cf 100644 --- a/paddle/phi/kernels/elementwise_multiply_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_multiply_grad_kernel.h @@ -33,8 +33,8 @@ void MultiplyDoubleGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, int axis, DenseTensor* dx, DenseTensor* dy, @@ -45,11 +45,11 @@ void MultiplyTripleGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, const DenseTensor& d_dx, const DenseTensor& d_dy, - paddle::optional d_ddout, + const paddle::optional& d_ddout, int axis, DenseTensor* d_x, DenseTensor* d_y, diff --git a/paddle/phi/kernels/elementwise_subtract_grad_kernel.h b/paddle/phi/kernels/elementwise_subtract_grad_kernel.h index 97df769f4d046..536d859b46a7b 100644 --- a/paddle/phi/kernels/elementwise_subtract_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_subtract_grad_kernel.h @@ -31,8 +31,8 @@ template void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, int axis, DenseTensor* ddout); diff --git a/paddle/phi/kernels/expand_as_kernel.h b/paddle/phi/kernels/expand_as_kernel.h index 971ea32310f3e..6bc6c73e737d7 100644 --- a/paddle/phi/kernels/expand_as_kernel.h +++ b/paddle/phi/kernels/expand_as_kernel.h @@ -21,7 +21,7 @@ namespace phi { template void ExpandAsKernel(const Context& ctx, const DenseTensor& x, - paddle::optional y, + const paddle::optional& y, const std::vector& target_shape, DenseTensor* out); diff --git a/paddle/phi/kernels/funcs/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu index 417c1cd234754..b0e68abc08a57 100644 --- a/paddle/phi/kernels/funcs/pooling.cu +++ b/paddle/phi/kernels/funcs/pooling.cu @@ -170,7 +170,7 @@ template __global__ void KernelPool2DGrad(const int nthreads, const T* __restrict__ input_data, const T* __restrict__ output_data, - const const T* __restrict__ output_grad, + const T* __restrict__ output_grad, const int output_width, const int output_height, const int input_width, diff --git a/paddle/phi/kernels/funcs/segment_pooling.cc b/paddle/phi/kernels/funcs/segment_pooling.cc index fbd744430aa11..e6bd371935622 100644 --- a/paddle/phi/kernels/funcs/segment_pooling.cc +++ b/paddle/phi/kernels/funcs/segment_pooling.cc @@ -90,7 +90,7 @@ class SegmentPoolGradFunctor { const DenseTensor& out_grad, const DenseTensor& segments, DenseTensor* in_grad, - paddle::optional index, + const paddle::optional& index, const std::string pooltype = "SUM") { const IndexT* segment_ids = segments.data(); auto& place = *dev_ctx.eigen_device(); diff --git a/paddle/phi/kernels/funcs/segment_pooling.cu b/paddle/phi/kernels/funcs/segment_pooling.cu index 95606b1526729..687cccb1f64f9 100644 --- a/paddle/phi/kernels/funcs/segment_pooling.cu +++ b/paddle/phi/kernels/funcs/segment_pooling.cu @@ -417,7 +417,7 @@ class SegmentPoolGradFunctor { const DenseTensor& out_grad, const DenseTensor& segments, DenseTensor* in_grad, - paddle::optional summed_ids, + const paddle::optional& summed_ids, const std::string pooltype 
= "SUM") { if (pooltype == "MAX" || pooltype == "MIN") { SegmentPoolCUDAGradFunctor( diff --git a/paddle/phi/kernels/funcs/segment_pooling.h b/paddle/phi/kernels/funcs/segment_pooling.h index b8281061582ea..09da9eb304773 100644 --- a/paddle/phi/kernels/funcs/segment_pooling.h +++ b/paddle/phi/kernels/funcs/segment_pooling.h @@ -41,7 +41,7 @@ class SegmentPoolGradFunctor { const DenseTensor& out_grad, const DenseTensor& segments, DenseTensor* in_grad, - paddle::optional summed_ids, + const paddle::optional& summed_ids, const std::string pooltype = "SUM"); }; diff --git a/paddle/phi/kernels/gpu/adam_kernel.cu b/paddle/phi/kernels/gpu/adam_kernel.cu index 33b6f3a5a1bee..449aaae1a4be4 100644 --- a/paddle/phi/kernels/gpu/adam_kernel.cu +++ b/paddle/phi/kernels/gpu/adam_kernel.cu @@ -135,8 +135,8 @@ void AdamDenseKernel(const Context& dev_ctx, const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/gpu/adamw_kernel.cu b/paddle/phi/kernels/gpu/adamw_kernel.cu index 4873ba9c13d48..0fff142567a5e 100644 --- a/paddle/phi/kernels/gpu/adamw_kernel.cu +++ b/paddle/phi/kernels/gpu/adamw_kernel.cu @@ -146,8 +146,8 @@ void AdamwDenseKernel(const Context& dev_ctx, const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index e808ef644a246..c08fa4eb260d4 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -309,11 +309,11 @@ void BatchNormGradRawKernel(const Context &ctx, const DenseTensor &x, const DenseTensor &scale, const DenseTensor &bias, - paddle::optional mean, - paddle::optional variance, + const paddle::optional &mean, + const paddle::optional &variance, const DenseTensor &saved_mean, const DenseTensor &saved_variance, - paddle::optional reserve_space, + const paddle::optional &reserve_space, const DenseTensor &y_grad, float momentum, float epsilon_f, @@ -867,11 +867,11 @@ void BatchNormGradKernel(const Context &dev_ctx, const DenseTensor &x, const DenseTensor &scale, const DenseTensor &bias, - paddle::optional mean, - paddle::optional variance, + const paddle::optional &mean, + const paddle::optional &variance, const DenseTensor &saved_mean, const DenseTensor &saved_variance, - paddle::optional reserve_space, + const paddle::optional &reserve_space, const DenseTensor &y_grad, float momentum, float epsilon, @@ -910,8 +910,8 @@ template void BatchNormDoubleGradKernel(const Context &ctx, const DenseTensor &x, const DenseTensor &scale, - paddle::optional mean, - paddle::optional variance, + const paddle::optional &mean, + const paddle::optional &variance, const DenseTensor &saved_mean, const DenseTensor &saved_variance, const DenseTensor &y_grad, diff --git a/paddle/phi/kernels/gpu/bincount_kernel.cu b/paddle/phi/kernels/gpu/bincount_kernel.cu index a4ec894790cd3..8e60b31c3706b 100644 --- a/paddle/phi/kernels/gpu/bincount_kernel.cu +++ b/paddle/phi/kernels/gpu/bincount_kernel.cu @@ -49,7 +49,7 @@ __global__ void 
KernelBincount(const InputT* input, template void BincountCUDAInner(const Context& dev_ctx, const DenseTensor& x, - const paddle::optional weights, + const paddle::optional& weights, int minlength, DenseTensor* out) { const DenseTensor* input = &x; @@ -143,7 +143,7 @@ void BincountCUDAInner(const Context& dev_ctx, template void BincountKernel(const Context& dev_ctx, const DenseTensor& x, - const paddle::optional weights, + const paddle::optional& weights, int minlength, DenseTensor* out) { if (x.dtype() == DataType::INT32) { diff --git a/paddle/phi/kernels/gpu/dropout_kernel.cu b/paddle/phi/kernels/gpu/dropout_kernel.cu index bd1683ad0c7d8..fae0e8cb25b5c 100644 --- a/paddle/phi/kernels/gpu/dropout_kernel.cu +++ b/paddle/phi/kernels/gpu/dropout_kernel.cu @@ -23,7 +23,7 @@ namespace phi { template void DropoutRawKernel(const Context& dev_ctx, const DenseTensor& x, - paddle::optional seed_tensor, + const paddle::optional& seed_tensor, float p, bool is_test, const std::string& mode, diff --git a/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu index 8dd4d0184c267..517fbcba158b8 100644 --- a/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu @@ -57,8 +57,8 @@ template void AddDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, int axis, DenseTensor* ddout) { phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); diff --git a/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu index 017616df2782c..45e19b9838405 100644 --- a/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu @@ -47,8 +47,8 @@ template void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, int axis, DenseTensor* ddout) { phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); diff --git a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu index 34bd1d6db77da..9869d5a517bcb 100644 --- a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu @@ -286,8 +286,8 @@ void GraphReindexKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& neighbors, const DenseTensor& count, - paddle::optional hashtable_value, - paddle::optional hashtable_index, + const paddle::optional& hashtable_value, + const paddle::optional& hashtable_index, bool flag_buffer_hashtable, DenseTensor* reindex_src, DenseTensor* reindex_dst, diff --git a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu index af616963b499a..174495dad34b2 100644 --- a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu @@ -356,8 +356,8 @@ void GraphSampleNeighborsKernel( const DenseTensor& row, const DenseTensor& col_ptr, const DenseTensor& x, - paddle::optional eids, - paddle::optional perm_buffer, + const paddle::optional& eids, + const paddle::optional& perm_buffer, int sample_size, bool return_eids, bool flag_perm_buffer, diff --git 
a/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu index 2be0caff79d64..8743b4e8a7408 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu @@ -105,8 +105,8 @@ void GraphSendRecvGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& src_index, const DenseTensor& dst_index, - paddle::optional out, - paddle::optional dst_count, + const paddle::optional& out, + const paddle::optional& dst_count, const DenseTensor& out_grad, const std::string& pool_type, DenseTensor* x_grad) { diff --git a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu index 387127de48dea..b72acc7073383 100644 --- a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu @@ -291,7 +291,7 @@ template void InstanceNormGradKernel(const Context &dev_ctx, const DenseTensor &x, const DenseTensor &d_y, - paddle::optional scale, + const paddle::optional &scale, const DenseTensor &saved_mean, const DenseTensor &saved_variance, float epsilon_f, @@ -516,13 +516,13 @@ void InstanceNormGradKernel(const Context &dev_ctx, template void InstanceNormDoubleGradKernel(const Context &dev_ctx, const DenseTensor &x, - paddle::optional scale, + const paddle::optional &scale, const DenseTensor &saved_mean, const DenseTensor &saved_variance, const DenseTensor &dy, - paddle::optional ddx, - paddle::optional ddscale, - paddle::optional ddbias, + const paddle::optional &ddx, + const paddle::optional &ddscale, + const paddle::optional &ddbias, float epsilon_f, DenseTensor *dx, DenseTensor *dscale, diff --git a/paddle/phi/kernels/gpu/instance_norm_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_kernel.cu index 81d9400750190..b729223689809 100644 --- a/paddle/phi/kernels/gpu/instance_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/instance_norm_kernel.cu @@ -26,8 +26,8 @@ namespace phi { template void InstanceNormKernel(const Context &dev_ctx, const DenseTensor &x, - paddle::optional scale, - paddle::optional bias, + const paddle::optional &scale, + const paddle::optional &bias, float epsilon_f, DenseTensor *y, DenseTensor *saved_mean, diff --git a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu index 73334d9c38aa3..cd0f4e1493e5c 100644 --- a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu @@ -747,9 +747,9 @@ template static void Interpolate1DCUDABwd( const Context& dev_ctx, const DenseTensor& input, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& output_grad, const std::string& data_layout_str, int out_w, @@ -861,9 +861,9 @@ template static void Interpolate2DCUDABwd( const Context& dev_ctx, const DenseTensor& input, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& output_grad, const std::string& data_layout_str, int out_h, @@ -1124,9 +1124,9 @@ template static void Interpolate3DCUDABwd( const Context& dev_ctx, const DenseTensor& input, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + 
const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& output_grad, const std::string& data_layout_str, int out_d, @@ -1334,9 +1334,9 @@ template void InterpolateGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& output_grad, const std::string& data_layout, int out_d, @@ -1401,9 +1401,9 @@ template void BilinearInterpGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, @@ -1435,9 +1435,9 @@ template void NearestInterpGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, @@ -1469,9 +1469,9 @@ template void TrilinearInterpGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, @@ -1503,9 +1503,9 @@ template void LinearInterpGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, @@ -1537,9 +1537,9 @@ template void BicubicInterpGradKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, diff --git a/paddle/phi/kernels/gpu/interpolate_kernel.cu b/paddle/phi/kernels/gpu/interpolate_kernel.cu index 6e609aa11674e..3bd59c807103c 100644 --- a/paddle/phi/kernels/gpu/interpolate_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_kernel.cu @@ -627,9 +627,9 @@ template static void Interpolate1DCUDAFwd( const Context& dev_ctx, const DenseTensor& input, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout_str, int out_w, const std::vector& scale, @@ -742,9 +742,9 @@ template static void Interpolate2DCUDAFwd( const Context& dev_ctx, const DenseTensor& input, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout_str, int out_h, 
int out_w, @@ -997,9 +997,9 @@ template static void Interpolate3DCUDAFwd( const Context& dev_ctx, const DenseTensor& input, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout_str, int out_d, int out_h, @@ -1221,9 +1221,9 @@ template void InterpolateKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -1283,9 +1283,9 @@ template void BilinearInterpKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -1315,9 +1315,9 @@ template void NearestInterpKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -1347,9 +1347,9 @@ template void TrilinearInterpKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -1379,9 +1379,9 @@ template void LinearInterpKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -1411,9 +1411,9 @@ template void BicubicInterpKernel( const Context& dev_ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, diff --git a/paddle/phi/kernels/gpu/label_smooth_kernel.cu b/paddle/phi/kernels/gpu/label_smooth_kernel.cu index 50f7548450ce7..bf7ac939eb389 100644 --- a/paddle/phi/kernels/gpu/label_smooth_kernel.cu +++ b/paddle/phi/kernels/gpu/label_smooth_kernel.cu @@ -53,7 +53,7 @@ __global__ void LabelSmoothRunDistKernel(const int N, template void LabelSmoothKernel(const Context& ctx, const DenseTensor& label, - paddle::optional prior_dist, + const paddle::optional& prior_dist, float epsilon, DenseTensor* out) { auto label_dim = label.dims()[label.dims().size() - 1]; diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu index 146d307a59380..961937441e1cf 100644 --- a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu @@ -24,8 +24,8 @@ namespace phi { template void LayerNormGradKernel(const Context &dev_ctx, const DenseTensor &x, - 
paddle::optional scale_opt, - paddle::optional bias_opt, + const paddle::optional &scale_opt, + const paddle::optional &bias_opt, const DenseTensor &mean, const DenseTensor &variance, const DenseTensor &out_grad, diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index d87b7c2193811..72127042c16e0 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -55,8 +55,8 @@ template class LayerNormDirectCUDAFunctor; template void LayerNormKernel(const Context &dev_ctx, const DenseTensor &x, - paddle::optional scale_opt, - paddle::optional bias_opt, + const paddle::optional &scale_opt, + const paddle::optional &bias_opt, float epsilon, int begin_norm_axis, bool is_test, diff --git a/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu index 43106ec1d863f..407f33c40089c 100644 --- a/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/nll_loss_grad_kernel.cu @@ -23,7 +23,7 @@ template void NllLossGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& labels, - paddle::optional weight, + const paddle::optional& weight, const DenseTensor& total_weight, const DenseTensor& dout, int64_t ignore_index, diff --git a/paddle/phi/kernels/gpu/nll_loss_kernel.cu b/paddle/phi/kernels/gpu/nll_loss_kernel.cu index 6b0e1fef7ba9a..99a8b10b11b5c 100644 --- a/paddle/phi/kernels/gpu/nll_loss_kernel.cu +++ b/paddle/phi/kernels/gpu/nll_loss_kernel.cu @@ -24,7 +24,7 @@ template void NllLossRawKernel(const Context& dev_ctx, const DenseTensor& input, const DenseTensor& label, - paddle::optional weight, + const paddle::optional& weight, int64_t ignore_index, const std::string& reduction, DenseTensor* out, diff --git a/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu index 6745653eba7d1..45e4730e173fe 100644 --- a/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu @@ -107,7 +107,7 @@ template void PsroiPoolGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& rois, - paddle::optional rois_num, + const paddle::optional& rois_num, const DenseTensor& dout, int pooled_height, int pooled_width, diff --git a/paddle/phi/kernels/gpu/psroi_pool_kernel.cu b/paddle/phi/kernels/gpu/psroi_pool_kernel.cu index 8f9be001ba763..f296d0d20743e 100644 --- a/paddle/phi/kernels/gpu/psroi_pool_kernel.cu +++ b/paddle/phi/kernels/gpu/psroi_pool_kernel.cu @@ -107,7 +107,7 @@ template void PsroiPoolKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& rois, - paddle::optional rois_num, + const paddle::optional& rois_num, int pooled_height, int pooled_width, int output_channels, diff --git a/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc index 76407475281da..98c2f618e7868 100644 --- a/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc @@ -80,7 +80,7 @@ void RnnGradKernel(const Context &dev_ctx, const DenseTensor &x, const std::vector &pre_state, const std::vector &weight_list, - paddle::optional sequence_length, + const paddle::optional &sequence_length, const DenseTensor &out, const DenseTensor &dropout_state, const DenseTensor &reserve, diff --git a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc index f2ffe3c9d4fba..5a19d5b89f0e3 100644 --- a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc +++ 
b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc @@ -134,7 +134,7 @@ void RnnKernel(const Context &dev_ctx, const DenseTensor &x, const std::vector &pre_state, const std::vector &weight_list, - paddle::optional sequence_length, + const paddle::optional &sequence_length, float dropout_prob, bool is_bidirec, int input_size, diff --git a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu index cf076128b6939..9f9ea6753402b 100644 --- a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu @@ -172,7 +172,7 @@ template void RoiAlignGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, const DenseTensor& out_grad, int pooled_height, int pooled_width, diff --git a/paddle/phi/kernels/gpu/roi_align_kernel.cu b/paddle/phi/kernels/gpu/roi_align_kernel.cu index cb3375dee95a5..fc24179ed3d26 100644 --- a/paddle/phi/kernels/gpu/roi_align_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_align_kernel.cu @@ -139,7 +139,7 @@ template void RoiAlignKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, int pooled_height, int pooled_width, float spatial_scale, diff --git a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu index d093a71d23f4e..1a5af93c562bf 100644 --- a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu @@ -75,7 +75,7 @@ template void RoiPoolGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, const DenseTensor& arg_max, const DenseTensor& out_grad, int pooled_height, diff --git a/paddle/phi/kernels/gpu/roi_pool_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_kernel.cu index ab33e2cf64751..32ea6223c9c2a 100644 --- a/paddle/phi/kernels/gpu/roi_pool_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_pool_kernel.cu @@ -104,7 +104,7 @@ template void RoiPoolKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, int pooled_height, int pooled_width, float spatial_scale, diff --git a/paddle/phi/kernels/gpu/sgd_kernel.cu b/paddle/phi/kernels/gpu/sgd_kernel.cu index 7dd5a03383fd2..d71112a2f2884 100644 --- a/paddle/phi/kernels/gpu/sgd_kernel.cu +++ b/paddle/phi/kernels/gpu/sgd_kernel.cu @@ -69,7 +69,7 @@ void SGDDenseKernel(const Context& dev_ctx, const DenseTensor& param, const DenseTensor& learning_rate, const DenseTensor& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision, DenseTensor* param_out, DenseTensor* master_param_out) { @@ -106,7 +106,7 @@ void SGDDenseParamSparseGradKernel( const DenseTensor& param, const DenseTensor& learning_rate, const SelectedRows& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision, DenseTensor* param_out, DenseTensor* master_param_out) { @@ -175,7 +175,7 @@ void SGDSparseParamSparseGradKernel( const SelectedRows& param, const DenseTensor& learning_rate, const SelectedRows& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision, SelectedRows* param_out, SelectedRows* master_param_out) { diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu 
index 58c7ea69869b3..b396e8fa6b0eb 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu @@ -47,8 +47,8 @@ void ConvCudnnGradGradKernel( const DenseTensor& input, const DenseTensor& filter, const DenseTensor& out_grad, - paddle::optional input_grad_grad, - paddle::optional filter_grad_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, @@ -670,8 +670,8 @@ void ConvCudnnGradGradKernel( template void DepthwiseConvCudnnGradGradKernel( const Context& ctx, - paddle::optional input_grad_grad, - paddle::optional filter_grad_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const DenseTensor& out_grad, const DenseTensor& input, const DenseTensor& filter, @@ -711,8 +711,8 @@ void DepthwiseConvCudnnGradGradKernel( template void Conv3DCudnnGradGradKernel( const Context& ctx, - paddle::optional input_grad_grad, - paddle::optional filter_grad_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const DenseTensor& out_grad, const DenseTensor& input, const DenseTensor& filter, diff --git a/paddle/phi/kernels/graph_reindex_kernel.h b/paddle/phi/kernels/graph_reindex_kernel.h index 68f1ebc6f5cc4..12a742006ee73 100644 --- a/paddle/phi/kernels/graph_reindex_kernel.h +++ b/paddle/phi/kernels/graph_reindex_kernel.h @@ -23,8 +23,8 @@ void GraphReindexKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& neighbors, const DenseTensor& count, - paddle::optional hashtable_value, - paddle::optional hashtable_index, + const paddle::optional& hashtable_value, + const paddle::optional& hashtable_index, bool flag_buffer_hashtable, DenseTensor* reindex_src, DenseTensor* reindex_dst, diff --git a/paddle/phi/kernels/graph_sample_neighbors_kernel.h b/paddle/phi/kernels/graph_sample_neighbors_kernel.h index f7d205bd08ad0..065c7f141225d 100644 --- a/paddle/phi/kernels/graph_sample_neighbors_kernel.h +++ b/paddle/phi/kernels/graph_sample_neighbors_kernel.h @@ -24,8 +24,8 @@ void GraphSampleNeighborsKernel( const DenseTensor& row, const DenseTensor& col_ptr, const DenseTensor& x, - paddle::optional eids, - paddle::optional perm_buffer, + const paddle::optional& eids, + const paddle::optional& perm_buffer, int sample_size, bool return_eids, bool flag_perm_buffer, diff --git a/paddle/phi/kernels/graph_send_recv_grad_kernel.h b/paddle/phi/kernels/graph_send_recv_grad_kernel.h index c0b1a34d09c00..fbb6db358a476 100644 --- a/paddle/phi/kernels/graph_send_recv_grad_kernel.h +++ b/paddle/phi/kernels/graph_send_recv_grad_kernel.h @@ -25,8 +25,8 @@ void GraphSendRecvGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& src_index, const DenseTensor& dst_index, - paddle::optional out, - paddle::optional dst_count, + const paddle::optional& out, + const paddle::optional& dst_count, const DenseTensor& out_grad, const std::string& pool_type, DenseTensor* x_grad); diff --git a/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h b/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h index 7922a767db23c..c0da8faadd592 100644 --- a/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h +++ b/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h @@ -23,9 +23,9 @@ void HierarchicalSigmoidGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& w, const DenseTensor& label, - paddle::optional path, - 
paddle::optional code, - paddle::optional bias, + const paddle::optional& path, + const paddle::optional& code, + const paddle::optional& bias, const DenseTensor& pre_out, const DenseTensor& out_grad, int num_classes, diff --git a/paddle/phi/kernels/hierarchical_sigmoid_kernel.h b/paddle/phi/kernels/hierarchical_sigmoid_kernel.h index 619b022904b17..e32306b645a6f 100644 --- a/paddle/phi/kernels/hierarchical_sigmoid_kernel.h +++ b/paddle/phi/kernels/hierarchical_sigmoid_kernel.h @@ -23,9 +23,9 @@ void HierarchicalSigmoidKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& w, const DenseTensor& label, - paddle::optional path, - paddle::optional code, - paddle::optional bias, + const paddle::optional& path, + const paddle::optional& code, + const paddle::optional& bias, int num_classes, bool remote_prefetch, int trainer_id, diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h index 04391d2538c89..80dba29e76cbd 100644 --- a/paddle/phi/kernels/impl/activation_grad_impl.h +++ b/paddle/phi/kernels/impl/activation_grad_impl.h @@ -265,7 +265,7 @@ void SigmoidTripleGradKernel(const Context& dev_ctx, const DenseTensor& dout, const DenseTensor& ddx, const DenseTensor& d_dout_new, - paddle::optional d_ddout, + const paddle::optional& d_ddout, DenseTensor* d_out_new, DenseTensor* d_dout, DenseTensor* d_ddx) { diff --git a/paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h b/paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h index 3f30a4b958ebe..4a2e41532e9ff 100644 --- a/paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h +++ b/paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h @@ -26,7 +26,7 @@ void BilinearTensorProductKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& y, const DenseTensor& weight, - paddle::optional bias, + const paddle::optional& bias, DenseTensor* out) { ctx.template Alloc(out); diff --git a/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h index 64306bc827e4b..512b1529f9191 100644 --- a/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/conv_grad_grad_kernel_impl.h @@ -29,8 +29,8 @@ void ConvGradGradKernel(const Context& dev_ctx, const DenseTensor& input, const DenseTensor& filter, const DenseTensor& out_grad, - paddle::optional input_grad_grad, - paddle::optional filter_grad_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const std::vector& strides_t, const std::vector& paddings_t, const std::string& padding_algorithm, diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h index 8d8e66a02f5fb..744c48b2bfbd6 100644 --- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h @@ -163,7 +163,7 @@ void DeformableConvGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& offset, const DenseTensor& filter, - paddle::optional mask, + const paddle::optional& mask, const DenseTensor& out_grad, const std::vector& strides, const std::vector& paddings, diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h index 6c0457024ddc4..f864c2e5f0ed0 100644 --- a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h +++ b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h @@ -28,7 +28,7 @@ void 
DeformableConvKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& offset, const DenseTensor& filter, - paddle::optional mask, + const paddle::optional& mask, const std::vector& strides, const std::vector& paddings, const std::vector& dilations, diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 3c06b238d145c..73935640e349b 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -53,8 +53,8 @@ void AddGradImpl(const Context& dev_ctx, template void AddDoubleGradImpl(const Context& dev_ctx, const DenseTensor& y, - const paddle::optional& ddx, - const paddle::optional& ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, const DenseTensor& dout, int axis, DenseTensor* ddout) { @@ -87,8 +87,8 @@ void AddDoubleGradImpl(const Context& dev_ctx, template void SubtractDoubleGradImpl(const Context& dev_ctx, const DenseTensor& y, - const paddle::optional& ddx, - const paddle::optional& ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, const DenseTensor& dout, int axis, DenseTensor* ddout) { @@ -160,8 +160,8 @@ void DivideDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, const DenseTensor& out, const DenseTensor& dx, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, int axis, DenseTensor* dy, DenseTensor* dout, @@ -416,8 +416,8 @@ void MultiplyDoubleGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, int axis, DenseTensor* dx, DenseTensor* dy, @@ -535,11 +535,11 @@ void MultiplyTripleGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, const DenseTensor& d_dx, const DenseTensor& d_dy, - paddle::optional d_ddout, + const paddle::optional& d_ddout, int axis, DenseTensor* d_x, DenseTensor* d_y, diff --git a/paddle/phi/kernels/impl/expand_as_kernel_impl.h b/paddle/phi/kernels/impl/expand_as_kernel_impl.h index e5138e4e12c05..a5661aaa2ac16 100644 --- a/paddle/phi/kernels/impl/expand_as_kernel_impl.h +++ b/paddle/phi/kernels/impl/expand_as_kernel_impl.h @@ -93,7 +93,7 @@ void ExpandAs(const Context& context, template void ExpandAsKernel(const Context& ctx, const DenseTensor& x, - paddle::optional y, + const paddle::optional& y, const std::vector& target_shape, DenseTensor* out) { auto rank = x.dims().size(); diff --git a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h index 25a9db868d357..5641e7a8274f3 100644 --- a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h @@ -473,8 +473,8 @@ void MatmulDoubleGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, bool transpose_x, bool transpose_y, DenseTensor* dx, @@ -854,9 +854,9 @@ void MatmulTripleGradKernel(const Context& dev_ctx, const DenseTensor& dout, const DenseTensor& ddx, const DenseTensor& ddy, - paddle::optional d_dx, - paddle::optional d_dy, - paddle::optional d_ddout, + const paddle::optional& 
d_dx, + const paddle::optional& d_dy, + const paddle::optional& d_ddout, bool transpose_x, bool transpose_y, DenseTensor* out_d_x, @@ -1790,8 +1790,8 @@ void MatmulWithFlattenDoubleGradKernel( const DenseTensor& x, const DenseTensor& y, const DenseTensor& out_grad, - paddle::optional x_grad_grad, - paddle::optional y_grad_grad, + const paddle::optional& x_grad_grad, + const paddle::optional& y_grad_grad, int x_num_col_dims, int y_num_col_dims, DenseTensor* x_grad, diff --git a/paddle/phi/kernels/impl/momentum_kernel_impl.h b/paddle/phi/kernels/impl/momentum_kernel_impl.h index 3aca225ad403b..825a3b9d56990 100644 --- a/paddle/phi/kernels/impl/momentum_kernel_impl.h +++ b/paddle/phi/kernels/impl/momentum_kernel_impl.h @@ -408,7 +408,7 @@ void MomentumDenseImpl(const Context& ctx, const DenseTensor& grad, const DenseTensor& velocity, const DenseTensor& learning_rate, - paddle::optional master_param_opt, + const paddle::optional& master_param_opt, float mu_t, bool use_nesterov, const std::string& regularization_method, @@ -500,7 +500,7 @@ void MomentumSparseImpl(const Context& ctx, const SelectedRows& grad, const DenseTensor& velocity, const DenseTensor& learning_rate, - paddle::optional master_param_opt, + const paddle::optional& master_param_opt, float mu_t, bool use_nesterov, const std::string& regularization_method, @@ -602,7 +602,7 @@ void MomentumDenseKernel(const Context& dev_ctx, const DenseTensor& grad, const DenseTensor& velocity, const DenseTensor& learning_rate, - paddle::optional master_param, + const paddle::optional& master_param, float mu, bool use_nesterov, const std::string& regularization_method, @@ -654,7 +654,7 @@ void MomentumSparseKernel(const Context& dev_ctx, const SelectedRows& grad, const DenseTensor& velocity, const DenseTensor& learning_rate, - paddle::optional master_param, + const paddle::optional& master_param, float mu, bool use_nesterov, const std::string& regularization_method, diff --git a/paddle/phi/kernels/impl/rmsprop_kernel_impl.h b/paddle/phi/kernels/impl/rmsprop_kernel_impl.h index 64b12837074dd..1954c5f20db3e 100644 --- a/paddle/phi/kernels/impl/rmsprop_kernel_impl.h +++ b/paddle/phi/kernels/impl/rmsprop_kernel_impl.h @@ -146,7 +146,7 @@ void RmspropDenseKernel(const Context &ctx, const DenseTensor &grad, const DenseTensor &moment, const DenseTensor &learning_rate, - paddle::optional mean_grad_opt, + const paddle::optional &mean_grad_opt, float epsilon_t, float decay_t, float momentum_t, @@ -196,11 +196,19 @@ void RmspropDenseKernel(const Context &ctx, if (centered) { auto mg_tensor = mean_grad_opt.get_ptr(); auto mg = EigenVector::Flatten(*mg_tensor); - PADDLE_ENFORCE_EQ( - mg_tensor, - mean_grad_out, - phi::errors::InvalidArgument( - "MeanGrad and MeanGradOut must be the same Tensor")); + if (mg_tensor) { + PADDLE_ENFORCE_EQ( + mg_tensor->Holder(), + mean_grad_out->Holder(), + phi::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); + } else { + PADDLE_ENFORCE_EQ( + mg_tensor, + mean_grad_out, + phi::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); + } auto mg_out = EigenVector::Flatten(*mean_grad_out); mg_out.device(place) = rho * mg + (1 - rho) * g; @@ -217,12 +225,20 @@ void RmspropDenseKernel(const Context &ctx, funcs::ForRange for_range(ctx, limit); if (centered) { auto mg_tensor = mean_grad_opt.get_ptr(); + if (mg_tensor) { + PADDLE_ENFORCE_EQ( + mg_tensor->Holder(), + mean_grad_out->Holder(), + phi::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); + } 
else { + PADDLE_ENFORCE_EQ( + mg_tensor, + mean_grad_out, + phi::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); + } - PADDLE_ENFORCE_EQ( - mg_tensor, - mean_grad_out, - phi::errors::InvalidArgument( - "MeanGrad and MeanGradOut must be the same Tensor")); for_range(CenteredRmspropFunctor>( ctx.template Alloc(param_out), ctx.template Alloc(mean_square_out), @@ -254,7 +270,7 @@ void RmspropSparseKernel(const Context &ctx, const SelectedRows &grad, const DenseTensor &moment, const DenseTensor &learning_rate, - paddle::optional mean_grad_opt, + const paddle::optional &mean_grad_opt, float epsilon_t, float decay_t, float momentum_t, @@ -305,11 +321,20 @@ void RmspropSparseKernel(const Context &ctx, if (centered) { auto mg_tensor = mean_grad_opt.get_ptr(); + if (mg_tensor) { + PADDLE_ENFORCE_EQ( + mg_tensor->Holder(), + mean_grad_out->Holder(), + phi::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); + } else { + PADDLE_ENFORCE_EQ( + mg_tensor, + mean_grad_out, + phi::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); + } - PADDLE_ENFORCE_EQ(mg_tensor, - mean_grad_out, - phi::errors::InvalidArgument( - "MeanGrad and MeanGradOut must be the same Tensor")); for_range(CenteredRmspropFunctor>( ctx.template Alloc(param_out), ctx.template Alloc(mean_square_out), diff --git a/paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h b/paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h index 4ba1a0c6b6c0f..bd0ba26b99a43 100644 --- a/paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/segment_pool_grad_kernel_impl.h @@ -27,7 +27,7 @@ void SegmentPoolGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& segment_ids, const DenseTensor& out, - paddle::optional summed_ids, + const paddle::optional& summed_ids, const DenseTensor& out_grad, const std::string& pooltype, DenseTensor* x_grad) { diff --git a/paddle/phi/kernels/impl/warpctc_grad_kernel_impl.h b/paddle/phi/kernels/impl/warpctc_grad_kernel_impl.h index b788c966a1af1..b07628c981476 100644 --- a/paddle/phi/kernels/impl/warpctc_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/warpctc_grad_kernel_impl.h @@ -32,7 +32,7 @@ void WarpctcGradKernel(const Context& dev_ctx, const DenseTensor& warpctc_grad, const DenseTensor& logits, const DenseTensor& loss_grad, - const paddle::optional logits_length, + const paddle::optional& logits_length, int blank, bool norm_by_times, DenseTensor* logits_grad) { diff --git a/paddle/phi/kernels/impl/warpctc_kernel_impl.h b/paddle/phi/kernels/impl/warpctc_kernel_impl.h index ef6be7a9dfa88..6c792507c6f77 100644 --- a/paddle/phi/kernels/impl/warpctc_kernel_impl.h +++ b/paddle/phi/kernels/impl/warpctc_kernel_impl.h @@ -229,8 +229,8 @@ template void WarpctcKernel(const Context& dev_ctx, const DenseTensor& logits, const DenseTensor& label, - const paddle::optional logits_length, - const paddle::optional labels_length, + const paddle::optional& logits_length, + const paddle::optional& labels_length, int blank, bool norm_by_times, DenseTensor* warpctc_grad, diff --git a/paddle/phi/kernels/instance_norm_grad_kernel.h b/paddle/phi/kernels/instance_norm_grad_kernel.h index 7924c767ab61e..be7e4ce3e3488 100644 --- a/paddle/phi/kernels/instance_norm_grad_kernel.h +++ b/paddle/phi/kernels/instance_norm_grad_kernel.h @@ -22,7 +22,7 @@ template void InstanceNormGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y_grad, - paddle::optional scale, + const 
paddle::optional& scale, const DenseTensor& saved_mean, const DenseTensor& saved_variance, float epsilon, @@ -33,13 +33,13 @@ void InstanceNormGradKernel(const Context& dev_ctx, template void InstanceNormDoubleGradKernel(const Context& dev_ctx, const DenseTensor& x, - paddle::optional scale, + const paddle::optional& scale, const DenseTensor& saved_mean, const DenseTensor& saved_variance, const DenseTensor& dy, - paddle::optional ddx, - paddle::optional ddscale, - paddle::optional ddbias, + const paddle::optional& ddx, + const paddle::optional& ddscale, + const paddle::optional& ddbias, float epsilon, DenseTensor* dx, DenseTensor* dscale, diff --git a/paddle/phi/kernels/instance_norm_kernel.h b/paddle/phi/kernels/instance_norm_kernel.h index 8c50025a73ce0..f8f1bbe1287a2 100644 --- a/paddle/phi/kernels/instance_norm_kernel.h +++ b/paddle/phi/kernels/instance_norm_kernel.h @@ -21,8 +21,8 @@ namespace phi { template void InstanceNormKernel(const Context& dev_ctx, const DenseTensor& x, - paddle::optional scale, - paddle::optional bias, + const paddle::optional& scale, + const paddle::optional& bias, float epsilon, DenseTensor* y, DenseTensor* saved_mean, diff --git a/paddle/phi/kernels/interpolate_grad_kernel.h b/paddle/phi/kernels/interpolate_grad_kernel.h index 59d2dddd87007..b8eefad61a768 100644 --- a/paddle/phi/kernels/interpolate_grad_kernel.h +++ b/paddle/phi/kernels/interpolate_grad_kernel.h @@ -22,9 +22,9 @@ template void BilinearInterpGradKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const DenseTensor& out_grad, const std::string& data_layout, int out_d, diff --git a/paddle/phi/kernels/interpolate_kernel.h b/paddle/phi/kernels/interpolate_kernel.h index 4623657f5a594..c531461c12e29 100644 --- a/paddle/phi/kernels/interpolate_kernel.h +++ b/paddle/phi/kernels/interpolate_kernel.h @@ -22,9 +22,9 @@ template void BilinearInterpKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -39,10 +39,9 @@ template void NearestInterpKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -57,10 +56,9 @@ template void TrilinearInterpKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -75,10 +73,9 @@ template void LinearInterpKernel( const Context& ctx, const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, @@ -93,10 +90,9 @@ template void BicubicInterpKernel( const Context& ctx, 
const DenseTensor& x, - paddle::optional out_size, - paddle::optional> size_tensor, - - paddle::optional scale_tensor, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, const std::string& data_layout, int out_d, int out_h, diff --git a/paddle/phi/kernels/label_smooth_kernel.h b/paddle/phi/kernels/label_smooth_kernel.h index b7e1f2708894c..2db35e1bff346 100644 --- a/paddle/phi/kernels/label_smooth_kernel.h +++ b/paddle/phi/kernels/label_smooth_kernel.h @@ -23,7 +23,7 @@ namespace phi { template void LabelSmoothKernel(const Context& ctx, const DenseTensor& label, - paddle::optional prior_dist, + const paddle::optional& prior_dist, float epsilon, DenseTensor* out); diff --git a/paddle/phi/kernels/layer_norm_grad_kernel.h b/paddle/phi/kernels/layer_norm_grad_kernel.h index 65f19a11b94d6..7d7cd13109be1 100644 --- a/paddle/phi/kernels/layer_norm_grad_kernel.h +++ b/paddle/phi/kernels/layer_norm_grad_kernel.h @@ -21,8 +21,8 @@ namespace phi { template void LayerNormGradKernel(const Context& ctx, const DenseTensor& x, - paddle::optional scale, - paddle::optional bias, + const paddle::optional& scale, + const paddle::optional& bias, const DenseTensor& mean, const DenseTensor& variance, const DenseTensor& out_grad, diff --git a/paddle/phi/kernels/layer_norm_kernel.h b/paddle/phi/kernels/layer_norm_kernel.h index c9679420bda5c..26c04b61af96b 100644 --- a/paddle/phi/kernels/layer_norm_kernel.h +++ b/paddle/phi/kernels/layer_norm_kernel.h @@ -22,8 +22,8 @@ namespace phi { template void LayerNormKernel(const Context& ctx, const DenseTensor& x, - paddle::optional scale, - paddle::optional bias, + const paddle::optional& scale, + const paddle::optional& bias, float epsilon, int begin_norm_axis, bool is_test, diff --git a/paddle/phi/kernels/matmul_grad_kernel.h b/paddle/phi/kernels/matmul_grad_kernel.h index 41a835db46f71..47c6acdcb3923 100644 --- a/paddle/phi/kernels/matmul_grad_kernel.h +++ b/paddle/phi/kernels/matmul_grad_kernel.h @@ -34,8 +34,8 @@ void MatmulDoubleGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, + const paddle::optional& ddx, + const paddle::optional& ddy, bool transpose_x, bool transpose_y, DenseTensor* dx, @@ -49,9 +49,9 @@ void MatmulTripleGradKernel(const Context& dev_ctx, const DenseTensor& dout, const DenseTensor& ddx, const DenseTensor& ddy, - paddle::optional d_dx, - paddle::optional d_dy, - paddle::optional d_ddout, + const paddle::optional& d_dx, + const paddle::optional& d_dy, + const paddle::optional& d_ddout, bool transpose_x, bool transpose_y, DenseTensor* out_d_x, @@ -76,8 +76,8 @@ void MatmulWithFlattenDoubleGradKernel( const DenseTensor& x, const DenseTensor& y, const DenseTensor& out_grad, - paddle::optional x_grad_grad, - paddle::optional y_grad_grad, + const paddle::optional& x_grad_grad, + const paddle::optional& y_grad_grad, int x_num_col_dims, int y_num_col_dims, DenseTensor* x_grad, diff --git a/paddle/phi/kernels/momentum_kernel.h b/paddle/phi/kernels/momentum_kernel.h index b4ba449aaf3a5..172b345af163c 100644 --- a/paddle/phi/kernels/momentum_kernel.h +++ b/paddle/phi/kernels/momentum_kernel.h @@ -25,7 +25,7 @@ void MomentumDenseKernel(const Context& dev_ctx, const DenseTensor& grad, const DenseTensor& velocity, const DenseTensor& learning_rate, - paddle::optional master_param, + const paddle::optional& master_param, float mu, bool use_nesterov, const std::string& regularization_method, @@ 
-42,7 +42,7 @@ void MomentumSparseKernel(const Context& dev_ctx, const SelectedRows& grad, const DenseTensor& velocity, const DenseTensor& learning_rate, - paddle::optional master_param, + const paddle::optional& master_param, float mu, bool use_nesterov, const std::string& regularization_method, diff --git a/paddle/phi/kernels/nll_loss_grad_kernel.h b/paddle/phi/kernels/nll_loss_grad_kernel.h index c06f0726899ee..b682edc24df0e 100644 --- a/paddle/phi/kernels/nll_loss_grad_kernel.h +++ b/paddle/phi/kernels/nll_loss_grad_kernel.h @@ -22,7 +22,7 @@ template void NllLossGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& label, - paddle::optional weight, + const paddle::optional& weight, const DenseTensor& total_weight, const DenseTensor& d_out, int64_t ignore_index, diff --git a/paddle/phi/kernels/nll_loss_kernel.cc b/paddle/phi/kernels/nll_loss_kernel.cc index b271f0f4d06a0..cf6d4d01410b9 100644 --- a/paddle/phi/kernels/nll_loss_kernel.cc +++ b/paddle/phi/kernels/nll_loss_kernel.cc @@ -19,7 +19,7 @@ template void NllLossKernel(const Context& dev_ctx, const DenseTensor& input, const DenseTensor& label, - paddle::optional weight, + const paddle::optional& weight, int64_t ignore_index, const std::string& reduction, DenseTensor* out) { diff --git a/paddle/phi/kernels/nll_loss_kernel.h b/paddle/phi/kernels/nll_loss_kernel.h index 90083e1d6840d..cffaa31486025 100644 --- a/paddle/phi/kernels/nll_loss_kernel.h +++ b/paddle/phi/kernels/nll_loss_kernel.h @@ -24,7 +24,7 @@ template void NllLossRawKernel(const Context& dev_ctx, const DenseTensor& input, const DenseTensor& label, - paddle::optional weight, + const paddle::optional& weight, int64_t ignore_index, const std::string& reduction, DenseTensor* out, diff --git a/paddle/phi/kernels/psroi_pool_grad_kernel.h b/paddle/phi/kernels/psroi_pool_grad_kernel.h index 87163eb8e079f..8dcf81194e269 100644 --- a/paddle/phi/kernels/psroi_pool_grad_kernel.h +++ b/paddle/phi/kernels/psroi_pool_grad_kernel.h @@ -23,7 +23,7 @@ template void PsroiPoolGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& rois, - paddle::optional rois_num, + const paddle::optional& rois_num, const DenseTensor& dout, int pooled_height, int pooled_width, diff --git a/paddle/phi/kernels/psroi_pool_kernel.h b/paddle/phi/kernels/psroi_pool_kernel.h index 341037af2caec..5838fa895119d 100644 --- a/paddle/phi/kernels/psroi_pool_kernel.h +++ b/paddle/phi/kernels/psroi_pool_kernel.h @@ -23,7 +23,7 @@ template void PsroiPoolKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& rois, - paddle::optional rois_num, + const paddle::optional& rois_num, int pooled_height, int pooled_width, int output_channels, diff --git a/paddle/phi/kernels/rmsprop_kernel.h b/paddle/phi/kernels/rmsprop_kernel.h index 4c3c9aa822115..fba2095cc8bce 100644 --- a/paddle/phi/kernels/rmsprop_kernel.h +++ b/paddle/phi/kernels/rmsprop_kernel.h @@ -26,7 +26,7 @@ void RmspropDenseKernel(const Context& dev_ctx, const DenseTensor& grad, const DenseTensor& moment, const DenseTensor& learning_rate, - paddle::optional mean_grad, + const paddle::optional& mean_grad, float epsilon, float decay, float momentum, @@ -43,7 +43,7 @@ void RmspropSparseKernel(const Context& dev_ctx, const SelectedRows& grad, const DenseTensor& moment, const DenseTensor& learning_rate, - paddle::optional mean_grad, + const paddle::optional& mean_grad, float epsilon, float decay, float momentum, diff --git a/paddle/phi/kernels/rnn_grad_kernel.h b/paddle/phi/kernels/rnn_grad_kernel.h index 
e5b1100cf7203..024ed287bb13f 100644 --- a/paddle/phi/kernels/rnn_grad_kernel.h +++ b/paddle/phi/kernels/rnn_grad_kernel.h @@ -24,7 +24,7 @@ void RnnGradKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& pre_state, const std::vector& weight_list, - paddle::optional sequence_length, + const paddle::optional& sequence_length, const DenseTensor& out, const DenseTensor& dropout_state, const DenseTensor& reserve, diff --git a/paddle/phi/kernels/rnn_kernel.h b/paddle/phi/kernels/rnn_kernel.h index f1534aa598844..61dfb6f56d798 100644 --- a/paddle/phi/kernels/rnn_kernel.h +++ b/paddle/phi/kernels/rnn_kernel.h @@ -24,7 +24,7 @@ void RnnKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& pre_state, const std::vector& weight_list, - paddle::optional sequence_length, + const paddle::optional& sequence_length, float dropout_prob, bool is_bidirec, int input_size, diff --git a/paddle/phi/kernels/roi_align_grad_kernel.h b/paddle/phi/kernels/roi_align_grad_kernel.h index eea1fa03886a4..a7c2ed3beb53a 100644 --- a/paddle/phi/kernels/roi_align_grad_kernel.h +++ b/paddle/phi/kernels/roi_align_grad_kernel.h @@ -23,7 +23,7 @@ template void RoiAlignGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, const DenseTensor& out_grad, int pooled_height, int pooled_width, diff --git a/paddle/phi/kernels/roi_align_kernel.h b/paddle/phi/kernels/roi_align_kernel.h index 9734da53b7f45..fa3161e3238df 100644 --- a/paddle/phi/kernels/roi_align_kernel.h +++ b/paddle/phi/kernels/roi_align_kernel.h @@ -23,7 +23,7 @@ template void RoiAlignKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, int pooled_height, int pooled_width, float spatial_scale, diff --git a/paddle/phi/kernels/roi_pool_grad_kernel.h b/paddle/phi/kernels/roi_pool_grad_kernel.h index d7f1c378f75c3..f18bd1d65e644 100644 --- a/paddle/phi/kernels/roi_pool_grad_kernel.h +++ b/paddle/phi/kernels/roi_pool_grad_kernel.h @@ -23,7 +23,7 @@ template void RoiPooGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, const DenseTensor& arg_max, const DenseTensor& out_grad, int pooled_height, diff --git a/paddle/phi/kernels/roi_pool_kernel.h b/paddle/phi/kernels/roi_pool_kernel.h index c6ff6f223612a..e7ed2587968f5 100644 --- a/paddle/phi/kernels/roi_pool_kernel.h +++ b/paddle/phi/kernels/roi_pool_kernel.h @@ -25,7 +25,7 @@ template void RoiPoolKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, - paddle::optional boxes_num, + const paddle::optional& boxes_num, int pooled_height, int pooled_width, float spatial_scale, diff --git a/paddle/phi/kernels/segment_pool_grad_kernel.h b/paddle/phi/kernels/segment_pool_grad_kernel.h index e773eed16e8c8..edf9ff9c7568c 100644 --- a/paddle/phi/kernels/segment_pool_grad_kernel.h +++ b/paddle/phi/kernels/segment_pool_grad_kernel.h @@ -23,7 +23,7 @@ void SegmentPoolGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& segment_ids, const DenseTensor& out, - paddle::optional summed_ids, + const paddle::optional& summed_ids, const DenseTensor& out_grad, const std::string& pooltype, DenseTensor* x_grad); diff --git a/paddle/phi/kernels/selected_rows/adam_kernel.h b/paddle/phi/kernels/selected_rows/adam_kernel.h index 2e13d29d17284..79f87a8ed75c0 100644 --- 
a/paddle/phi/kernels/selected_rows/adam_kernel.h +++ b/paddle/phi/kernels/selected_rows/adam_kernel.h @@ -31,8 +31,8 @@ void AdamDenseParamSparseGradKernel( const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/selected_rows/adamw_kernel.h b/paddle/phi/kernels/selected_rows/adamw_kernel.h index ddb155ce4504e..5dda8107d52e3 100644 --- a/paddle/phi/kernels/selected_rows/adamw_kernel.h +++ b/paddle/phi/kernels/selected_rows/adamw_kernel.h @@ -31,8 +31,8 @@ void AdamwDenseParamSparseGradKernel( const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/selected_rows/assign_kernel.cc b/paddle/phi/kernels/selected_rows/assign_kernel.cc index fae876facfc8f..f0c0ffb591a11 100644 --- a/paddle/phi/kernels/selected_rows/assign_kernel.cc +++ b/paddle/phi/kernels/selected_rows/assign_kernel.cc @@ -20,7 +20,7 @@ namespace phi { namespace sr { -// Note: use `const paddle::optional x` +// Note: use `const paddle::optional& x` // as input if needed template void AssignKernel(const Context& dev_ctx, diff --git a/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc index 57e33beb95e3e..d96c707538e41 100644 --- a/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc +++ b/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc @@ -35,8 +35,8 @@ void AdamDenseParamSparseGradKernel( const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/selected_rows/cpu/adamw_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/adamw_kernel.cc index a52bca761108c..6d2fc164d6b33 100644 --- a/paddle/phi/kernels/selected_rows/cpu/adamw_kernel.cc +++ b/paddle/phi/kernels/selected_rows/cpu/adamw_kernel.cc @@ -35,8 +35,8 @@ void AdamwDenseParamSparseGradKernel( const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu index 31abac149951d..18b6da818a1f3 100644 --- a/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu +++ b/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu @@ -102,8 +102,8 @@ void AdamDenseParamSparseGradKernel( const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu 
b/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu index b847f48d12267..182c4390b1722 100644 --- a/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu +++ b/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu @@ -112,8 +112,8 @@ void AdamwDenseParamSparseGradKernel( const DenseTensor& moment2, const DenseTensor& beta1_pow, const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, + const paddle::optional& master_param, + const paddle::optional& skip_update, const Scalar& beta1, const Scalar& beta2, const Scalar& epsilon, diff --git a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc index 1660601bbd36e..616786d210df7 100644 --- a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc +++ b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc @@ -40,9 +40,9 @@ void HierarchicalSigmoidGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& w, const DenseTensor& label, - paddle::optional path, - paddle::optional code, - paddle::optional bias, + const paddle::optional& path, + const paddle::optional& code, + const paddle::optional& bias, const DenseTensor& pre_out, const DenseTensor& out_grad, int num_classes, diff --git a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h index 4c03b83d80fff..aca355f515c44 100644 --- a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h +++ b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h @@ -25,9 +25,9 @@ void HierarchicalSigmoidGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& w, const DenseTensor& label, - paddle::optional path, - paddle::optional code, - paddle::optional bias, + const paddle::optional& path, + const paddle::optional& code, + const paddle::optional& bias, const DenseTensor& pre_out, const DenseTensor& out_grad, int num_classes, diff --git a/paddle/phi/kernels/sgd_kernel.h b/paddle/phi/kernels/sgd_kernel.h index 12361c738e247..226a719b90244 100644 --- a/paddle/phi/kernels/sgd_kernel.h +++ b/paddle/phi/kernels/sgd_kernel.h @@ -24,7 +24,7 @@ void SGDDenseKernel(const Context& dev_ctx, const DenseTensor& param, const DenseTensor& learning_rate, const DenseTensor& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision, DenseTensor* param_out, DenseTensor* master_param_out); @@ -35,7 +35,7 @@ void SGDDenseParamSparseGradKernel( const DenseTensor& param, const DenseTensor& learning_rate, const SelectedRows& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision, DenseTensor* param_out, DenseTensor* master_param_out); @@ -46,7 +46,7 @@ void SGDSparseParamSparseGradKernel( const SelectedRows& param, const DenseTensor& learning_rate, const SelectedRows& grad, - paddle::optional master_param, + const paddle::optional& master_param, bool multi_precision, SelectedRows* param_out, SelectedRows* master_param_out); diff --git a/paddle/phi/kernels/warpctc_grad_kernel.h b/paddle/phi/kernels/warpctc_grad_kernel.h index 8e1ab43324a50..8a8251aabe468 100644 --- a/paddle/phi/kernels/warpctc_grad_kernel.h +++ b/paddle/phi/kernels/warpctc_grad_kernel.h @@ -24,7 +24,7 @@ void WarpctcGradKernel(const Context& dev_ctx, const DenseTensor& warpctc_grad, const DenseTensor& logits, const DenseTensor& loss_grad, - paddle::optional logits_length, + const 
paddle::optional& logits_length, int blank, bool norm_by_times, DenseTensor* logits_grad); diff --git a/paddle/phi/kernels/warpctc_kernel.h b/paddle/phi/kernels/warpctc_kernel.h index 4baa49064775e..0b9e9eb87f675 100644 --- a/paddle/phi/kernels/warpctc_kernel.h +++ b/paddle/phi/kernels/warpctc_kernel.h @@ -23,8 +23,8 @@ template void WarpctcKernel(const Context& dev_ctx, const DenseTensor& logits, const DenseTensor& label, - paddle::optional logits_length, - paddle::optional labels_length, + const paddle::optional& logits_length, + const paddle::optional& labels_length, int blank, bool norm_by_times, DenseTensor* warpctc_grad, diff --git a/paddle/phi/kernels/yolov3_loss_grad_kernel.h b/paddle/phi/kernels/yolov3_loss_grad_kernel.h index 789e782443f68..b4ce5b9539813 100644 --- a/paddle/phi/kernels/yolov3_loss_grad_kernel.h +++ b/paddle/phi/kernels/yolov3_loss_grad_kernel.h @@ -23,7 +23,7 @@ void Yolov3LossGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& gt_box, const DenseTensor& gt_label, - paddle::optional gt_score, + const paddle::optional& gt_score, const DenseTensor& loss_grad, const DenseTensor& objectness_mask, const DenseTensor& gt_match_mask, diff --git a/paddle/phi/kernels/yolov3_loss_kernel.h b/paddle/phi/kernels/yolov3_loss_kernel.h index eb6668000dee0..3dabe5ce820ee 100644 --- a/paddle/phi/kernels/yolov3_loss_kernel.h +++ b/paddle/phi/kernels/yolov3_loss_kernel.h @@ -23,7 +23,7 @@ void Yolov3LossKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& gt_box, const DenseTensor& gt_label, - paddle::optional gt_score, + const paddle::optional& gt_score, const std::vector& anchors, const std::vector& anchor_mask, int class_num, diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 146925ccef6d5..1638f6afab20c 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -141,7 +141,7 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): 'int[]': 'const std::vector&' } optional_types_trans = { - 'Tensor': 'paddle::optional', + 'Tensor': 'const paddle::optional&', 'Tensor[]': 'const paddle::optional>&', 'int': 'paddle::optional', 'int32_t': 'paddle::optional', @@ -512,18 +512,7 @@ def gene_infer_meta(self, kernel_output_names, code_indent) -> str: param_code = param_code + param + "_metas, " elif param in self.optional_vars: - meta_tensor_code = meta_tensor_code + f""" -{code_indent} paddle::optional {PREFIX_TENSOR_NAME}meta_ref_{param} = paddle::none; -{code_indent} phi::DenseTensor {param}_dt; -{code_indent} phi::MetaTensor {PREFIX_TENSOR_NAME}meta_tmp_{param}({param}_dt); -{code_indent} if ({PREFIX_TENSOR_NAME}{param}_ptr) {{ -{code_indent} {PREFIX_TENSOR_NAME}meta_tmp_{param}.set_dtype( {PREFIX_TENSOR_NAME}{param}_ptr->dtype() ); -{code_indent} {PREFIX_TENSOR_NAME}meta_tmp_{param}.set_dims( {PREFIX_TENSOR_NAME}{param}_ptr->dims() ); -{code_indent} {PREFIX_TENSOR_NAME}meta_tmp_{param}.set_layout( {PREFIX_TENSOR_NAME}{param}_ptr->layout() ); -{code_indent} {PREFIX_TENSOR_NAME}meta_ref_{param} = {PREFIX_TENSOR_NAME}meta_tmp_{param}; -{code_indent} }}\n""" - - param_code = param_code + f"{PREFIX_TENSOR_NAME}meta_ref_{param}, " + param_code = param_code + "MakeMetaTensor(" + PREFIX_TENSOR_NAME + param + "), " else: raise ValueError( f"{self.api} : Param of infer_meta error : {self.inputs['input_info'][param]} type is not supported." 
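
The hunks above migrate optional kernel inputs to a const-reference paddle::optional parameter instead of passing the optional by value. Below is a minimal sketch of how a phi-style kernel consumes such an argument; the kernel name and the bias branch are hypothetical, only the get_ptr() guard and the Alloc call mirror the rmsprop/momentum hunks above, and the sketch compiles only inside the Paddle source tree.

// Sketch only, not part of this patch.
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/utils/optional.h"

namespace phi {

template <typename T, typename Context>
void ExampleWithOptionalBias(const Context& dev_ctx,
                             const DenseTensor& x,
                             const paddle::optional<DenseTensor>& bias,
                             DenseTensor* out) {
  dev_ctx.template Alloc<T>(out);
  // get_ptr() returns nullptr when no tensor was passed, so the optional is
  // checked without copying a DenseTensor; taking the optional by const
  // reference is what removes the by-value copies the old signatures made.
  const DenseTensor* bias_ptr = bias.get_ptr();
  if (bias_ptr != nullptr) {
    // ... path that uses *bias_ptr ...
  } else {
    // ... bias-free fallback ...
  }
}

}  // namespace phi
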
@@ -568,8 +557,8 @@ def get_kernel_args(self, code_indent): 'const std::vector&', 'const paddle::optional': 'paddle::optional', - 'paddle::optional': - 'paddle::optional', + 'const paddle::optional&': + 'const paddle::optional&', 'const paddle::optional>&': 'paddle::optional&>' } @@ -597,11 +586,7 @@ def get_kernel_args(self, code_indent): trans_flag = "{false, true}" if input_name in self.optional_vars: input_tensor_code = input_tensor_code + f""" -{code_indent} {input_trans_map[input_infos[input_name]]} {PREFIX_TENSOR_NAME}{input_name}(paddle::none); -{code_indent} auto {PREFIX_TENSOR_NAME}{input_name}_ptr = PrepareData({input_name}, kernel.InputAt({i}), {trans_flag}); -{code_indent} if ({PREFIX_TENSOR_NAME}{input_name}_ptr) {{ -{code_indent} {PREFIX_TENSOR_NAME}{input_name} = paddle::make_optional(*{PREFIX_TENSOR_NAME}{input_name}_ptr); -{code_indent} }}""" +{code_indent} auto {PREFIX_TENSOR_NAME}{input_name} = PrepareData({input_name}, kernel.InputAt({i}), {trans_flag});""" else: if self.inputs['input_info'][input_name] == "const Tensor&": @@ -677,7 +662,7 @@ def get_selected_rows_kernel_args(self, code_indent): input_trans_map = { 'const Tensor&': 'const phi::SelectedRows&', 'const paddle::optional&': - 'paddle::optional' + 'const paddle::optional&' } out_trans_map = {'Tensor': 'phi::SelectedRows*'} input_names = self.inputs['names'] diff --git a/python/paddle/utils/code_gen/type_mapping.py b/python/paddle/utils/code_gen/type_mapping.py index ecbd1f494c2ee..c6e110907a9f7 100644 --- a/python/paddle/utils/code_gen/type_mapping.py +++ b/python/paddle/utils/code_gen/type_mapping.py @@ -108,7 +108,7 @@ sr_input_types_map = {'Tensor': 'const phi::SelectedRows&', } sr_optional_input_types_map = { - 'Tensor': 'paddle::optional', + 'Tensor': 'const paddle::optional&', } sr_output_types_map = {'Tensor': 'phi::SelectedRows*', } diff --git a/python/paddle/utils/code_gen/wrapped_infermeta_gen.py b/python/paddle/utils/code_gen/wrapped_infermeta_gen.py index c14d39e9842be..bf798f9734d53 100644 --- a/python/paddle/utils/code_gen/wrapped_infermeta_gen.py +++ b/python/paddle/utils/code_gen/wrapped_infermeta_gen.py @@ -43,10 +43,7 @@ def gene_wrapped_infermeta_and_register(api): 'const std::vector&': 'const std::vector&', 'Tensor': 'MetaTensor*', 'std::vector': 'std::vector*', - 'const paddle::optional': - 'const paddle::optional', - 'paddle::optional': - 'paddle::optional' + 'const paddle::optional&': 'const MetaTensor&' } wrapped_infermeta_name = get_wrapped_infermeta_name(api.api) From 4d32f417a435446d06541ae951edc2404e97e74c Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Fri, 27 May 2022 21:36:35 +0800 Subject: [PATCH 057/109] [Eager] Support EagerParamBase init by 'shape'(Tensor) (#43045) --- python/paddle/fluid/framework.py | 3 +++ .../fluid/tests/unittests/test_egr_python_api.py | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 6957dd8c5e30c..757b1a2da95b9 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -6619,6 +6619,9 @@ def __init__(self, shape, dtype, **kwargs): name = kwargs.get('name', unique_name.generate('_eager_param_base')) + if isinstance(shape, core.eager.Tensor): + shape = shape.numpy() + super(EagerParamBase, self).__init__( dtype if dtype else core.VarDesc.VarType.FP32, list(shape) diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 600a49b2332be..bb8c6346eb5a5 100644 --- 
a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -279,6 +279,16 @@ def constructor(self, place): "The type of trainable MUST be bool, but the type is /*"): eager_param.trainable = "False" + eager_param_2 = EagerParamBase( + shape=paddle.shape(paddle.to_tensor([1, 2, 3, 4])), dtype="float32") + self.assertTrue(eager_param_2.trainable) + eager_param_2.trainable = False + self.assertFalse(eager_param_2.trainable) + with self.assertRaisesRegexp( + ValueError, + "The type of trainable MUST be bool, but the type is /*"): + eager_param_2.trainable = "False" + def test_constructor(self): print("Test_constructor") paddle.set_device("cpu") From 9eb18c75a39816c91d8456ae455fe403ac62d451 Mon Sep 17 00:00:00 2001 From: Jiabin Yang <360788950@qq.com> Date: Fri, 27 May 2022 21:39:22 +0800 Subject: [PATCH 058/109] [Eager] Support is empty (#43032) * support is empty * fix error * fix code error * change to fake empty * using fake empty first * using fake empty first --- .../eager/accumulation/accumulation_node.cc | 54 +++++++++++-------- .../eager/accumulation/accumulation_node.h | 8 +-- paddle/fluid/pybind/eager_method.cc | 8 ++- 3 files changed, 43 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 2ed44ce489934..544e7c8fe85d6 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -28,33 +28,40 @@ namespace egr { static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, - const paddle::experimental::Tensor& t) { - if (!tensor->defined() || !tensor->initialized()) { - // Simply copy tensor->impl + const paddle::experimental::Tensor& t, + bool is_fake_empty) { + if (is_fake_empty) { *tensor = t; } else { - // Accumulation - if (LIKELY(t.is_dense_tensor())) { - if (LIKELY(tensor->is_dense_tensor())) { - paddle::imperative::TensorAdd(t, tensor); + if (!tensor->defined() || !tensor->initialized()) { + // Simply copy tensor->impl + *tensor = t; + } else { + // Accumulation + if (LIKELY(t.is_dense_tensor())) { + if (LIKELY(tensor->is_dense_tensor())) { + paddle::imperative::TensorAdd(t, + tensor); + } else { + // TODO(jiabin): Support Other TensorBase later + // TODO(zhanlve): Replace SelectedRowsAddTensor with + // add_dygraph_function once it's supported + paddle::experimental::Tensor new_buffer( + std::make_shared(), "tmp_accumulator"); + paddle::imperative::SelectedRowsAddTensor(*tensor, t, &new_buffer); + tensor->set_impl(new_buffer.impl()); + } } else { // TODO(jiabin): Support Other TensorBase later // TODO(zhanlve): Replace SelectedRowsAddTensor with - // add_dygraph_function once it's supported - paddle::experimental::Tensor new_buffer( - std::make_shared(), "tmp_accumulator"); - paddle::imperative::SelectedRowsAddTensor(*tensor, t, &new_buffer); - tensor->set_impl(new_buffer.impl()); - } - } else { - // TODO(jiabin): Support Other TensorBase later - // TODO(zhanlve): Replace SelectedRowsAddTensor with add_dygraph_function - // once it's supported - if (tensor->is_dense_tensor()) { - paddle::imperative::SelectedRowsAddToTensor(t, tensor); - } else { - *tensor = std::move(*paddle::imperative::SelectedRowsMerge< - paddle::experimental::Tensor>(t, *tensor)); + // add_dygraph_function + // once it's supported + if (tensor->is_dense_tensor()) { + paddle::imperative::SelectedRowsAddToTensor(t, tensor); + } else { + *tensor = 
std::move(*paddle::imperative::SelectedRowsMerge< + paddle::experimental::Tensor>(t, *tensor)); + } } } } @@ -91,7 +98,8 @@ GradNodeAccumulation::operator()( if (!weak_grad_.expired() && !is_new_grad) { auto grad = weak_grad_.lock(); - CopyOrAddTensor(grad.get(), grad_out); + CopyOrAddTensor(grad.get(), grad_out, is_fake_empty_); + is_fake_empty_ = false; } // Apply Reduce Hooks diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index f37de9c8e88f1..6374534578cb8 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -64,14 +64,16 @@ class GradNodeAccumulation : public GradNodeBase { new GradNodeAccumulation(nullptr)); } + void SetFakeEmpty(bool is_fake_empty) { is_fake_empty_ = is_fake_empty; } + private: + // TODO(Jiabin): remove this when we make our clear gradient really cleared; + bool is_fake_empty_ = {false}; std::weak_ptr weak_grad_; - + std::vector> reduce_hooks_; std::function retain_grad_hook_; - - std::vector> reduce_hooks_; }; } // namespace egr diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 1a0838d7f47c6..b54f4e1416c35 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -494,7 +494,8 @@ static PyObject* tensor_clear_gradient(TensorObject* self, PyObject* args, } paddle::experimental::Tensor* grad; - if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { + bool is_leaf = egr::egr_utils_api::IsLeafTensor(self->tensor); + if (is_leaf) { grad = egr::EagerUtils::mutable_grad(self->tensor); PADDLE_ENFORCE(grad != nullptr, paddle::platform::errors::Fatal( @@ -518,6 +519,11 @@ static PyObject* tensor_clear_gradient(TensorObject* self, PyObject* args, if (grad->initialized()) { if (set_to_zero) { grad->set_impl(paddle::experimental::zeros_like(*grad).impl()); + if (is_leaf) { + std::static_pointer_cast( + egr::EagerUtils::grad_node(self->tensor)) + ->SetFakeEmpty(true); + } } else { VLOG(4) << "Gradient of " << self->tensor.name() << " is initialized, will be released."; From 8cc2e28c7ed3c4826de9c82f60368b06bd111918 Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Sat, 28 May 2022 16:32:41 +0800 Subject: [PATCH 059/109] [Bug Fix]Fix global_scatter/global_gather in ProcessGroup (#43027) * fix alltoall * rename utest --- .../distributed/collective/ProcessGroup.h | 13 ++ .../collective/ProcessGroupNCCL.cc | 47 +++++++ .../distributed/collective/ProcessGroupNCCL.h | 8 ++ .../collective/global_gather_op.cu.cc | 132 +++++++++++++++++- .../operators/collective/global_gather_op.h | 11 ++ .../collective/global_scatter_op.cu.cc | 130 ++++++++++++++++- .../operators/collective/global_scatter_op.h | 11 ++ .../fluid/tests/unittests/CMakeLists.txt | 4 +- .../unittests/test_collective_api_base.py | 9 +- .../test_collective_global_gather.py | 11 +- .../test_collective_global_scatter.py | 11 +- 11 files changed, 376 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index fca395c5f2bf7..52e09792d5d80 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -113,6 +113,19 @@ class ProcessGroup { "ProcessGroup%s does not support receive", GetBackendName())); } + virtual std::shared_ptr Send_Partial(phi::DenseTensor&, + int, int, + int) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + 
"ProcessGroup%s does not support send", GetBackendName())); + } + + virtual std::shared_ptr Recv_Partial( + phi::DenseTensor& tensors, int, int, int) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support receive", GetBackendName())); + } + virtual std::shared_ptr AllGather( std::vector&, // NOLINT std::vector&) { // NOLINT diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 86cc5b5db7cd7..f1b66864b2930 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -428,6 +428,53 @@ std::shared_ptr ProcessGroupNCCL::Recv( return task; } +std::shared_ptr ProcessGroupNCCL::Send_Partial( + phi::DenseTensor& tensors, int dst_rank, int offset, int length) { + // CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); + + phi::DenseTensor flatten_tensor; + flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()}); + + phi::DenseTensor shared_input = flatten_tensor.Slice(offset, offset + length); + + std::vector shared_tensors; + shared_tensors.push_back(shared_input); + + auto task = PointToPoint(shared_tensors, + [&](phi::DenseTensor& input, ncclComm_t comm, + const gpuStream_t& stream, int dst_rank) { + return platform::dynload::ncclSend( + input.data(), input.numel(), + platform::ToNCCLDataType(input.dtype()), + dst_rank, comm, stream); + }, + dst_rank, CommType::SEND); + return task; +} + +std::shared_ptr ProcessGroupNCCL::Recv_Partial( + phi::DenseTensor& tensors, int src_rank, int offset, int length) { + // phi::DenseTensor shared_input = tensors.Slice(offset, offset+length); + + phi::DenseTensor flatten_tensor; + flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()}); + phi::DenseTensor shared_input = flatten_tensor.Slice(offset, offset + length); + + std::vector shared_tensors; + shared_tensors.push_back(shared_input); + + auto task = PointToPoint(shared_tensors, + [&](phi::DenseTensor& output, ncclComm_t comm, + const gpuStream_t& stream, int src_rank) { + return platform::dynload::ncclRecv( + output.data(), output.numel(), + platform::ToNCCLDataType(output.dtype()), + src_rank, comm, stream); + }, + src_rank, CommType::RECV); + return task; +} + std::shared_ptr ProcessGroupNCCL::AllGather( std::vector& in_tensors, std::vector& out_tensors) { diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 4b6c3f4031354..82ced6e135ac9 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -102,6 +102,14 @@ class ProcessGroupNCCL : public ProcessGroup { std::shared_ptr Recv( std::vector& tensors, int src_rank) override; + std::shared_ptr Send_Partial(phi::DenseTensor& tensors, + int dst_rank, int offset, + int length) override; + + std::shared_ptr Recv_Partial(phi::DenseTensor& tensors, + int src_rank, int offset, + int length) override; + std::shared_ptr AllGather( std::vector& in_tensors, std::vector& out_tensors) override; diff --git a/paddle/fluid/operators/collective/global_gather_op.cu.cc b/paddle/fluid/operators/collective/global_gather_op.cu.cc index 6684470e881cb..c256063090cc8 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cu.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cu.cc @@ -22,10 +22,10 @@ limitations under the License. 
*/ namespace paddle { namespace operators { + template -class GlobalGatherOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { +struct GlobalGatherFunctor { + void operator()(const framework::ExecutionContext& ctx) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); @@ -137,6 +137,132 @@ class GlobalGatherOpCUDAKernel : public framework::OpKernel { } }; +template +struct GlobalGatherProcessGroupFunctor { + void operator()(const framework::ExecutionContext& ctx) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if NCCL_VERSION_CODE >= 2703 + auto x = ctx.Input("X"); + auto local_count = ctx.Input("local_count"); + auto global_count = ctx.Input("global_count"); + auto local_count_type = + framework::TransToProtoVarType(local_count->dtype()); + auto global_count_type = + framework::TransToProtoVarType(global_count->dtype()); + if (local_count_type != framework::proto::VarType::INT64) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Please use int64 type in local_count.")); + } + if (global_count_type != framework::proto::VarType::INT64) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Please use int64 type in global_count.")); + } + auto out = ctx.Output("Out"); + const int64_t* cpu_local_count_data; + const int64_t* cpu_global_count_data; + auto local_count_len = 0; + + framework::Tensor cpu_local_count; + if (platform::is_cpu_place(local_count->place())) { + cpu_local_count_data = local_count->data(); + local_count_len = local_count->numel(); + } else { + framework::TensorCopySync(*local_count, platform::CPUPlace(), + &cpu_local_count); + cpu_local_count_data = cpu_local_count.data(); + local_count_len = cpu_local_count.numel(); + } + + framework::Tensor cpu_global_count; + if (platform::is_cpu_place(global_count->place())) { + cpu_global_count_data = global_count->data(); + } else { + framework::TensorCopySync(*global_count, platform::CPUPlace(), + &cpu_global_count); + cpu_global_count_data = cpu_global_count.data(); + } + + int ring_id = ctx.Attr("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for global gather op must be non-negative.", + ring_id)); + auto place = ctx.GetPlace(); + + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + distributed::ProcessGroup* pg = map->get(ring_id); + + int nranks = pg->GetSize(); + auto in_feat = x->dims()[1]; + auto n_expert = local_count->dims()[0] / nranks; + + auto fwd_count = 0; + + for (auto i = 0; i < local_count_len; ++i) { + fwd_count += cpu_local_count_data[i]; + } + framework::DDim out_dims = phi::make_ddim({fwd_count, in_feat}); + int64_t* expert_ptr = new int64_t[n_expert * nranks]; + expert_ptr[0] = 0; + auto tot_experts = n_expert * nranks; + for (auto i = 1; i < tot_experts; ++i) { + expert_ptr[i] = expert_ptr[i - 1] + cpu_local_count_data[i - 1]; + } + auto send_ptr = 0; + out->mutable_data(out_dims, place); + + for (auto i = 0; i < n_expert; ++i) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto j = 0; j < nranks; ++j) { + int idx = i + j * n_expert; + if (cpu_global_count_data[idx]) { + phi::DenseTensor tmp = *x; + pg->Send_Partial(tmp, j, send_ptr * in_feat, + cpu_global_count_data[idx] * in_feat); + send_ptr += cpu_global_count_data[idx]; + } + if (cpu_local_count_data[idx]) { + pg->Recv_Partial(*out, j, expert_ptr[idx] * in_feat, + cpu_local_count_data[idx] * in_feat); + 
} + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + } + +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); +#endif + +#else + PADDLE_THROW( + platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +#endif +#else + PADDLE_THROW( + platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); +#endif + } +}; + +template +class GlobalGatherOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const int rid = ctx.Attr("ring_id"); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(rid)) { + GlobalGatherProcessGroupFunctor functor_; + functor_(ctx); + } else { + GlobalGatherFunctor functor_; + functor_(ctx); + } + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/collective/global_gather_op.h b/paddle/fluid/operators/collective/global_gather_op.h index 3ff2df9e48f3d..47212b1d15581 100644 --- a/paddle/fluid/operators/collective/global_gather_op.h +++ b/paddle/fluid/operators/collective/global_gather_op.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" @@ -33,5 +34,15 @@ class GlobalGatherOpCPUKernel : public framework::OpKernel { } }; +template +struct GlobalGatherFunctor { + void operator()(const framework::ExecutionContext& ctx); +}; + +template +struct GlobalGatherProcessGroupFunctor { + void operator()(const framework::ExecutionContext& ctx); +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index cd3c3a3229ca0..df8d675ec9d71 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -22,10 +22,10 @@ limitations under the License. 
*/ namespace paddle { namespace operators { + template -class GlobalScatterOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { +struct GlobalScatterFunctor { + void operator()(const framework::ExecutionContext& ctx) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); @@ -137,6 +137,130 @@ class GlobalScatterOpCUDAKernel : public framework::OpKernel { } }; +template +struct GlobalScatterProcessGroupFunctor { + void operator()(const framework::ExecutionContext& ctx) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if NCCL_VERSION_CODE >= 2703 + auto x = ctx.Input("X"); + auto local_count = ctx.Input("local_count"); + auto global_count = ctx.Input("global_count"); + auto local_count_type = + framework::TransToProtoVarType(local_count->dtype()); + auto global_count_type = + framework::TransToProtoVarType(global_count->dtype()); + if (local_count_type != framework::proto::VarType::INT64) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Please use int64 type in local_count.")); + } + if (global_count_type != framework::proto::VarType::INT64) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Please use int64 type in global_count.")); + } + auto out = ctx.Output("Out"); + const int64_t* cpu_local_count_data; + const int64_t* cpu_global_count_data; + framework::Tensor cpu_local_count; + if (platform::is_cpu_place(local_count->place())) { + cpu_local_count_data = local_count->data(); + } else { + framework::TensorCopySync(*local_count, platform::CPUPlace(), + &cpu_local_count); + cpu_local_count_data = cpu_local_count.data(); + } + auto global_count_len = 0; + framework::Tensor cpu_global_count; + if (platform::is_cpu_place(global_count->place())) { + cpu_global_count_data = global_count->data(); + global_count_len = global_count->numel(); + } else { + framework::TensorCopySync(*global_count, platform::CPUPlace(), + &cpu_global_count); + cpu_global_count_data = cpu_global_count.data(); + global_count_len = cpu_global_count.numel(); + } + + int ring_id = ctx.Attr("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for global scatter op must be non-negative.", + ring_id)); + + auto place = ctx.GetPlace(); + + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + distributed::ProcessGroup* pg = map->get(ring_id); + int nranks = pg->GetSize(); + auto in_feat = x->dims()[1]; + auto n_expert = local_count->dims()[0] / nranks; + int64_t fwd_count = 0; + + for (auto i = 0; i < global_count_len; ++i) { + fwd_count += cpu_global_count_data[i]; + } + framework::DDim out_dims = phi::make_ddim({fwd_count, in_feat}); + int64_t* expert_ptr = new int64_t[n_expert * nranks]; + expert_ptr[0] = 0; + auto tot_experts = n_expert * nranks; + for (auto i = 1; i < tot_experts; ++i) { + expert_ptr[i] = expert_ptr[i - 1] + cpu_local_count_data[i - 1]; + } + + auto recv_ptr = 0; + out->mutable_data(out_dims, place); + + for (auto i = 0; i < n_expert; ++i) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto j = 0; j < nranks; ++j) { + int idx = i + j * n_expert; + if (cpu_local_count_data[idx]) { + phi::DenseTensor tmp = *x; + pg->Send_Partial(tmp, j, expert_ptr[idx] * in_feat, + cpu_local_count_data[idx] * in_feat); + } + if (cpu_global_count_data[idx]) { + pg->Recv_Partial(*out, j, recv_ptr * in_feat, + cpu_global_count_data[idx] * in_feat); + recv_ptr += 
cpu_global_count_data[idx]; + } + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + } + +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); +#endif + +#else + PADDLE_THROW( + platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +#endif +#else + PADDLE_THROW( + platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); +#endif + } +}; + +template +class GlobalScatterOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const int rid = ctx.Attr("ring_id"); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(rid)) { + GlobalScatterProcessGroupFunctor functor_; + functor_(ctx); + } else { + GlobalScatterFunctor functor_; + functor_(ctx); + } + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/collective/global_scatter_op.h b/paddle/fluid/operators/collective/global_scatter_op.h index 52b486aef25c2..aa567a284a6f7 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.h +++ b/paddle/fluid/operators/collective/global_scatter_op.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" @@ -33,5 +34,15 @@ class GlobalScatterOpCPUKernel : public framework::OpKernel { } }; +template +struct GlobalScatterFunctor { + void operator()(const framework::ExecutionContext& ctx); +}; + +template +struct GlobalScatterProcessGroupFunctor { + void operator()(const framework::ExecutionContext& ctx); +}; + } // namespace operators } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 2918e8501c3d0..402e65f76d5b1 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1182,8 +1182,8 @@ endif() if((WITH_ROCM OR WITH_GPU) AND NOT WIN32) set_tests_properties(test_collective_allgather_api PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_alltoall_api PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_global_gather PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_global_scatter PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_global_gather PROPERTIES TIMEOUT 200) + set_tests_properties(test_collective_global_scatter PROPERTIES TIMEOUT 200) set_tests_properties(test_collective_sendrecv_api PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_broadcast_api PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_allreduce_api PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index 00294bf6071b3..dbd982947265f 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -191,7 +191,8 @@ def check_with_place(self, path_id="0", static_mode="1", check_error_log=False, - need_envs={}): + need_envs={}, + eager_mode=True): if backend == "nccl" or backend == "bkcl": with_gloo = '0' else: @@ -216,6 +217,12 @@ def check_with_place(self, 
required_envs["GLOG_v"] = "3" required_envs["GLOG_logtostderr"] = "1" required_envs["GLOO_LOG_LEVEL"] = "TRACE" + + if eager_mode: + required_envs["FLAGS_enable_eager_mode"] = "%d" % 0 + else: + required_envs["FLAGS_enable_eager_mode"] = "%d" % 1 + tr0_out, tr1_out, pid0, pid1 = self._run_cluster(model_file, required_envs) np.random.seed(pid0) diff --git a/python/paddle/fluid/tests/unittests/test_collective_global_gather.py b/python/paddle/fluid/tests/unittests/test_collective_global_gather.py index c9dee529c21a1..6809f3970f683 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_global_gather.py +++ b/python/paddle/fluid/tests/unittests/test_collective_global_gather.py @@ -35,7 +35,16 @@ def test_global_gather_nccl_dygraph(self): "collective_global_gather_dygraph.py", "global_gather", "nccl", - static_mode="0") + static_mode="0", + eager_mode=False) + + def test_global_gather_nccl_dygraph_eager(self): + self.check_with_place( + "collective_global_gather_dygraph.py", + "global_gather", + "nccl", + static_mode="0", + eager_mode=True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_collective_global_scatter.py b/python/paddle/fluid/tests/unittests/test_collective_global_scatter.py index 2b4555de2744d..1485bafa387f5 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_global_scatter.py +++ b/python/paddle/fluid/tests/unittests/test_collective_global_scatter.py @@ -35,7 +35,16 @@ def test_global_scatter_nccl_dygraph(self): "collective_global_scatter_dygraph.py", "global_scatter", "nccl", - static_mode="0") + static_mode="0", + eager_mode=False) + + def test_global_scatter_nccl_dygraph_eager(self): + self.check_with_place( + "collective_global_scatter_dygraph.py", + "global_scatter", + "nccl", + static_mode="0", + eager_mode=True) if __name__ == '__main__': From a1d87776ac500b1a3c3250dd9897f103515909c6 Mon Sep 17 00:00:00 2001 From: zhangchunle Date: Mon, 30 May 2022 10:30:24 +0800 Subject: [PATCH 060/109] rm serial mode in exclusive case (#43073) --- .../fluid/tests/unittests/CMakeLists.txt | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 402e65f76d5b1..a78c820e1e66a 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -620,22 +620,22 @@ endif() py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${GC_ENVS}) py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS}) py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS - FLAGS_cudnn_deterministic=1 SERIAL) + FLAGS_cudnn_deterministic=1) set_tests_properties(test_imperative_resnet PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") py_test_modules(test_imperative_resnet_sorted_gradient MODULES test_imperative_resnet_sorted_gradient ENVS - FLAGS_cudnn_deterministic=1 SERIAL) + FLAGS_cudnn_deterministic=1) set_tests_properties(test_imperative_resnet_sorted_gradient PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") py_test_modules(test_imperative_mnist MODULES test_imperative_mnist ENVS FLAGS_cudnn_deterministic=1) py_test_modules(test_imperative_mnist_sorted_gradient MODULES test_imperative_mnist_sorted_gradient ENVS FLAGS_cudnn_deterministic=1) py_test_modules(test_imperative_se_resnext MODULES test_imperative_se_resnext ENVS - FLAGS_cudnn_deterministic=1 SERIAL) + FLAGS_cudnn_deterministic=1) 
set_tests_properties(test_imperative_se_resnext PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") py_test_modules(test_imperative_ocr_attention_model MODULES test_imperative_ocr_attention_model ENVS - FLAGS_cudnn_deterministic=1 SERIAL) + FLAGS_cudnn_deterministic=1) py_test_modules(test_install_check MODULES test_install_check ENVS - FLAGS_cudnn_deterministic=1 SERIAL) + FLAGS_cudnn_deterministic=1) set_tests_properties(test_install_check PROPERTIES LABELS "RUN_TYPE=DIST") py_test_modules(test_imperative_static_runner_mnist MODULES test_imperative_static_runner_mnist ENVS FLAGS_cudnn_deterministic=1) @@ -763,19 +763,19 @@ if(WITH_DISTRIBUTE) # port range (20000, 23000) is reserved for dist-ops set(dist_ut_port 20001) foreach(TEST_OP ${DIST_TEST_OPS}) - bash_test_modules(${TEST_OP} START_BASH dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}") + bash_test_modules(${TEST_OP} START_BASH dist_test.sh LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}") MATH(EXPR dist_ut_port "${dist_ut_port}+20") if(dist_ut_port GREATER_EQUAL 22998) message(FATAL_ERROR "available ports have been exhausted:${dist_ut_port}") endif() endforeach(TEST_OP) # solve it later. - bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) + bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) if (WITH_GLOO) - bash_test_modules(test_cpuonly_launch START_BASH test_cpuonly_launch.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) + bash_test_modules(test_cpuonly_launch START_BASH test_cpuonly_launch.sh LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) endif() if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) - bash_test_modules(test_new_group START_BASH test_new_group.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}+20" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) + bash_test_modules(test_new_group START_BASH test_new_group.sh LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}+20" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) endif() endif(NOT APPLE) endif() From 4fd334f5cc501d5ef92003d48a3a0b23d5cef33e Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Mon, 30 May 2022 10:38:07 +0800 Subject: [PATCH 061/109] CI check Coverage build size (#42145) --- paddle/scripts/paddle_build.sh | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 2eda74b769c04..efd2de5621604 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -2986,20 +2986,11 @@ function build_develop() { } function check_coverage_build() { - if [ ! 
"${buildSize}" ];then - echo "build size not found" - exit 1 - fi - - if [ ${WITH_COVERAGE} != "ON" ];then - echo "WARNING: check_coverage need to compile with WITH_COVERAGE=ON, but got WITH_COVERAGE=OFF" - exit 1 - fi - rm -f build_size curl -O https://paddle-docker-tar.bj.bcebos.com/paddle_ci_index/build_size + curl -O https://xly-devops.bj.bcebos.com/PR/build_whl/${AGILE_PULL_ID}/${AGILE_REVISION}/coverage_build_size dev_coverage_build_size=`cat build_size|sed 's#G##g'` - pr_coverage_build_size=`echo $buildSize|sed 's#G##g'` + pr_coverage_build_size=`cat coverage_build_size|sed 's#G##g'` diff_coverage_build_size=`echo $(($pr_coverage_build_size - $dev_coverage_build_size))` @@ -3149,7 +3140,6 @@ function main() { check_diff_file_for_coverage cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} enable_unused_var_check - check_coverage_build ;; gpu_cicheck_coverage) check_approvals_of_unittest 1 @@ -3157,6 +3147,9 @@ function main() { check_coverage check_change_of_unittest ${PYTHON_ABI:-""} ;; + check_coverage_build) + check_coverage_build + ;; ci_preciseTest) insert_pile_to_h_cu_diff cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} From 114a5d214977507c20c2b8f770301e3187f3ab04 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Mon, 30 May 2022 10:39:37 +0800 Subject: [PATCH 062/109] Make data transform inplaced when tensor is on GPUPinned (#43055) * make data transform inplace when tensor is on gpupinned in new dygraph * fix unittest --- paddle/phi/api/lib/data_transform.cc | 34 ++++++++------------ paddle/phi/tests/common/test_int_array.cc | 2 ++ python/paddle/tests/test_async_read_write.py | 2 ++ 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 598559cc4dffb..12f7b8bba5870 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -174,20 +174,6 @@ inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor, if (!platform::is_cuda_pinned_place(tensor.place())) { pool.Get(tensor.place())->Wait(); pool.Get(dst_place)->Wait(); - } else if (platform::is_gpu_place(dst_place)) { - auto* dev_ctx = static_cast(pool.Get(dst_place)); - phi::Copy(*dev_ctx, tensor, dst_place, false, &out); - - // Note: This is an empty callback, the only way is to "reference" - // tensor, so it will not be destructed until the kernels launched at - // current - // stream of given place is finished. 
- auto callback = [tensor, dst_place]() { - VLOG(4) << "Run callback of tensor:" << &tensor << " at place " - << dst_place; - }; - dev_ctx->AddStreamCallback(callback); - return out; } #endif @@ -204,23 +190,31 @@ inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor, return out; } -phi::DenseTensor TransformData(const phi::DenseTensor& tensor, +phi::DenseTensor TransformData(phi::DenseTensor* tensor, const phi::TensorArgDef& target_args_def, const TransformFlag& transform_flag) { - phi::DenseTensor out = tensor; + phi::DenseTensor out = *tensor; + bool trans_layout = false; + bool trans_dtype = false; if (NeedTransformLayout( - tensor.layout(), target_args_def.layout, transform_flag)) { + tensor->layout(), target_args_def.layout, transform_flag)) { out = TransDataLayout(out, target_args_def.layout); + trans_layout = true; } if (NeedTransformDataType( - tensor.dtype(), target_args_def.dtype, transform_flag)) { + tensor->dtype(), target_args_def.dtype, transform_flag)) { out = TransDataType(out, target_args_def.dtype); + trans_dtype = true; } if (NeedTransformPlace( out.place(), target_args_def.backend, transform_flag)) { out = TransDataPlace(out, phi::TransToPhiPlace(target_args_def.backend)); + if (!trans_layout && !trans_dtype && + tensor->place().GetType() == AllocationType::GPUPINNED) { + tensor->ShareBufferWith(out); + } } return out; } @@ -243,7 +237,7 @@ std::shared_ptr PrepareData( return std::static_pointer_cast(tensor_in); } phi::DenseTensor out = - TransformData(dense_tensor, target_args_def, transform_flag); + TransformData(&dense_tensor, target_args_def, transform_flag); return std::make_shared(std::move(out)); } return nullptr; @@ -279,7 +273,7 @@ std::unique_ptr> PrepareData( *std::dynamic_pointer_cast(tensor_in)); } else { pt_tensors->emplace_back( - TransformData(*(static_cast(tensor_in.get())), + TransformData((static_cast(tensor_in.get())), target_args_def, transform_flag)); } diff --git a/paddle/phi/tests/common/test_int_array.cc b/paddle/phi/tests/common/test_int_array.cc index b6c4f2b1ea8e3..a6278ee4a34fc 100644 --- a/paddle/phi/tests/common/test_int_array.cc +++ b/paddle/phi/tests/common/test_int_array.cc @@ -25,8 +25,10 @@ limitations under the License. 
*/ #include "gtest/gtest.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); #endif namespace phi { diff --git a/python/paddle/tests/test_async_read_write.py b/python/paddle/tests/test_async_read_write.py index babdf43199dd6..1432063421586 100644 --- a/python/paddle/tests/test_async_read_write.py +++ b/python/paddle/tests/test_async_read_write.py @@ -96,7 +96,9 @@ def test_main(self): with _test_eager_guard(): self.func_setUp() self.func_test_async_read_empty_offset_and_count() + self.func_setUp() self.func_test_async_read_success() + self.func_setUp() self.func_test_async_read_only_1dim() self.func_setUp() self.func_test_async_read_empty_offset_and_count() From 849d937b9863e97cc72002284d94099981a9c752 Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Mon, 30 May 2022 11:43:28 +0800 Subject: [PATCH 063/109] [fix] addmm supports 1-d input (#42959) * addmm supports 1-d input * fix coverage * fix * more ut --- paddle/phi/infermeta/ternary.cc | 16 +- .../phi/kernels/impl/addmm_grad_kernel_impl.h | 8 + paddle/phi/kernels/impl/addmm_kernel_impl.h | 9 +- .../fluid/tests/unittests/test_addmm_op.py | 141 +++++++++++++++++- python/paddle/tensor/math.py | 28 ++-- 5 files changed, 177 insertions(+), 25 deletions(-) diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 58ae6b2058f9b..3c2888cee58c7 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -113,23 +113,23 @@ void AddmmInferMeta(const MetaTensor& input, "if you put exe.run(startup_program) " "after optimizer.minimize function.")); // dim check - PADDLE_ENFORCE_EQ( - ndim_input, - 2, - errors::InvalidArgument("The input tensor input's dimension must be 2. " - "But received input's dimension = [%s].", - ndim_input)); + PADDLE_ENFORCE_EQ(ndim_input == 2 || ndim_input == 1, + true, + errors::InvalidArgument( + "The input tensor input's dimension must be 2 or 1. " + "But received input's dimension = [%d].", + ndim_input)); PADDLE_ENFORCE_EQ( ndim_x, 2, errors::InvalidArgument("The input tensor x's dimension must be 2. " - "But received x's dimension = [%s].", + "But received x's dimension = [%d].", ndim_x)); PADDLE_ENFORCE_EQ( ndim_y, 2, errors::InvalidArgument("The input tensor y's dimension must be 2. 
" - "But received y's dimension = [%s].", + "But received y's dimension = [%d].", ndim_y)); std::vector output_dims; diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h index d5efd22a31daa..9956f07bf0b98 100644 --- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h @@ -44,6 +44,10 @@ void AddmmGradKernel(const Context& dev_ctx, DenseTensor* x_grad, DenseTensor* y_grad) { auto in_dims = input.dims(); + if (input.dims().size() == 1) { + in_dims = {1, input.dims()[0]}; + input_grad->Resize(in_dims); + } int total_elems = 0; VLOG(3) << "alpha: " << alpha << " beta: " << beta; @@ -85,6 +89,10 @@ void AddmmGradKernel(const Context& dev_ctx, } blas.SCAL(total_elems, beta, input_grad->data()); + + if (input.dims().size() == 1) { + input_grad->Resize(input.dims()); + } } if (x_grad) { dev_ctx.template Alloc(x_grad); diff --git a/paddle/phi/kernels/impl/addmm_kernel_impl.h b/paddle/phi/kernels/impl/addmm_kernel_impl.h index f7afdfd622e63..3286e31f68923 100644 --- a/paddle/phi/kernels/impl/addmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/addmm_kernel_impl.h @@ -44,6 +44,12 @@ void AddmmKernel(const Context& dev_ctx, auto x_dims = x.dims(); auto y_dims = y.dims(); + DenseTensor input_2d(input); + if (input.dims().size() == 1) { + input_dims = {1, input.dims()[0]}; + input_2d.Resize(input_dims); + } + // broadcast mode check if (x_dims[0] != input_dims[0]) { PADDLE_ENFORCE_EQ(input_dims[0], @@ -97,7 +103,8 @@ void AddmmKernel(const Context& dev_ctx, bcast_dims[1] = y_dims[1] / input_dims[1]; VLOG(3) << "bcast_dims=[" << bcast_dims[0] << "," << bcast_dims[1] << "]"; // broadcast using eigen - auto eigen_input = PhiEigenTensor::From(input); + const DenseTensor& const_ref_input = input_2d; + auto eigen_input = PhiEigenTensor::From(const_ref_input); auto eigen_out = PhiEigenTensor::From(*out); auto& place = *dev_ctx.eigen_device(); funcs::EigenBroadcast, T, 2>::Eval( diff --git a/python/paddle/fluid/tests/unittests/test_addmm_op.py b/python/paddle/fluid/tests/unittests/test_addmm_op.py index dcf07f4953200..bea7588acd3d0 100644 --- a/python/paddle/fluid/tests/unittests/test_addmm_op.py +++ b/python/paddle/fluid/tests/unittests/test_addmm_op.py @@ -221,7 +221,44 @@ def test_check_grad_input(self): self.check_grad(['Input'], 'Out', no_grad_set=None) -class TestAddMMOp4(unittest.TestCase): +class TestAddMMOp4(OpTest): + # test broadcast + def setUp(self): + self.op_type = "addmm" + self.dtype = np.float64 + self.init_dtype_type() + self.inputs = { + 'Input': np.random.random((100)).astype(self.dtype), + 'X': np.random.random((20, 10)).astype(self.dtype), + 'Y': np.random.random((10, 100)).astype(self.dtype), + } + self.attrs = { + 'Alpha': 0.5, + 'Beta': 2.0, + } + self.outputs = {'Out': self.attrs['Beta'] * self.inputs['Input'] + \ + self.attrs['Alpha'] * np.dot(self.inputs['X'], self.inputs['Y'])} + + def init_dtype_type(self): + pass + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['Input', 'X', 'Y'], 'Out') + + def test_check_grad_x(self): + self.check_grad(['X'], 'Out', no_grad_set=None) + + def test_check_grad_y(self): + self.check_grad(['Y'], 'Out', no_grad_set=None) + + def test_check_grad_input(self): + self.check_grad(['Input'], 'Out', no_grad_set=None) + + +class TestAddMMOp5(unittest.TestCase): def test_api_with_dygraph(self): np_input = np.random.random((20, 30)).astype(np.float32) np_x = np.random.random((20, 
6)).astype(np.float32) @@ -235,7 +272,6 @@ def test_api_with_dygraph(self): assert np.allclose(np_input + np.dot(np_x, np_y), out.numpy()) -''' class TestAddMMAPI(unittest.TestCase): def test_api_error(self): data_x = np.ones((2, 2)).astype(np.float32) @@ -249,9 +285,106 @@ def test_error1(): x = paddle.to_tensor(data_x_wrong) y = paddle.to_tensor(data_y) input = paddle.to_tensor(data_input) - out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 ) + out = paddle.tensor.addmm( + input=input, x=x, y=y, beta=0.5, alpha=5.0) + self.assertRaises(ValueError, test_error1) -''' + + def test_error2(): + data_x_wrong = np.ones((2)).astype(np.float32) + x = paddle.to_tensor(data_x_wrong) + y = paddle.to_tensor(data_y) + input = paddle.to_tensor(data_input) + out = paddle.tensor.addmm( + input=input, x=x, y=y, beta=0.5, alpha=5.0) + + self.assertRaises(ValueError, test_error2) + + def test_error3(): + data_input_wrong = np.ones((2, 2, 2)).astype(np.float32) + x = paddle.to_tensor(data_x) + y = paddle.to_tensor(data_y) + input = paddle.to_tensor(data_input_wrong) + out = paddle.tensor.addmm( + input=input, x=x, y=y, beta=0.5, alpha=5.0) + + self.assertRaises(ValueError, test_error3) + + def test_error4(): + data_input_wrong = np.ones((5)).astype(np.float32) + x = paddle.to_tensor(data_x) + y = paddle.to_tensor(data_y) + input = paddle.to_tensor(data_input_wrong) + out = paddle.tensor.addmm( + input=input, x=x, y=y, beta=0.5, alpha=5.0) + + self.assertRaises(ValueError, test_error4) + + paddle.enable_static() + + def test_api_normal_1(self): + data_x = np.ones((2, 2)).astype(np.float32) + data_y = np.ones((2, 2)).astype(np.float32) + data_input = np.ones((2, 2)).astype(np.float32) + data_alpha = 0.1 + data_beta = 1.0 + + paddle.disable_static() + + x = paddle.to_tensor(data_x) + y = paddle.to_tensor(data_y) + input = paddle.to_tensor(data_input) + paddle_output = paddle.tensor.addmm( + input=input, x=x, y=y, beta=data_beta, alpha=data_alpha) + numpy_output = data_beta * data_input + data_alpha * np.dot(data_x, + data_y) + + self.assertEqual(np.allclose(numpy_output, paddle_output.numpy()), True) + + paddle.enable_static() + + def test_api_normal_2(self): + data_x = np.ones((3, 10)).astype(np.float32) + data_y = np.ones((10, 3)).astype(np.float32) + data_input = np.ones((3)).astype(np.float32) + data_alpha = 0.1 + data_beta = 1.0 + + paddle.disable_static() + + x = paddle.to_tensor(data_x) + y = paddle.to_tensor(data_y) + input = paddle.to_tensor(data_input) + paddle_output = paddle.tensor.addmm( + input=input, x=x, y=y, beta=data_beta, alpha=data_alpha) + numpy_output = data_beta * data_input + data_alpha * np.dot(data_x, + data_y) + + self.assertEqual(np.allclose(numpy_output, paddle_output.numpy()), True) + + paddle.enable_static() + + def test_api_normal_3(self): + data_x = np.ones((3, 10)).astype(np.float32) + data_y = np.ones((10, 3)).astype(np.float32) + data_input = np.ones((1)).astype(np.float32) + data_alpha = 0.1 + data_beta = 1.0 + + paddle.disable_static() + + x = paddle.to_tensor(data_x) + y = paddle.to_tensor(data_y) + input = paddle.to_tensor(data_input) + paddle_output = paddle.tensor.addmm( + input=input, x=x, y=y, beta=data_beta, alpha=data_alpha) + numpy_output = data_beta * data_input + data_alpha * np.dot(data_x, + data_y) + + self.assertEqual(np.allclose(numpy_output, paddle_output.numpy()), True) + + paddle.enable_static() + if __name__ == "__main__": paddle.enable_static() diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 
0be79ece01ff9..2ef324395b26a 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1610,20 +1610,24 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): input_shape = input.shape x_shape = x.shape y_shape = y.shape - if not len(input_shape) == len(x_shape) == len(y_shape) == 2: - raise ValueError("The dimention of input, x, y should be 2 but receive input's shape: {}, x's shape: {}, y's shape: {}".format(input_shape, x_shape, y_shape)) - if input_shape[0] != x_shape[0]: - if input_shape[0] != 1: - raise ValueError( "When x's dimension[0] is not equal with input's dimension[0], input's dimension[0] must be 1 but got {}".format(input_shape[0])) - if input_shape[1] != y_shape[1] and input_shape[1] != 1: - raise ValueError( "When y's dimension[1] is not equal with input's dimension[1], input's dimension[1] must be 1 but got {}".format(input_shape[1])) - if input_shape[1] != y_shape[1]: - if input_shape[1] != 1: - raise ValueError( "When y's dimension[1] is not equal with input's dimension[1], input's dimension[1] must be 1 but got {}".format(input_shape[1])) - if input_shape[0] != x_shape[0] and input_shape[0] != 1: - raise ValueError( "When x's dimension[0] is not equal with input's dimension[0], input's dimension[0] must be 1 but got {}".format(input_shape[0])) + if not len(x_shape) == len(y_shape) == 2: + raise ValueError("The dimention of x, y should be 2 but receive x's shape: {}, y's shape: {}".format(x_shape, y_shape)) if x_shape[1] != y_shape[0]: raise ValueError("The input Variable x's width must be equal with Variable y' height. But received x's shape = {}, y's shape = {}.".format(x_shape, y_shape)) + if len(input_shape) == 2: + if input_shape[0] != x_shape[0]: + if input_shape[0] != 1: + raise ValueError( "When x's dimension[0] is not equal with input's dimension[0], input's dimension[0] must be 1 but got {}".format(input_shape[0])) + if input_shape[1] != y_shape[1] and input_shape[1] != 1: + raise ValueError( "When y's dimension[1] is not equal with input's dimension[1], input's dimension[1] must be 1 but got {}".format(input_shape[1])) + if input_shape[1] != y_shape[1]: + if input_shape[1] != 1: + raise ValueError( "When y's dimension[1] is not equal with input's dimension[1], input's dimension[1] must be 1 but got {}".format(input_shape[1])) + elif len(input_shape) == 1: + if input_shape[0] not in (y_shape[1], 1): + raise ValueError("The input's shape: {} is not broadcastable with [x.shape[0], y.shape[1]]: [{},{}]".format(input_shape, x_shape[0], y_shape[1])) + else: + raise ValueError("The dimention of input should be 2 or 1 but receive input's shape: {}".format(input_shape)) From cd3d0911038355bdba8a5533960cc99400ae16ee Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Mon, 30 May 2022 12:09:27 +0800 Subject: [PATCH 064/109] [Dy2St]Fix cond_block_grad error when handle no need grad vras (#43034) * Fix cond_block_grad error when handle no need grad vras * Add comment and UT --- .../controlflow/conditional_block_op.cc | 23 ++++++++---- .../dygraph_to_static/test_ifelse.py | 35 +++++++++++++++++++ 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index fd06e33a6bb6e..7ffbf1933be37 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -153,7 +153,7 @@ class ConditionalBlockGradOp : public 
ConditionalOp { /* keep_kid_scopes */ false); AssignLocalGradientToParentScope(dev_place, cur_scope, scope, - inside_grads, outside_grads); + inside_grads, outside_grads, inputs); return; } @@ -165,27 +165,36 @@ class ConditionalBlockGradOp : public ConditionalOp { const platform::Place &place, const framework::Scope &cur_scope, const framework::Scope &parent_scope, const std::vector &inside_grads, - const std::vector &outside_grads) const { + const std::vector &outside_grads, + const std::vector &inputs) const { + std::vector assign_zero_outside_grads; + std::vector assign_zero_inputs; for (size_t i = 0; i < outside_grads.size(); ++i) { const std::string &outside_grad_name = outside_grads[i]; const std::string &inside_grad_name = inside_grads[i]; VLOG(4) << "inside_grad_name = " << inside_grad_name << ", outside_grad_name = " << outside_grad_name; - framework::Variable *inside_var = - cur_scope.FindLocalVar(inside_grad_name); - if (inside_var == nullptr) { - continue; - } framework::Variable *outside_var = parent_scope.FindVar(outside_grad_name); if (outside_var == nullptr) { continue; } + framework::Variable *inside_var = + cur_scope.FindLocalVar(inside_grad_name); + if (inside_var == nullptr) { + assign_zero_outside_grads.emplace_back(outside_grad_name); + assign_zero_inputs.emplace_back(inputs[i]); + continue; + } platform::DeviceContext *dev_ctx = platform::DeviceContextPool::Instance().Get(place); framework::VisitVarType(*inside_var, AssignFunctor(outside_var, *dev_ctx)); } + // Assign zero to the grad_vars that are in outside_grads but not in + // inside_grads + AssignZeroToParentScope(place, parent_scope, assign_zero_inputs, + assign_zero_outside_grads); } void AssignZeroToParentScope( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py index 9a9e7ee243872..276aa68e895c6 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py @@ -424,6 +424,41 @@ def test_ast_to_func(self): ProgramTranslator().enable(False) +class IfElseNet(paddle.nn.Layer): + def __init__(self): + super(IfElseNet, self).__init__() + self.param = self.create_parameter( + shape=[3, 2], dtype='float32', is_bias=False) + + @paddle.jit.to_static + def forward(self, a, b, c): + a = paddle.matmul(a, self.param) + a = paddle.reshape(a, (2, 4)) + cond = paddle.to_tensor([10]) + if cond == 10: + a_argmax = a.argmax(axis=-1) + b = b + self.param + else: + print(c) + return b + + +class TestDy2StIfElseBackward(unittest.TestCase): + def test_run_backward(self): + a = paddle.randn((4, 3), dtype='float32') + a.stop_gradient = False + b = paddle.to_tensor([10]).astype('float32') + b.stop_gradient = False + c = paddle.to_tensor([2]) + c.stop_gradient = False + + net = IfElseNet() + net.train() + out = net(a, b, c) + out.backward() + self.assertTrue(np.allclose((b + net.param).numpy(), out.numpy())) + + if __name__ == '__main__': with paddle.fluid.framework._test_eager_guard(): unittest.main() From 3591a2528038b17b90390ea7bdb8c0e5eabee7d9 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Mon, 30 May 2022 14:16:01 +0800 Subject: [PATCH 065/109] cant just exit, because the new api has no doc in develop;test=document_fix (#43083) --- tools/sampcd_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index 
13005350d7bd5..1bd9f029d552c 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -342,7 +342,7 @@ def sampcd_extract_to_file(srccom, name, htype="def", hname=""): logger.error( "Error: No sample code found! Please check if the API comment contais string 'Examples:' correctly" ) - exit(1) + return [] sample_code_filenames = [] for y, cb in enumerate(codeblocks): From 806073d6b765a15cd14cab31521973c7cf8456d6 Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Mon, 30 May 2022 14:23:40 +0800 Subject: [PATCH 066/109] Optimize memcpy operation in Eigh (#42853) * 1st commit * fix usless change in header transpose_kernel_h file * add sync --- .../kernels/funcs/values_vectors_functor.h | 86 +++++++++++-------- 1 file changed, 50 insertions(+), 36 deletions(-) diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h index 336e9c809427c..a6a6d4097030b 100644 --- a/paddle/phi/kernels/funcs/values_vectors_functor.h +++ b/paddle/phi/kernels/funcs/values_vectors_functor.h @@ -27,10 +27,10 @@ namespace phi { namespace funcs { -inline int64_t GetBatchSize(phi::DDim dims) { +inline int64_t GetBatchSize(const phi::DDim &dims) { int64_t batch_size = 1; auto dim_size = dims.size(); - for (int i = 0; i < dim_size - 2; i++) { + for (int i = 0; i < dim_size - 2; ++i) { batch_size *= dims[i]; } return batch_size; @@ -54,6 +54,24 @@ static void CheckEighResult(const int batch, const int info) { info)); } +#ifdef PADDLE_WITH_CUDA +static void CheckEighResult(const GPUContext &dev_ctx, + const int64_t batch_size, + int *info) { + std::vector error_info(batch_size); + paddle::memory::Copy(phi::CPUPlace(), + error_info.data(), + dev_ctx.GetPlace(), + info, + sizeof(int) * batch_size, + dev_ctx.stream()); + dev_ctx.Wait(); + for (auto i = 0; i < batch_size; ++i) { + CheckEighResult(i, error_info[i]); + } +} +#endif + template struct MatrixEighFunctor { void operator()(const DeviceContext &dev_ctx, @@ -95,7 +113,8 @@ struct MatrixEighFunctor { char jobz = has_vectors ? 'V' : 'N'; int n = dims[dim_size - 1]; int64_t lda = std::max(1, n); - // if work = -1, it means that you need to use the lapack function to query + // if work = -1, it means that you need to use the lapack function to + // query // the optimal value int lwork = -1; // The length of the array work int lrwork = -1; // The dimension of the array rwork,rwork is REAL array @@ -188,97 +207,92 @@ struct MatrixEighFunctor { bool is_lower, bool has_vectors) { using ValueType = phi::dtype::Real; - ValueType *out_value = dev_ctx.template Alloc(eigen_values); - DenseTensor input_trans; - input_trans = phi::TransposeLast2Dim(dev_ctx, input); - T *input_vector = input_trans.data(); + int workspace_size = 0; auto &dims = input.dims(); int dim_size = dims.size(); int64_t batch_size = GetBatchSize(dims); + int last_dim = dims[dim_size - 1]; + int lda = std::max(1, last_dim); + auto vector_stride = dims[dim_size - 1] * dims[dim_size - 2]; + auto values_stride = dims[dim_size - 1]; cublasFillMode_t uplo = is_lower ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; cusolverEigMode_t jobz = has_vectors ? 
CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR; - int n = dims[dim_size - 1]; - int lda = std::max(1, n); - auto vector_stride = dims[dim_size - 1] * dims[dim_size - 2]; - auto values_stride = dims[dim_size - 1]; - int lwork = 0; + ValueType *out_value = dev_ctx.template Alloc(eigen_values); auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batch_size); auto *info_ptr = reinterpret_cast(info->ptr()); - // When the input type is float32, and the feature value input dimension - // is greater than or equal to [*,32,32] and less than or equal to - // [*,512,512], Syevj has better performance. + DenseTensor input_trans = phi::TransposeLast2Dim(dev_ctx, input); + T *input_vector = input_trans.data(); + + // Once input data type is float32, and the last dimension of + // input is located in range [32, 512], Syevj works better. bool use_syevj = (input.dtype() == phi::DataType::FLOAT32 && values_stride >= 32 && values_stride <= 512); + auto handle = dev_ctx.cusolver_dn_handle(); + syevjInfo_t syevj_params; if (use_syevj) { PADDLE_ENFORCE_GPU_SUCCESS( dynload::cusolverDnCreateSyevjInfo(&syevj_params)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize( dev_ctx.cusolver_dn_handle(), jobz, uplo, - n, + last_dim, reinterpret_cast(input_vector), lda, reinterpret_cast(out_value), - &lwork, + &workspace_size, syevj_params)); } else { EvdBuffer(dev_ctx.cusolver_dn_handle(), jobz, uplo, - n, + last_dim, input_vector, lda, out_value, - &lwork); + &workspace_size); } - auto work = paddle::memory::Alloc(dev_ctx, sizeof(T) * lwork); + auto work = paddle::memory::Alloc(dev_ctx, sizeof(T) * workspace_size); auto *work_ptr = reinterpret_cast(work->ptr()); - for (auto i = 0; i < batch_size; i++) { + + for (auto i = 0; i < batch_size; ++i) { auto *input_data = input_vector + i * vector_stride; auto *value_data = out_value + i * values_stride; - auto handle = dev_ctx.cusolver_dn_handle(); if (use_syevj) { PADDLE_ENFORCE_GPU_SUCCESS( dynload::cusolverDnSsyevj(handle, jobz, uplo, - n, + last_dim, reinterpret_cast(input_data), lda, reinterpret_cast(value_data), reinterpret_cast(work_ptr), - lwork, - info_ptr, + workspace_size, + &info_ptr[i], syevj_params)); } else { Evd(handle, jobz, uplo, - n, + last_dim, input_data, lda, value_data, work_ptr, - lwork, - info_ptr); + workspace_size, + &info_ptr[i]); } - int error_info = 0; - paddle::memory::Copy(phi::CPUPlace(), - &error_info, - dev_ctx.GetPlace(), - info_ptr, - sizeof(int), - dev_ctx.stream()); - CheckEighResult(i, error_info); } + CheckEighResult(dev_ctx, batch_size, info_ptr); if (use_syevj) { PADDLE_ENFORCE_GPU_SUCCESS( From 4b9e9949e24b54d68d360f425707237cb428029e Mon Sep 17 00:00:00 2001 From: Qi Li Date: Mon, 30 May 2022 14:23:55 +0800 Subject: [PATCH 067/109] fix build error on Sunway, test=develop (#43071) --- cmake/cblas.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 43c2208182a55..92a526a2b58a7 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -52,6 +52,7 @@ if(NOT DEFINED CBLAS_PROVIDER) set(OPENBLAS_INCLUDE_SEARCH_PATHS ${OPENBLAS_ROOT}/include /usr/include + /usr/include/lapacke /usr/include/openblas /usr/local/opt/openblas/include) set(OPENBLAS_LIB_SEARCH_PATHS @@ -75,7 +76,7 @@ if(NOT DEFINED CBLAS_PROVIDER) string(REGEX MATCH "OpenBLAS ([0-9]+\.[0-9]+\.[0-9]+)" tmp ${config_file}) string(REGEX MATCH "([0-9]+\.[0-9]+\.[0-9]+)" ver ${tmp}) - if (${ver} VERSION_GREATER_EQUAL "0.3.7") + if (${ver} VERSION_GREATER_EQUAL "0.3.5") set(CBLAS_PROVIDER 
OPENBLAS) set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR}) set(CBLAS_LIBRARIES ${OPENBLAS_LIB}) From 2d6dd55f8148ceb8c136b0a8d18d4f50713667e1 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Mon, 30 May 2022 15:39:54 +0800 Subject: [PATCH 068/109] Update Coverage docker (#43078) --- tools/dockerfile/ci_dockerfile.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/dockerfile/ci_dockerfile.sh b/tools/dockerfile/ci_dockerfile.sh index ed13ca8762500..485bfd7968f05 100644 --- a/tools/dockerfile/ci_dockerfile.sh +++ b/tools/dockerfile/ci_dockerfile.sh @@ -20,7 +20,7 @@ function make_ubuntu_dockerfile(){ sed -i "s#liblzma-dev#liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev#g" ${dockerfile_name} dockerfile_line=$(wc -l ${dockerfile_name}|awk '{print $1}') sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz \&\& \ - tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} + tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} sed -i "${dockerfile_line}i RUN apt remove git -y \&\& apt install -y libcurl4-openssl-dev gettext zstd \&\& wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz \&\& \ tar -xvf git-2.17.1.tar.gz \&\& \ cd git-2.17.1 \&\& \ @@ -38,7 +38,7 @@ function make_ubuntu_dockerfile(){ ENV PATH=/usr/local/gcc-8.2/bin:\$PATH #g" ${dockerfile_name} sed -i "s#bash /build_scripts/install_nccl2.sh#wget -q --no-proxy https://nccl2-deb.cdn.bcebos.com/nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb \\ RUN dpkg -i nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb \\ - RUN apt update \&\& apt remove -y libnccl* --allow-change-held-packages \&\& apt-get install -y libnccl2=2.7.8-1+cuda10.1 libnccl-dev=2.7.8-1+cuda10.1 pigz --allow-change-held-packages #g" ${dockerfile_name} + RUN apt remove -y libnccl* --allow-change-held-packages \&\& apt-get install -y libnccl2=2.7.8-1+cuda10.1 libnccl-dev=2.7.8-1+cuda10.1 pigz --allow-change-held-packages #g" ${dockerfile_name} } From 586f9429bb3a9086a0f66279c5883b27fb31f293 Mon Sep 17 00:00:00 2001 From: cambriconhsq <106155938+cambriconhsq@users.noreply.github.com> Date: Mon, 30 May 2022 15:48:46 +0800 Subject: [PATCH 069/109] [MLU]add mlu kernel for log_softmax op (#43040) --- paddle/fluid/operators/softmax_op_mlu.cc | 25 ++- .../unittests/mlu/test_log_softmax_op_mlu.py | 163 ++++++++++++++++++ 2 files changed, 180 insertions(+), 8 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_log_softmax_op_mlu.py diff --git a/paddle/fluid/operators/softmax_op_mlu.cc b/paddle/fluid/operators/softmax_op_mlu.cc index 9cb698e94fc56..9b97e779f29ef 100644 --- a/paddle/fluid/operators/softmax_op_mlu.cc +++ b/paddle/fluid/operators/softmax_op_mlu.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class SoftmaxMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -45,7 +45,7 @@ class SoftmaxMLUKernel : public framework::OpKernel { regard_in_shape = {d1, d2, d3}; } - static const cnnlSoftmaxAlgorithm_t algo = CNNL_SOFTMAX_ACCURATE; + static const cnnlSoftmaxAlgorithm_t algo = softmax_algo; MLUCnnlTensorDesc in_desc(cnnl_softmax_dims, regard_in_shape.data(), ToCnnlDataType()); MLUCnnl::SoftmaxForward(ctx, algo, mode, NULL, in_desc.get(), @@ -54,7 +54,7 @@ class SoftmaxMLUKernel : public framework::OpKernel { } }; -template +template class SoftmaxGradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -82,7 +82,7 @@ class SoftmaxGradMLUKernel : public framework::OpKernel { regard_out_shape = {d1, d2, d3}; } - static const cnnlSoftmaxAlgorithm_t algo = CNNL_SOFTMAX_ACCURATE; + static const cnnlSoftmaxAlgorithm_t algo = softmax_algo; MLUCnnlTensorDesc out_desc(cnnl_softmax_dims, regard_out_shape.data(), ToCnnlDataType()); MLUCnnl::SoftmaxBackward(ctx, algo, mode, out_desc.get(), GetBasePtr(out), @@ -97,7 +97,16 @@ class SoftmaxGradMLUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_MLU_KERNEL(softmax, ops::SoftmaxMLUKernel, - ops::SoftmaxMLUKernel); -REGISTER_OP_MLU_KERNEL(softmax_grad, ops::SoftmaxGradMLUKernel, - ops::SoftmaxGradMLUKernel); +REGISTER_OP_MLU_KERNEL( + softmax, ops::SoftmaxMLUKernel, + ops::SoftmaxMLUKernel); +REGISTER_OP_MLU_KERNEL(softmax_grad, + ops::SoftmaxGradMLUKernel, + ops::SoftmaxGradMLUKernel); +REGISTER_OP_MLU_KERNEL( + log_softmax, ops::SoftmaxMLUKernel, + ops::SoftmaxMLUKernel); +REGISTER_OP_MLU_KERNEL( + log_softmax_grad, ops::SoftmaxGradMLUKernel, + ops::SoftmaxGradMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_log_softmax_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_log_softmax_op_mlu.py new file mode 100644 index 0000000000000..dea6391b8bae0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_log_softmax_op_mlu.py @@ -0,0 +1,163 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 +import paddle +import paddle.fluid.core as core +import paddle.nn.functional as F + +np.random.seed(10) +paddle.enable_static() + + +def ref_log_softmax(x): + shiftx = (x - np.max(x)) + out = shiftx - np.log(np.exp(shiftx).sum()) + return out + + +def ref_log_softmax_grad(x, axis): + if axis < 0: + axis += len(x.shape) + out = np.apply_along_axis(ref_log_softmax, axis, x) + axis_dim = x.shape[axis] + dout = np.full_like(x, fill_value=1. 
/ x.size) + dx = dout - np.exp(out) * dout.copy().sum(axis=axis, keepdims=True).repeat( + axis_dim, axis=axis) + return dx + + +class TestLogSoftmaxOp(OpTest): + def setUp(self): + self.op_type = 'log_softmax' + self.set_mlu() + self.python_api = F.log_softmax + self.dtype = 'float32' + self.shape = [2, 3, 4, 5] + self.axis = -1 + self.set_attrs() + + x = np.random.uniform(0.1, 1., self.shape).astype(self.dtype) + out = np.apply_along_axis(ref_log_softmax, self.axis, x) + self.x_grad = ref_log_softmax_grad(x, self.axis) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'axis': self.axis} + + def set_attrs(self): + pass + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], ['Out'], user_defined_grads=[self.x_grad]) + + +class TestLogSoftmaxShape(TestLogSoftmaxOp): + def set_attrs(self): + self.shape = [12, 10] + + +class TestLogSoftmaxAxis(TestLogSoftmaxOp): + def set_attrs(self): + self.axis = 1 + + +class TestNNLogSoftmaxAPI(unittest.TestCase): + def setUp(self): + self.set_mlu() + self.x_shape = [2, 3, 4, 5] + self.x = np.random.uniform(-1., 1., self.x_shape).astype(np.float32) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def check_api(self, axis=-1): + ref_out = np.apply_along_axis(ref_log_softmax, axis, self.x) + + logsoftmax = paddle.nn.LogSoftmax(axis) + # test static api + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data(name='x', shape=self.x_shape) + y = logsoftmax(x) + exe = paddle.static.Executor(self.place) + out = exe.run(feed={'x': self.x}, fetch_list=[y]) + self.assertTrue(np.allclose(out[0], ref_out)) + + # test dygrapg api + paddle.disable_static() + x = paddle.to_tensor(self.x) + y = logsoftmax(x) + self.assertTrue(np.allclose(y.numpy(), ref_out)) + paddle.enable_static() + + def test_check_api(self): + for axis in [-1, 1]: + self.check_api(axis) + + +class TestNNFunctionalLogSoftmaxAPI(unittest.TestCase): + def setUp(self): + self.set_mlu() + self.x_shape = [2, 3, 4, 5] + self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def check_api(self, axis=-1, dtype=None): + x = self.x.copy() + if dtype is not None: + x = x.astype(dtype) + ref_out = np.apply_along_axis(ref_log_softmax, axis, x) + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data(name='x', shape=self.x_shape) + y = F.log_softmax(x, axis, dtype) + exe = paddle.static.Executor(self.place) + out = exe.run(feed={'x': self.x}, fetch_list=[y]) + self.assertTrue(np.allclose(out[0], ref_out)) + + paddle.disable_static() + x = paddle.to_tensor(self.x) + y = F.log_softmax(x, axis, dtype) + self.assertTrue(np.allclose(y.numpy(), ref_out), True) + paddle.enable_static() + + def test_check_api(self): + for axis in [-1, 1]: + self.check_api(axis) + self.check_api(-1, 'float32') + + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data(name='X1', shape=[100], dtype='int32') + self.assertRaises(TypeError, F.log_softmax, x) + + x = paddle.fluid.data(name='X2', shape=[100], dtype='float32') + self.assertRaises(TypeError, F.log_softmax, x, dtype='int32') + + +if __name__ == "__main__": + unittest.main() From 
8cc40f4702c5cf0e8c88b13e17d8461938f7298a Mon Sep 17 00:00:00 2001 From: levi131 <83750468+levi131@users.noreply.github.com> Date: Mon, 30 May 2022 15:59:08 +0800 Subject: [PATCH 070/109] enhance check for current block and docstring for prim2orig interface (#43063) * enhance check for current block docstring for prim2orig interface * refine if else syntax --- python/paddle/incubate/autograd/primx.py | 34 +++++++++++++++--------- python/paddle/optimizer/optimizer.py | 2 ++ 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/python/paddle/incubate/autograd/primx.py b/python/paddle/incubate/autograd/primx.py index 7a969748208a4..1f5c4f9a5cebb 100644 --- a/python/paddle/incubate/autograd/primx.py +++ b/python/paddle/incubate/autograd/primx.py @@ -38,8 +38,7 @@ def topo_path(xs, ys, block=None): path, the unused variables in `xs`, and the unreached variables in `ys` """ - if block is None: - block = default_main_program().current_block() + block = default_main_program().current_block() if block is None else block path = [] backpath = [] @@ -160,11 +159,14 @@ def contain_value(self, value_var): return id(value_var) in self.tab.values() +# TODO(lml): supporting control flow, nested blocks, and block other than current block of main program. class Transform(object): """ An object that maintains the state of transformations applied to a primitve program. """ def __init__(self, block): + assert block == default_main_program().current_block( + ), f'only support transform on current block of main program.' self.block = block self.vars = self.init_vars(block) self.var2dot = VarMap('var2dot', self.vars) @@ -400,6 +402,7 @@ def transpose(self, ys_dot, xs_dot, ys_bar=None, retain_fwd=False): return ys_bar, xs_bar +# TODO(lml): supporting control flow, nested blocks, and block other than current block of main program. def _lower(block, reverse): # Some functions which are only used in _lower. def bind(args, to_bind, value_table): @@ -430,10 +433,6 @@ def expand_nested_list(xs): # Step1: Do some preparatory work for lower lower_fn = _prim2orig if reverse else _orig2prim lookup_fn = lookup_prim2orig if reverse else lookup_orig2prim - if block is None: - program = default_main_program() - assert program.num_blocks == 1, "The lower transform is designed to process only one block." - block = program.current_block() value_table = {} to_bind = {} @@ -516,6 +515,7 @@ def orig2prim(block=None): """ .. note:: **This API is ONLY available in the static mode.** + **Args block must be None or current block of main program.** All operators in the target block are processed as follows. If it is an original operator, it will be transformed into @@ -523,13 +523,14 @@ def orig2prim(block=None): equivalent function. Args: - block(paddle.fluid.framework.Variable|None, optional): The + block(paddle.static.Block|None, optional): The target block to process on. Default None, and will process on the current block of main program. - - Returns: - None """ + + block = default_main_program().current_block() if block is None else block + assert block == default_main_program().current_block( + ), f'block is neither None nor current block of main program' _lower(block, reverse=False) @@ -538,6 +539,7 @@ def prim2orig(block=None): """ .. note:: **ONLY available in the static mode.** + **Args block must be None or current block of main program.** All operators in the target block are processed as follows. 
If it is an automatic differential basic operator, it will be @@ -545,10 +547,10 @@ def prim2orig(block=None): equivalent function to support execution. Args: - block(paddle.static.Variable|None, optional): The + block(paddle.static.Block|None, optional): The target block to process on. Default None, and will process on the current block of main program. - + Examples: .. code-block:: python @@ -566,6 +568,10 @@ def prim2orig(block=None): if prim_enabled(): prim2orig() """ + + block = default_main_program().current_block() if block is None else block + assert block == default_main_program().current_block( + ), f'block is neither None nor current block of main program' _lower(block, reverse=True) @@ -583,7 +589,9 @@ def _gradients(ys, xs, ys_bar=None): """ ys, xs = to_tensors(ys), to_tensors(xs) - block = ys[0].block + block = default_main_program().current_block() + for el in xs + ys: + assert el is None or el.block == block, f'variable in xs and ys should be None or in current block of main program' # TODO(Tongxin) without any prior knowledge about whether the program # is completely lowered to primitive ops, it's mandatory to run the lowering # pass once and again. This is obviously inefficient and needs to be diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 9dfec3947e95f..cf180fccc4857 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -58,6 +58,8 @@ def append_backward_new(loss_list, program = default_main_program() assert program.num_blocks == 1, "The append_backward_new interface is designed to process only one block." block = program.current_block() + for el in loss_list: + assert el.block == block, f'variable in loss_list should be in current block of main program' orig2prim(block) ad = Transform(block) From 5df922621017f1983d11e76808b8e962d6f1b96d Mon Sep 17 00:00:00 2001 From: huzhiqiang <912790387@qq.com> Date: Mon, 30 May 2022 16:02:46 +0800 Subject: [PATCH 071/109] [Framework]accelerate inference period (#42400) --- paddle/fluid/framework/operator.cc | 42 ++++++++++++++++++++++++++---- paddle/fluid/framework/operator.h | 4 +++ 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index afd1bf338c45e..7dc885f54ab6c 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1116,6 +1116,21 @@ class RuntimeInferShapeContext : public InferShapeContext { const RuntimeContext& ctx_; }; +struct OperatorWithKernel::CacheImpl { + explicit CacheImpl(phi::KernelContext* kernel_ctx, + RuntimeInferShapeContext* infer_shape_ctx) + : kernel_ctx_(kernel_ctx), infer_shape_ctx_(infer_shape_ctx) {} + + phi::KernelContext* getKernelContext() { return kernel_ctx_.get(); } + RuntimeInferShapeContext* getRuntimeInferShapeContext() { + return infer_shape_ctx_.get(); + } + + private: + std::unique_ptr kernel_ctx_; + std::unique_ptr infer_shape_ctx_; +}; + static void CheckTensorNANOrInf(const std::string& op_type, const std::string& name, const framework::Tensor& tensor) { @@ -1244,6 +1259,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope, RuntimeContext ctx(Inputs(), Outputs(), scope); RunImpl(scope, place, &ctx); pre_scope_ = cur_scope; + } else if (run_phi_kernel_ && impl_ != nullptr && !need_prepare_data_ && + !need_prepare_phi_data_) { + if (!all_kernels_must_compute_runtime_shape_) + this->Info().infer_shape_(impl_->getRuntimeInferShapeContext()); + 
(*pt_kernel_)(impl_->getKernelContext()); } else { if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) { std::lock_guard lock(cache_update_mutex_); @@ -1508,12 +1528,22 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); if (run_phi_kernel_) { - phi::KernelContext pt_kernel_context; - // Do data transform before building KernelContext - // TODO(zhiqiu): support TransferInplaceVarsBack PreparePhiData(exec_scope, *pt_kernel_, *kernel_signature_, runtime_ctx); - BuildPhiKernelContext(*runtime_ctx, dev_ctx, &pt_kernel_context); - (*pt_kernel_)(&pt_kernel_context); + if (enable_cache_runtime_context_ && !need_prepare_phi_data_ && + !need_prepare_data_) { + impl_ = + new CacheImpl(new phi::KernelContext(), + new RuntimeInferShapeContext(*this, *runtime_ctx)); + BuildPhiKernelContext(*runtime_ctx, dev_ctx, impl_->getKernelContext()); + + (*pt_kernel_)(impl_->getKernelContext()); + } else { + phi::KernelContext pt_kernel_context; + // Do data transform before building KernelContext + // TODO(zhiqiu): support TransferInplaceVarsBack + BuildPhiKernelContext(*runtime_ctx, dev_ctx, &pt_kernel_context); + (*pt_kernel_)(&pt_kernel_context); + } } else { (*kernel_func_)( ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx)); @@ -2323,6 +2353,8 @@ Scope* OperatorWithKernel::PreparePhiData( Tensor out; framework::TensorCopySync(*tensor_in, expected_place, &out); SetTensorToVariable(*var, out, trans_var); + + need_prepare_phi_data_ = true; } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 2e00e07535b1d..2efa2e4bd8a75 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -698,6 +698,7 @@ class OperatorWithKernel : public OperatorBase { mutable std::unique_ptr runtime_ctx_; mutable const Scope* pre_scope_ = nullptr; mutable bool need_prepare_data_ = true; + mutable bool need_prepare_phi_data_ = false; mutable bool enable_cache_runtime_context_ = false; mutable bool all_kernels_must_compute_runtime_shape_ = false; mutable std::mutex cache_update_mutex_; @@ -710,6 +711,9 @@ class OperatorWithKernel : public OperatorBase { mutable std::unique_ptr kernel_signature_; mutable std::unique_ptr pt_kernel_; mutable std::unique_ptr arg_map_fn_; + + struct CacheImpl; + mutable CacheImpl* impl_{nullptr}; }; extern bool OpSupportGPU(const std::string& op_type); From f87fa3c0e5d0ebf89b336cf16c4d1eb0b8767b25 Mon Sep 17 00:00:00 2001 From: thunder95 <290844930@qq.com> Date: Mon, 30 May 2022 16:38:45 +0800 Subject: [PATCH 072/109] =?UTF-8?q?=E3=80=90PaddlePaddle=20Hackathon=202?= =?UTF-8?q?=E3=80=9115=20=E6=96=B0=E5=A2=9E=20API=20Nanmedian=20(#42385)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * nanmedian op * fix bug in cuda kernel * fix count_if incompatibility on other hardware platforms * fix incompatibility on some cpu hardware * fix incompatibility on some cpu hardware * fix isnan check * handle old numpy versions that do not support all-nan input * handle old numpy versions that do not support all-nan input * fix code example * fix api comment error * revise backward logic and c++ handling logic * address review suggestions * typo pre_dim * update en docs, test=document_fix * remove numpy in en doc, test=document_fix * add r,test=document_fix * add api to __all__ * follow advice from chenwhql --- paddle/fluid/operators/nanmedian_op.cc | 125 ++++++++ paddle/phi/infermeta/backward.cc | 11 + paddle/phi/infermeta/backward.h | 7 + paddle/phi/infermeta/unary.cc | 59 ++++ paddle/phi/infermeta/unary.h | 7 + .../phi/kernels/cpu/nanmedian_grad_kernel.cc | 99 ++++++ paddle/phi/kernels/cpu/nanmedian_kernel.cc | 208 +++++++++++++
.../phi/kernels/gpu/nanmedian_grad_kernel.cu | 122 ++++++++ paddle/phi/kernels/gpu/nanmedian_kernel.cu | 287 ++++++++++++++++++ paddle/phi/kernels/nanmedian_grad_kernel.h | 73 +++++ paddle/phi/kernels/nanmedian_kernel.h | 75 +++++ paddle/phi/ops/compat/nanmedian_sig.cc | 35 +++ python/paddle/__init__.py | 2 + .../fluid/tests/unittests/test_nanmedian.py | 196 ++++++++++++ python/paddle/tensor/__init__.py | 2 + python/paddle/tensor/stat.py | 97 ++++++ tools/parallel_UT_rule.py | 2 +- 17 files changed, 1406 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/nanmedian_op.cc create mode 100644 paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/nanmedian_kernel.cc create mode 100644 paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/nanmedian_kernel.cu create mode 100644 paddle/phi/kernels/nanmedian_grad_kernel.h create mode 100644 paddle/phi/kernels/nanmedian_kernel.h create mode 100644 paddle/phi/ops/compat/nanmedian_sig.cc create mode 100644 python/paddle/fluid/tests/unittests/test_nanmedian.py diff --git a/paddle/fluid/operators/nanmedian_op.cc b/paddle/fluid/operators/nanmedian_op.cc new file mode 100644 index 0000000000000..23a497bdb1d3d --- /dev/null +++ b/paddle/fluid/operators/nanmedian_op.cc @@ -0,0 +1,125 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" + +namespace paddle { +namespace operators { + +class NanmedianOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class NanmedianOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor), " + "the input feature data of NanmedianOp, dtype should be " + "int32, int64, float16, float32 or float64."); + AddOutput( + "MedianIndex", + "Store the index position of median values. The calculation differs " + "depending on whether the number of valid elements is odd or even. " + "Along the axis, two elements contribute to the median value in " + "each row. " + "If the number of valid elements is even, both are the same.") + .AsIntermediate() + .AsExtra(); + AddOutput("Out", + "(Tensor)," + " the output of NanmedianOp, whose dtype is the same as X"); + AddAttr<bool>("keepdim", + "(bool, default true) " + "If true, retain the reduced axis with length 1.") + .SetDefault(true); + AddAttr<std::vector<int>>("axis", + "(std::vector<int>).
List of integers," + " indicating the dimensions to calculate medians") + .SetDefault({}); + AddComment(R"DOC( + Nanmedian operator + + This operator is considered as an extension of the median operation, + which specifically supports the case of NaN values in the input. + + If all the elements in the input are NaN, it will also return NaN. + If no elements in the input are NaN, this op is identical to the median op. + + If the valid count of elements is an even number, the average value of + the elements in the middle is calculated as the median. + + This operator also supports multiple axes. + )DOC"); + } +}; + +template <typename T> +class NanmedianGradMaker : public framework::SingleGradOpMaker<T> { + public: + using framework::SingleGradOpMaker<T>::SingleGradOpMaker; + + void Apply(GradOpPtr<T> op) const override { + op->SetType("nanmedian_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("MedianIndex", this->Output("MedianIndex")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetAttrMap(this->Attrs()); + } +}; + +class NanmedianGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(nanmedian, NanmedianInferShapeFunctor, + PD_INFER_META(phi::NanmedianInferMeta)); + +REGISTER_OPERATOR(nanmedian, ops::NanmedianOp, ops::NanmedianOpMaker, + ops::NanmedianGradMaker<paddle::framework::OpDesc>, + ops::NanmedianGradMaker<paddle::imperative::OpBase>, + NanmedianInferShapeFunctor); + +DECLARE_INFER_SHAPE_FUNCTOR(nanmedian_grad, NanmedianGradInferShapeFunctor, + PD_INFER_META(phi::NanmedianGradInferMeta)); + +REGISTER_OPERATOR(nanmedian_grad, ops::NanmedianGradOp, + NanmedianGradInferShapeFunctor); diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 78f8ff9e00ce5..521eb03fd770f 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -433,6 +433,17 @@ void MultiplexGradInferMeta(const MetaTensor& ids, } } +void NanmedianGradInferMeta(const MetaTensor& x, + const MetaTensor& median_index, + const MetaTensor& out_grad, + const IntArray& axes, + bool keep_dim, + MetaTensor* x_grad) { + auto x_dims = x.dims(); + x_grad->set_dims(x_dims); + x_grad->set_dtype(x.dtype()); +} + void NllLossGradInferMeta(const MetaTensor& x, const MetaTensor& label, const MetaTensor& weight, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index b52734eb5b10c..93e2d4c43bc3f 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -191,6 +191,13 @@ void MultiplexGradInferMeta(const MetaTensor& ids, const MetaTensor& out_grad, std::vector<MetaTensor*> ins_grad); +void NanmedianGradInferMeta(const MetaTensor& x, + const MetaTensor& median_index, + const MetaTensor& out_grad, + const IntArray& axes, + bool keep_dim, + MetaTensor* x_grad); + void NllLossGradInferMeta(const MetaTensor& input, const MetaTensor& label, const MetaTensor& weight, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 1ec804d1bf822..f736bf50162d8 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1246,6 +1246,65 @@ void
MultinomialInferMeta(const MetaTensor& x, out->set_dtype(DataType::INT64); } +void NanmedianInferMeta(const MetaTensor& x, + const IntArray& axes, + bool keep_dim, + MetaTensor* out, + MetaTensor* median_index) { + std::vector axis_list = axes.GetData(); + auto x_dim = x.dims(); + int64_t x_rank = x_dim.size(); + out->set_dtype(x.dtype()); + median_index->set_dtype(DataType::INT64); + median_index->set_dims(make_ddim({x.numel() * 2})); + + std::vector out_dim; + if (axis_list.empty()) { + if (keep_dim) { + for (int64_t i = 0; i < x_rank; i++) { + out_dim.push_back(1); + } + } else { + out_dim.push_back(1); + } + } else { + std::vector cleaned_axis; + for (auto& axis : axis_list) { + if (axis < 0) axis += x_rank; + + PADDLE_ENFORCE_LT( + axis, + x_rank, + errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], R is " + "the rank of Input(X). But received axis: %d, R: %d. " + "Current Input(X)'s shape is=[%s].", + axis, + x_rank, + x_dim)); + + PADDLE_ENFORCE_EQ( + std::find(cleaned_axis.begin(), cleaned_axis.end(), axis), + cleaned_axis.end(), + errors::InvalidArgument("Attr(axes) has duplicated elements: %d.", + static_cast(axis))); + + cleaned_axis.push_back(axis); + } + + for (int64_t i = 0; i < x_rank; i++) { + if (std::find(cleaned_axis.begin(), cleaned_axis.end(), i) == + cleaned_axis.end()) { + out_dim.push_back(x_dim[i]); + } else if (keep_dim) { + out_dim.push_back(1); + } + } + } + + out->set_dims(make_ddim(out_dim)); +} + void NormInferMeta(const MetaTensor& x, int axis, float epsilon, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 25ea003f58fd9..c21ef0e2d1103 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -178,6 +178,13 @@ void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, MetaTensor* out); + +void NanmedianInferMeta(const MetaTensor& x, + const IntArray& axes, + bool keep_dim, + MetaTensor* out, + MetaTensor* median_index); + void NormInferMeta(const MetaTensor& x, int axis, float epsilon, diff --git a/paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc b/paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc new file mode 100644 index 0000000000000..156124c214895 --- /dev/null +++ b/paddle/phi/kernels/cpu/nanmedian_grad_kernel.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/nanmedian_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void CalcMedianGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& median_index, + const DenseTensor& out_grad, + const IntArray& axes, + DenseTensor* x_grad, + T* x_grad_ptr) { + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, x_grad, static_cast(0)); + if (!x_grad_ptr) return; + + const int64_t* m_ptr = median_index.data(); + const T* out_grad_ptr = out_grad.data(); + int64_t numel = x.numel(); + auto x_dim = x.dims(); + int64_t rank = x_dim.size(); + int64_t stride = x_dim[rank - 1]; + + int64_t pre_dim = numel / stride; + int64_t i = 0; + int64_t offset = 0; + T div_factor = static_cast(2.0); + for (i = 0; i < pre_dim; i++) { + if (m_ptr[2 * i] >= 0) { + if (m_ptr[2 * i] == m_ptr[2 * i + 1]) { + x_grad_ptr[offset + m_ptr[2 * i]] = out_grad_ptr[i]; + } else { + x_grad_ptr[offset + m_ptr[2 * i]] = out_grad_ptr[i] / div_factor; + x_grad_ptr[offset + m_ptr[2 * i + 1]] = out_grad_ptr[i] / div_factor; + } + } + offset += stride; + } +} + +template +void BaseMedianGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& median_index, + const DenseTensor& out_grad, + const IntArray& axes, + DenseTensor* x_grad) { + auto rank = x.dims().size(); + T* x_grad_ptr = dev_ctx.template Alloc(x_grad); + if (axes.size() && (rank > 1)) { + DenseTensor tmp_x_grad(*x_grad); + CalcMedianGradKernel( + dev_ctx, x, median_index, out_grad, axes, &tmp_x_grad, x_grad_ptr); + PostprocessMedianGradKernel(dev_ctx, &tmp_x_grad, axes, x_grad); + } else { + CalcMedianGradKernel( + dev_ctx, x, median_index, out_grad, axes, x_grad, x_grad_ptr); + } +} + +template +void NanmedianGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& median_index, + const DenseTensor& out_grad, + const IntArray& axes, + bool keep_dim, + DenseTensor* x_grad) { + BaseMedianGradKernel( + dev_ctx, input, median_index, out_grad, axes, x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(nanmedian_grad, + CPU, + ALL_LAYOUT, + phi::NanmedianGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/nanmedian_kernel.cc b/paddle/phi/kernels/cpu/nanmedian_kernel.cc new file mode 100644 index 0000000000000..ed38405c9179f --- /dev/null +++ b/paddle/phi/kernels/cpu/nanmedian_kernel.cc @@ -0,0 +1,208 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/nanmedian_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/top_k_kernel.h" + +namespace phi { + +template +void CalcMedianFunc(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& nan_counts, + bool ignore_nan, + int64_t sort_k, + int64_t stride, + int64_t pre_dim, + T* o_ptr, + int64_t* m_ptr) { + bool should_ignore_nan = ignore_nan; + DenseTensor sort_out; + DenseTensor sort_indices; + auto sort_dim = x.dims(); + int64_t rank = sort_dim.size(); + sort_dim[rank - 1] = sort_k; + sort_out.Resize(sort_dim); + sort_indices.Resize(sort_dim); + + dev_ctx.template Alloc(&sort_out); + T* sort_out_ptr = sort_out.data(); + dev_ctx.template Alloc(&sort_indices); + int64_t* sort_indices_ptr = sort_indices.data(); + + TopkKernel( + dev_ctx, x, Scalar(sort_k), -1, false, true, &sort_out, &sort_indices); + + T div_factor = static_cast(2.0); + int64_t offset = 0; + int64_t i = 0; + bool is_ori_odd = stride & 1; + if (should_ignore_nan) { + for (i = 0; i < pre_dim; i++) { + offset = i * sort_k; + if (nan_counts[i] == stride) { + m_ptr[i * 2] = -1; + m_ptr[i * 2 + 1] = -1; + o_ptr[i] = sort_out_ptr[offset]; + } else { + int64_t nan_k = nan_counts[i] > 0 + ? static_cast(stride - nan_counts[i]) + : sort_k; + int64_t row_pos = static_cast(nan_k >> 1); + int64_t pos = offset + row_pos; + if (nan_k & 1) { + m_ptr[2 * i] = sort_indices_ptr[pos]; + m_ptr[2 * i + 1] = sort_indices_ptr[pos]; + o_ptr[i] = sort_out_ptr[pos]; + } else { + m_ptr[2 * i] = + row_pos > 0 ? sort_indices_ptr[pos - 1] : sort_indices_ptr[pos]; + m_ptr[2 * i + 1] = sort_indices_ptr[pos]; + T m_val_left = + row_pos > 0 ? sort_out_ptr[pos - 1] : sort_out_ptr[pos]; + T m_val_right = sort_out_ptr[pos]; + o_ptr[i] = (m_val_left + m_val_right) / div_factor; + } + } + } + } else { + if (is_ori_odd) { + for (i = 0; i < pre_dim; i++) { + offset = i * sort_k; + int64_t pos = offset + sort_k - 1; + o_ptr[i] = sort_out_ptr[pos]; + m_ptr[2 * i] = sort_indices_ptr[pos]; + m_ptr[2 * i + 1] = sort_indices_ptr[pos]; + } + } else { + for (i = 0; i < pre_dim; i++) { + offset = i * sort_k; + int64_t pos = offset + sort_k - 1; + m_ptr[2 * i] = + sort_k > 1 ? sort_indices_ptr[pos - 1] : sort_indices_ptr[pos]; + m_ptr[2 * i + 1] = sort_indices_ptr[pos]; + T m_val_left = sort_k > 1 ? 
sort_out_ptr[pos - 1] : sort_out_ptr[pos]; + T m_val_right = sort_out_ptr[pos]; + o_ptr[i] = (m_val_left + m_val_right) / div_factor; + } + } + } +} + +template +void ProcessMedianKernel(const Context& dev_ctx, + const DenseTensor& x, + T* o_ptr, + int64_t* m_ptr, + bool ignore_nan) { + bool should_ignore_nan = ignore_nan; + const T* x_ptr = x.data(); + + int64_t numel = x.numel(); + auto x_dim = x.dims(); + int64_t x_rank = x_dim.size(); + int64_t stride = x_dim[x_rank - 1]; + int64_t pre_dim = numel / stride; + int64_t i = 0; + + int64_t max_valid_num = 0; + std::vector nan_counts; + if (should_ignore_nan) { + int64_t total_nan_num = 0; + std::vector col_vec; + col_vec.reserve(stride); + col_vec.resize(stride); + nan_counts.clear(); + nan_counts.reserve(pre_dim); + nan_counts.resize(pre_dim); + for (int64_t i = 0; i < pre_dim; i++) { + col_vec.clear(); + col_vec.insert( + col_vec.begin(), x_ptr + i * stride, x_ptr + (i + 1) * stride); + nan_counts[i] = + std::count_if(col_vec.begin(), col_vec.end(), [&](const T& val) { + return std::isnan(static_cast(val)); + }); + total_nan_num += nan_counts[i]; + if (stride - nan_counts[i] > max_valid_num) + max_valid_num = stride - nan_counts[i]; + } + // all elems are nan + if (total_nan_num == numel) { + for (i = 0; i < pre_dim; i++) { + o_ptr[i] = x_ptr[0]; + m_ptr[2 * i] = -1; + m_ptr[2 * i + 1] = -1; + } + return; + } + should_ignore_nan = total_nan_num > 0; + } + + int64_t sort_k = should_ignore_nan ? max_valid_num : ((stride >> 1) + 1); + CalcMedianFunc(dev_ctx, + x, + nan_counts, + should_ignore_nan, + sort_k, + stride, + pre_dim, + o_ptr, + m_ptr); +} + +template +void BaseMedianKernel(const Context& dev_ctx, + const DenseTensor& input, + const IntArray& axes, + DenseTensor* out, + DenseTensor* median_index, + bool ignore_nan) { + DenseTensor x; + auto rank = input.dims().size(); + if ((axes.size() == 0) || rank <= 1) { + x = input; + x.Resize({input.numel()}); + } else { + PreprocessMedianKernel(dev_ctx, input, axes, &x); + } + + T* o_ptr = dev_ctx.template Alloc(out); + int64_t* m_ptr = dev_ctx.template Alloc(median_index); + ProcessMedianKernel(dev_ctx, x, o_ptr, m_ptr, ignore_nan); + out->Resize(out->dims()); +} + +template +void NanmedianKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& axes, + bool keepdim, + DenseTensor* out, + DenseTensor* median_index) { + BaseMedianKernel(dev_ctx, x, axes, out, median_index, true); +} + +} // namespace phi + +PD_REGISTER_KERNEL(nanmedian, + CPU, + ALL_LAYOUT, + phi::NanmedianKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu b/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu new file mode 100644 index 0000000000000..a7cd49c0e53f3 --- /dev/null +++ b/paddle/phi/kernels/gpu/nanmedian_grad_kernel.cu @@ -0,0 +1,122 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/nanmedian_grad_kernel.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; +inline int GET_BLOCKS(const int N) { + return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; +} + +template +__global__ void KernelNanmedianGrad(const T* x_ptr, + const int64_t* medians_ptr, + const T* out_grad_ptr, + T* x_grad_ptr, + int64_t stride, + int64_t pre_dim, + T div_factor) { + CUDA_KERNEL_LOOP(index, pre_dim) { + int64_t offset = index * stride; + if (medians_ptr[2 * index] >= 0) { + if (medians_ptr[2 * index] == medians_ptr[2 * index + 1]) { + x_grad_ptr[offset + medians_ptr[2 * index]] = out_grad_ptr[index]; + } else { + x_grad_ptr[offset + medians_ptr[2 * index]] = + out_grad_ptr[index] / div_factor; + x_grad_ptr[offset + medians_ptr[2 * index + 1]] = + out_grad_ptr[index] / div_factor; + } + } + } +} + +template +void CalcMedianGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& median_index, + const DenseTensor& out_grad, + DenseTensor* x_grad, + T* x_grad_ptr) { + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, x_grad, static_cast(0)); + + auto stream = dev_ctx.stream(); + const T* x_ptr = x.data(); + const int64_t* m_ptr = median_index.data(); + const T* out_grad_ptr = out_grad.data(); + + int64_t numel = x.numel(); + auto x_dim = x.dims(); + int64_t x_rank = x_dim.size(); + int64_t stride = x_dim[x_rank - 1]; + int64_t pre_dim = numel / stride; + + T div_factor = static_cast(2.0); + KernelNanmedianGrad< + T><<>>( + x_ptr, m_ptr, out_grad_ptr, x_grad_ptr, stride, pre_dim, div_factor); +} + +template +void BaseMedianGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& median_index, + const DenseTensor& out_grad, + const IntArray& axes, + DenseTensor* x_grad) { + auto rank = x.dims().size(); + T* x_grad_ptr = dev_ctx.template Alloc(x_grad); + if (axes.size() && (rank > 1)) { + DenseTensor tmp_x_grad(*x_grad); + CalcMedianGradKernel( + dev_ctx, x, median_index, out_grad, &tmp_x_grad, x_grad_ptr); + PostprocessMedianGradKernel(dev_ctx, &tmp_x_grad, axes, x_grad); + } else { + CalcMedianGradKernel( + dev_ctx, x, median_index, out_grad, x_grad, x_grad_ptr); + } +} + +template +void NanmedianGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& median_index, + const DenseTensor& out_grad, + const IntArray& axes, + bool keep_dim, + DenseTensor* x_grad) { + BaseMedianGradKernel( + dev_ctx, input, median_index, out_grad, axes, x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(nanmedian_grad, + GPU, + ALL_LAYOUT, + phi::NanmedianGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/nanmedian_kernel.cu b/paddle/phi/kernels/gpu/nanmedian_kernel.cu new file mode 100644 index 0000000000000..5975e2748997e --- /dev/null +++ b/paddle/phi/kernels/gpu/nanmedian_kernel.cu @@ -0,0 +1,287 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/nanmedian_kernel.h" +#include "paddle/phi/kernels/top_k_kernel.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +inline int GET_BLOCKS(const int N) { + return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; +} + +template +__global__ void KernelNanCounts(const T* input, + const int numel, + const int64_t pre_dim, + const int64_t stride, + T min_val, + int64_t* nan_total, + int64_t* nan_counts) { + extern __shared__ int64_t buf[]; + for (int i = threadIdx.x; i < pre_dim; i += blockDim.x) { + buf[i] = 0; + nan_counts[i] = 0; + } + + if (threadIdx.x == 0) { + nan_total[0] = 0; + nan_total[1] = 0; + } + + __syncthreads(); + + CUDA_KERNEL_LOOP(index, numel) { + const T x = input[index]; + if (isnan(static_cast(x))) { + auto bin = static_cast(index / stride); + paddle::platform::CudaAtomicAdd(&buf[bin], 1); + } + } + __syncthreads(); + + for (int i = threadIdx.x; i < pre_dim; i += blockDim.x) { + paddle::platform::CudaAtomicAdd(&nan_counts[i], buf[i]); + paddle::platform::CudaAtomicAdd(&nan_total[0], buf[i]); + paddle::platform::CudaAtomicMax(&nan_total[1], stride - buf[i]); + } +} + +template +__global__ void CalcMedianKernel(const T* sort_out_ptr, + const int64_t* sort_indices_ptr, + int64_t* median_val, + T* output, + T div_factor, + const bool is_odd, + const int64_t pre_dim, + const int64_t stride) { + CUDA_KERNEL_LOOP(index, pre_dim) { + int64_t pos = static_cast((index + 1) * stride) - 1; + if (is_odd) { + median_val[index * 2] = sort_indices_ptr[pos]; + median_val[index * 2 + 1] = sort_indices_ptr[pos]; + output[index] = sort_out_ptr[pos]; + } else { + median_val[index * 2] = + pos > 0 ? sort_indices_ptr[pos - 1] : sort_indices_ptr[pos]; + median_val[index * 2 + 1] = sort_indices_ptr[pos]; + T median_val_left = pos > 0 ? sort_out_ptr[pos - 1] : sort_out_ptr[pos]; + T median_val_right = sort_out_ptr[pos]; + output[index] = (median_val_left + median_val_right) / div_factor; + } + } +} + +template +__global__ void CalcNanmedianKernel(const T* sort_out_ptr, + const int64_t* sort_indices_ptr, + int64_t* nan_counts, + int64_t* median_val, + T* output, + const bool is_odd, + const int64_t pre_dim, + const int64_t max_valid_num, + const int64_t stride, + const T div_factor, + const T nan_val) { + CUDA_KERNEL_LOOP(index, pre_dim) { + int64_t pos = static_cast(index * max_valid_num); + int64_t nan_cnt = nan_counts[index]; + if (nan_cnt == stride) { + median_val[index * 2] = -1; + median_val[index * 2 + 1] = -1; + output[index] = nan_val; + } else { + int64_t nan_k = + nan_cnt > 0 ? 
static_cast(stride - nan_cnt) : max_valid_num; + int64_t row_pos = static_cast(nan_k >> 1); + pos += row_pos; + + if (nan_k & 1) { + median_val[index * 2] = sort_indices_ptr[pos]; + median_val[index * 2 + 1] = sort_indices_ptr[pos]; + output[index] = sort_out_ptr[pos]; + } else { + median_val[index * 2] = + pos > 0 ? sort_indices_ptr[pos - 1] : sort_indices_ptr[pos]; + median_val[index * 2 + 1] = sort_indices_ptr[pos]; + T median_val_left = pos > 0 ? sort_out_ptr[pos - 1] : sort_out_ptr[pos]; + T median_val_right = sort_out_ptr[pos]; + output[index] = (median_val_left + median_val_right) / div_factor; + } + } + } +} + +template +void ProcessMedianKernel(const Context& dev_ctx, + const DenseTensor& x, + bool ignore_nan, + DenseTensor* out, + int64_t* m_ptr) { + bool should_ignore_nan = ignore_nan; + auto stream = dev_ctx.stream(); + + const T* x_ptr = x.data(); + T* o_ptr = dev_ctx.template Alloc(out); + int64_t numel = x.numel(); + auto x_dim = x.dims(); + int64_t x_rank = x_dim.size(); + int64_t stride = x_dim[x_rank - 1]; + int64_t pre_dim = numel / stride; + int64_t i = 0; + + DenseTensor nan_counts, nan_stat; + int64_t* nan_counts_ptr; + int64_t max_valid_num = 0; + if (should_ignore_nan) { + nan_counts.Resize(phi::make_ddim({pre_dim})); + dev_ctx.template Alloc(&nan_counts); + nan_counts_ptr = nan_counts.data(); + nan_stat.Resize(phi::make_ddim({2})); + int64_t* nan_stat_mem = dev_ctx.template Alloc(&nan_stat); + int64_t* nan_stat_ptr = nan_stat.data(); + + KernelNanCounts<<>>(x_ptr, + numel, + pre_dim, + stride, + std::numeric_limits::min(), + nan_stat_ptr, + nan_counts_ptr); + + auto nan_stat_mem_cpu = + paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t) * 2); + int64_t* nan_stat_cpu_ptr = + reinterpret_cast(nan_stat_mem_cpu->ptr()); + paddle::memory::Copy(phi::CPUPlace(), + nan_stat_cpu_ptr, + dev_ctx.GetPlace(), + nan_stat_mem, + sizeof(int64_t) * 2, + stream); + + // all elements are nan values + T nan_val = std::numeric_limits::quiet_NaN(); + if (nan_stat_cpu_ptr[0] == numel) { + FullLikeKernel(dev_ctx, x, nan_val, x.dtype(), out); + return; + } + + should_ignore_nan = nan_stat_cpu_ptr[0] > 0; + max_valid_num = nan_stat_cpu_ptr[1]; + } + + int64_t sort_k = should_ignore_nan ? 
max_valid_num : ((stride >> 1) + 1); + bool is_ori_odd = stride & 1; + + DenseTensor sort_out, sort_indices; + auto sort_dim = x.dims(); + int64_t rank = sort_dim.size(); + sort_dim[rank - 1] = sort_k; + sort_out.Resize(sort_dim); + sort_indices.Resize(sort_dim); + + dev_ctx.template Alloc(&sort_out); + T* sort_out_ptr = sort_out.data(); + dev_ctx.template Alloc(&sort_indices); + int64_t* sort_indices_ptr = sort_indices.data(); + + TopkKernel( + dev_ctx, x, Scalar(sort_k), -1, false, true, &sort_out, &sort_indices); + + T div_factor = static_cast(2.0); + T nan_val = std::numeric_limits::quiet_NaN(); + if (should_ignore_nan) { + CalcNanmedianKernel< + T><<>>( + sort_out_ptr, + sort_indices_ptr, + nan_counts_ptr, + m_ptr, + o_ptr, + is_ori_odd, + pre_dim, + max_valid_num, + stride, + div_factor, + nan_val); + } else { + CalcMedianKernel< + T><<>>( + sort_out_ptr, + sort_indices_ptr, + m_ptr, + o_ptr, + div_factor, + is_ori_odd, + pre_dim, + sort_k); + } +} + +template +void BaseMedianKernel(const Context& dev_ctx, + const DenseTensor& input, + const IntArray& axes, + bool ignore_nan, + DenseTensor* out, + DenseTensor* median_index) { + DenseTensor x; + auto rank = input.dims().size(); + if ((axes.size() == 0) || rank <= 1) { + x = input; + x.Resize({input.numel()}); + } else { + PreprocessMedianKernel(dev_ctx, input, axes, &x); + } + + int64_t* m_ptr = dev_ctx.template Alloc(median_index); + ProcessMedianKernel(dev_ctx, x, ignore_nan, out, m_ptr); + out->Resize(out->dims()); +} + +template +void NanmedianKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& axes, + bool keepdim, + DenseTensor* out, + DenseTensor* median_index) { + BaseMedianKernel(dev_ctx, x, axes, true, out, median_index); +} + +} // namespace phi + +PD_REGISTER_KERNEL(nanmedian, + GPU, + ALL_LAYOUT, + phi::NanmedianKernel, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/nanmedian_grad_kernel.h b/paddle/phi/kernels/nanmedian_grad_kernel.h new file mode 100644 index 0000000000000..dc7321c1aa751 --- /dev/null +++ b/paddle/phi/kernels/nanmedian_grad_kernel.h @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/phi/common/int_array.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void PostprocessMedianGradKernel(const Context& dev_ctx, + DenseTensor* input, + const IntArray& raw_axes, + DenseTensor* x) { + auto input_dim = input->dims(); + auto rank = input_dim.size(); + + std::vector axes = raw_axes.GetData(); + int64_t axes_size = static_cast(axes.size()); + for (int64_t i = 0; i < axes_size; i++) { + if (axes[i] < 0) { + axes[i] += rank; + } + } + + std::vector trans_back; + std::vector reshape_back; + trans_back.reserve(rank); + trans_back.resize(rank); + + int offset = 0; + for (int64_t i = 0; i < rank; i++) { + if (std::find(axes.begin(), axes.end(), i) == axes.end()) { + reshape_back.push_back(input_dim[i]); + trans_back[i] = offset; + offset += 1; + } + } + + for (int64_t i = 0; i < rank; i++) { + if (std::find(axes.begin(), axes.end(), i) != axes.end()) { + trans_back[i] = offset; + reshape_back.push_back(input_dim[i]); + offset += 1; + } + } + + input->Resize(make_ddim(reshape_back)); + funcs::TransCompute( + static_cast(trans_back.size()), dev_ctx, *input, x, trans_back); +} + +template +void NanmedianGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& median_index, + const DenseTensor& out_grad, + const IntArray& axes, + bool keep_dim, + DenseTensor* x_grad); +} // namespace phi diff --git a/paddle/phi/kernels/nanmedian_kernel.h b/paddle/phi/kernels/nanmedian_kernel.h new file mode 100644 index 0000000000000..374f420381bdc --- /dev/null +++ b/paddle/phi/kernels/nanmedian_kernel.h @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/phi/common/int_array.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void PreprocessMedianKernel(const Context& dev_ctx, + const DenseTensor& input, + const IntArray& raw_axes, + DenseTensor* x) { + auto input_dim = input.dims(); + auto rank = input_dim.size(); + std::vector perm; + std::vector reshape; + + std::vector axes = raw_axes.GetData(); + int64_t axes_size = static_cast(axes.size()); + for (int64_t i = 0; i < axes_size; i++) { + if (axes[i] < 0) { + axes[i] += rank; + } + } + + for (int64_t i = 0; i < rank; i++) { + if (std::find(axes.begin(), axes.end(), i) == axes.end()) { + perm.push_back(i); + reshape.push_back(input_dim[i]); + } + } + + int64_t post_numel = 1; + for (int64_t i = 0; i < rank; i++) { + if (std::find(axes.begin(), axes.end(), i) != axes.end()) { + perm.push_back(i); + post_numel *= input_dim[i]; + } + } + reshape.push_back(post_numel); + + DDim trans_dim(input_dim); + int ndims = perm.size(); + for (int i = 0; i < ndims; i++) { + trans_dim[i] = input_dim[perm[i]]; + } + x->Resize(trans_dim); + dev_ctx.template Alloc(x); + funcs::TransCompute(ndims, dev_ctx, input, x, perm); + + x->Resize(make_ddim(reshape)); +} + +template +void NanmedianKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& axes, + bool keep_dim, + DenseTensor* out, + DenseTensor* medians); +} // namespace phi diff --git a/paddle/phi/ops/compat/nanmedian_sig.cc b/paddle/phi/ops/compat/nanmedian_sig.cc new file mode 100644 index 0000000000000..5ca0d450e3b41 --- /dev/null +++ b/paddle/phi/ops/compat/nanmedian_sig.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature NanmedianOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "nanmedian", {"X"}, {"axis", "keepdim"}, {"Out", "MedianIndex"}); +} + +KernelSignature NanmedianGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("nanmedian_grad", + {"X", "MedianIndex", "Out@GRAD"}, + {"axis", "keepdim"}, + {"X@GRAD"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(nanmedian, phi::NanmedianOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(nanmedian_grad, phi::NanmedianGradOpArgumentMapping); diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 132105fb2b689..930918e967eed 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -331,6 +331,7 @@ from .tensor.stat import var # noqa: F401 from .tensor.stat import numel # noqa: F401 from .tensor.stat import median # noqa: F401 +from .tensor.stat import nanmedian # noqa: F401 from .tensor.stat import quantile # noqa: F401 from .tensor.stat import nanquantile # noqa: F401 from .device import get_cudnn_version # noqa: F401 @@ -498,6 +499,7 @@ 'load', 'numel', 'median', + 'nanmedian', 'quantile', 'nanquantile', 'no_grad', diff --git a/python/paddle/fluid/tests/unittests/test_nanmedian.py b/python/paddle/fluid/tests/unittests/test_nanmedian.py new file mode 100644 index 0000000000000..2e1f13a8c7d9f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nanmedian.py @@ -0,0 +1,196 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core + +np.random.seed(102) + + +class TestNanmedian(unittest.TestCase): + def setUp(self): + single_axis_shape = (120) + multi_axis_shape = (2, 3, 4, 5) + + self.fake_data = { + "single_axis_normal": + np.random.uniform(-1, 1, single_axis_shape).astype(np.float32), + "multi_axis_normal": + np.random.uniform(-1, 1, multi_axis_shape).astype(np.float32), + "single_axis_all_nan": np.full(single_axis_shape, np.nan), + "multi_axis_all_nan": np.full(multi_axis_shape, np.nan), + } + + single_partial_nan = self.fake_data["single_axis_normal"].copy() + single_partial_nan[single_partial_nan > 0] = np.nan + multi_partial_nan = self.fake_data["multi_axis_normal"].copy() + multi_partial_nan[multi_partial_nan > 0] = np.nan + self.fake_data["single_axis_partial_nan"] = single_partial_nan + self.fake_data["multi_axis_partial_nan"] = multi_partial_nan + + row_data = np.random.uniform(-1, 1, multi_axis_shape).astype(np.float32) + row_data[:, :, :, 0] = np.nan + row_data[:, :, :2, 1] = np.nan + row_data[:, :, 2:, 2] = np.nan + self.fake_data["row_nan_even"] = row_data + self.fake_data["row_nan_float64"] = row_data.astype(np.float64) + self.fake_data["row_nan_int64"] = row_data.astype(np.int64) + self.fake_data["row_nan_int32"] = row_data.astype(np.int32) + + col_data = np.random.uniform(-1, 1, multi_axis_shape).astype(np.float32) + col_data[:, :, 0, :] = np.nan + col_data[:, :, 1, :3] = np.nan + col_data[:, :, 2, 3:] = np.nan + self.fake_data["col_nan_odd"] = col_data + + self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + self.axis_candiate_list = [ + None, 0, 2, -1, -2, (1, 2), [0, -1], [0, 1, 3], (1, 2, 3), + [0, 2, 1, 3] + ] + + def test_api_static(self): + data = self.fake_data["col_nan_odd"] + paddle.enable_static() + np_res = np.nanmedian(data, keepdims=True) + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('X', data.shape) + out1 = paddle.nanmedian(x, keepdim=True) + out2 = paddle.tensor.nanmedian(x, keepdim=True) + out3 = paddle.tensor.stat.nanmedian(x, keepdim=True) + axis = np.arange(len(data.shape)).tolist() + out4 = paddle.nanmedian(x, axis=axis, keepdim=True) + out5 = paddle.nanmedian(x, axis=tuple(axis), keepdim=True) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': data}, + fetch_list=[out1, out2, out3, out4, out5]) + + for out in res: + self.assertTrue(np.allclose(np_res, out, equal_nan=True)) + + def test_api_dygraph(self): + paddle.disable_static(self.place) + + def clean_axis_numpy(axis, shape_len): + if isinstance(axis, tuple): + axis = list(axis) + if isinstance(axis, list): + for k in range(len(axis)): + if axis[k] < 0: + axis[k] += shape_len + axis = set(axis) + return axis + + def test_data_case(data): + for keep_dim in [False, True]: + if np.isnan(data).all() and keep_dim: + np_ver = np.version.version.split('.') + if int(np_ver[0]) < 1 or int(np_ver[1]) <= 20: + print( + "This numpy version does not support all nan elements when keepdim is True" + ) + continue + + np_res = np.nanmedian(data, keepdims=keep_dim) + pd_res = paddle.nanmedian( + paddle.to_tensor(data), keepdim=keep_dim) + self.assertTrue( + np.allclose( + np_res, pd_res.numpy(), equal_nan=True)) + + def test_axis_case(data, axis): + pd_res = paddle.nanmedian( + paddle.to_tensor(data), axis=axis, keepdim=False) + axis = clean_axis_numpy(axis, len(data.shape)) + np_res = np.nanmedian(data, axis=axis, 
keepdims=False) + self.assertTrue(np.allclose(np_res, pd_res.numpy(), equal_nan=True)) + + for name, data in self.fake_data.items(): + test_data_case(data) + + for axis in self.axis_candiate_list: + test_axis_case(self.fake_data["row_nan_even"], axis) + test_axis_case(self.fake_data["col_nan_odd"], axis) + + paddle.enable_static() + + def test_errors(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data("X", [10, 12]) + + def test_dtype(): + x2 = paddle.fluid.data('X2', [10, 12], 'bool') + paddle.nanmedian(x2) + + def test_empty_axis(): + paddle.nanmedian(x, axis=[], keepdim=True) + + def test_axis_not_in_range(): + paddle.nanmedian(x, axis=3, keepdim=True) + + def test_duplicated_axis(): + paddle.nanmedian(x, axis=[1, -1], keepdim=True) + + self.assertRaises(TypeError, test_dtype) + self.assertRaises(ValueError, test_empty_axis) + self.assertRaises(ValueError, test_axis_not_in_range) + self.assertRaises(ValueError, test_duplicated_axis) + + def test_dygraph(self): + paddle.disable_static(place=self.place) + with paddle.fluid.dygraph.guard(): + data = self.fake_data["col_nan_odd"] + out = paddle.nanmedian(paddle.to_tensor(data), keepdim=True) + np_res = np.nanmedian(data, keepdims=True) + self.assertTrue(np.allclose(np_res, out, equal_nan=True)) + paddle.enable_static() + + def test_check_grad(self): + paddle.disable_static(place=self.place) + shape = (4, 5) + x_np = np.random.uniform(-1, 1, shape).astype(np.float64) + x_np[0, :] = np.nan + x_np[1, :3] = np.nan + x_np[2, 3:] = np.nan + x_np_sorted = np.sort(x_np) + nan_counts = np.count_nonzero(np.isnan(x_np).astype(np.int32), axis=1) + np_grad = np.zeros((shape)) + for i in range(shape[0]): + valid_cnts = shape[1] - nan_counts[i] + if valid_cnts == 0: + continue + + mid = int(valid_cnts / 2) + targets = [x_np_sorted[i, mid]] + is_odd = valid_cnts % 2 + if not is_odd and mid > 0: + targets.append(x_np_sorted[i, mid - 1]) + for j in range(shape[1]): + if x_np[i, j] in targets: + np_grad[i, j] = 1 if is_odd else 0.5 + + x_tensor = paddle.to_tensor(x_np, stop_gradient=False) + y = paddle.nanmedian(x_tensor, axis=1, keepdim=True) + dx = paddle.grad(y, x_tensor)[0].numpy() + self.assertTrue(np.allclose(np_grad, dx, equal_nan=True)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 283bce1cc817f..478f4b6351fbf 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -263,6 +263,7 @@ from .stat import var # noqa: F401 from .stat import numel # noqa: F401 from .stat import median # noqa: F401 +from .stat import nanmedian # noqa: F401 from .stat import quantile # noqa: F401 from .stat import nanquantile # noqa: F401 @@ -448,6 +449,7 @@ 'var', 'numel', 'median', + 'nanmedian', 'quantile', 'nanquantile', 'is_complex', diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 52ccc60100996..372454b97a6be 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -241,6 +241,103 @@ def numel(x, name=None): return out +def nanmedian(x, axis=None, keepdim=True, name=None): + r""" + Compute the median along the specified axis, while ignoring NaNs. + + If the valid count of elements is a even number, + the average value of both elements in the middle is calculated as the median. + + Args: + x (Tensor): The input Tensor, it's data type can be int32, int64, float16, float32, float64. 
+ axis (None|int|list|tuple, optional): + The axis along which to perform median calculations ``axis`` should be int or list of int. + ``axis`` should be in range [-D, D), where D is the dimensions of ``x`` . + If ``axis`` is less than 0, it works the same way as :math:`axis + D`. + If ``axis`` is None, median is calculated over all elements of ``x``. Default is None. + keepdim (bool, optional): Whether to reserve the reduced dimension(s) + in the output Tensor. If ``keepdim`` is True, the dimensions of + the output Tensor is the same as ``x`` except in the reduced + dimensions(it is of size 1 in this case). Otherwise, the shape of + the output Tensor is squeezed in ``axis`` . Default is True. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor, results of median along ``axis`` of ``x``. The output dtype is the same as `x`. + + Examples: + .. code-block:: python + :name: nanmedian-example + + import paddle + x = paddle.to_tensor([[float('nan'), 2. , 3. ], [0. , 1. , 2. ]]) + + y1 = x.nanmedian() + # y1 is [[2.]] + + y2 = x.nanmedian(0) + # y2 is [[0., 1.5, 2.5]] + + y3 = x.nanmedian(0, keepdim=False) + # y3 is [0., 1.5, 2.5] + + y4 = x.nanmedian((0, 1)) + # y4 is [[2.]] + """ + if not isinstance(x, Variable): + raise TypeError("In median, the input x should be a Tensor.") + + if isinstance(axis, (list, tuple)) and len(axis) == 0: + raise ValueError("Axis list should not be empty.") + + dims = len(x.shape) + if axis is None: + axis = [] + elif isinstance(axis, tuple): + axis = list(axis) + elif isinstance(axis, int): + axis = [axis] + + if not isinstance(axis, list): + raise ValueError( + "Axis should be None, int, or a list, element should in range [-rank(x), rank(x))." + ) + + for i in range(len(axis)): + if not isinstance(axis[i], int) or not (axis[i] < dims and + axis[i] >= -dims): + raise ValueError( + "Axis should be None, int, or a list, element should in range [-rank(x), rank(x))." + ) + if axis[i] < 0: + axis[i] += dims + + if len(axis) != len(set(axis)): + raise ValueError("Axis has duplicated elements.") + + if _in_legacy_dygraph(): + median_index, out = _C_ops.nanmedian(x, 'axis', axis, 'keepdim', + keepdim) + return out + + check_variable_and_dtype( + x, 'X', ['int32', 'int64', 'float16', 'float32', 'float64'], + 'nanmedian') + + helper = LayerHelper('nanmedian', **locals()) + attrs = {'axis': axis, 'keepdim': keepdim} + out = helper.create_variable_for_type_inference(x.dtype) + medians = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='nanmedian', + inputs={'X': x}, + outputs={'Out': out, + 'MedianIndex': medians}, + attrs=attrs) + return out + + def median(x, axis=None, keepdim=False, name=None): """ Compute the median along the specified axis. 
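
A quick usage sketch of the Python API documented above (illustrative only); it
assumes a Paddle build that already includes this nanmedian operator and
cross-checks the NaN-ignoring and even-count averaging behaviour against
numpy.nanmedian:

    # Illustrative sketch; assumes paddle is built with the nanmedian op added here.
    import numpy as np
    import paddle

    data = np.array([[float('nan'), 2., 3.], [0., 1., 2.]], dtype='float32')
    x = paddle.to_tensor(data)

    # NaNs are ignored: row 0 reduces over [2., 3.] -> 2.5 (even count, so the
    # two middle values are averaged); row 1 reduces over [0., 1., 2.] -> 1.0.
    print(paddle.nanmedian(x, axis=-1, keepdim=False))  # [2.5, 1.0]
    print(np.nanmedian(data, axis=-1))                  # reference: [2.5, 1.0]

    # With axis=None the median is taken over all non-NaN elements; keepdim
    # defaults to True for this op, so the result keeps rank 2.
    print(paddle.nanmedian(x))                          # [[2.0]]
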
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 5088ad3457fb9..7702e8be9c958 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -824,7 +824,7 @@ 'test_mean_op', 'test_is_tensor', 'test_run_program_op', 'test_cuda_random_seed', 'test_linear_interp_op', 'test_fuse_all_reduce_pass', 'tensor_util_test', 'test_median', - 'test_linear', 'test_imperative_qat_amp', + 'test_nanmedian', 'test_linear', 'test_imperative_qat_amp', 'test_truncated_gaussian_random_op', 'test_lstm_cudnn_op', 'copy_same_tensor_test', 'test_squeeze2_op', 'naive_best_fit_allocator_test', 'test_model', 'test_py_reader_combination', From 17b8446d459bc3ddde7eee71d04e5ed4c986fbc5 Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com> Date: Mon, 30 May 2022 17:46:12 +0800 Subject: [PATCH 073/109] [AutoParallel] use original id in grad_op_id_to_op_id (#42992) * use original id in dist_op_context.grad_op_id_to_op_id * del assert * remove redundant map --- .../distributed/auto_parallel/completion.py | 21 ++++++---- .../distributed/auto_parallel/dist_context.py | 10 +++-- .../auto_parallel/parallelizer_v2.py | 2 +- .../distributed/auto_parallel/partitioner.py | 8 ++-- .../distributed/passes/auto_parallel_amp.py | 42 ++++++++++--------- .../distributed/passes/auto_parallel_fp16.py | 40 +++++++++--------- .../passes/auto_parallel_recompute.py | 8 ++-- python/paddle/fluid/backward.py | 12 ++---- 8 files changed, 74 insertions(+), 69 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 31bdc4cc650af..03996ec350da4 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -771,7 +771,7 @@ def _is_grad_var_name(name): def _get_op_by_id(ops, id): for op in ops: - if op.desc.id() == id: + if op.desc.original_id() == id: return op return None @@ -796,10 +796,12 @@ def _get_op_by_id(ops, id): # complete the annotation of grad op (xxx_grad op or sum op) # xxx_grad op will have a corresponding forward op in grad_op_id_to_op_id grad_op = ops[idx] - if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id: + if grad_op.desc.original_id( + ) in dist_op_context.grad_op_id_to_op_id: # TODO support the case where one forward op corresponding to multiple xxx_grad op - forward_op = _get_op_by_id( - ops, dist_op_context.grad_op_id_to_op_id[grad_op.desc.id()]) + forward_op = _get_op_by_id(ops, + dist_op_context.grad_op_id_to_op_id[ + grad_op.desc.original_id()]) assert forward_op is not None fwd_op_dist_attr = self._dist_context.get_op_dist_attr_for_program( @@ -935,7 +937,7 @@ def _get_forward_varname_from_grad_varname(grad_var_name): def _get_op_by_id(ops, id): for op in ops: - if op.desc.id() == id: + if op.desc.original_id() == id: return op return None @@ -997,11 +999,12 @@ def _get_op_by_id(ops, id): # complete the annotation of grad op (xxx_grad op or sum op) # xxx_grad op will have a corresponding forward op in grad_op_id_to_op_id grad_op = ops[idx] - if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id: + if grad_op.desc.original_id( + ) in dist_op_context.grad_op_id_to_op_id: # TODO support the case where one forward op corresponding to multiple xxx_grad op - forward_op = _get_op_by_id( - ops[:first_backward_op_idx], - dist_op_context.grad_op_id_to_op_id[grad_op.desc.id()]) + forward_op = _get_op_by_id(ops[:first_backward_op_idx], + dist_op_context.grad_op_id_to_op_id[ + 
grad_op.desc.original_id()]) assert forward_op is not None if grad_op.type == "concat" and forward_op.type == "split": diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index 7299f84504bf3..a47ef66ee848a 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -204,9 +204,13 @@ def initialize(self): ) self._serial_startup_program = self._original_serial_startup_program.clone( ) - self._serial_main_program = self._original_serial_main_program - self._serial_startup_program = self._original_serial_startup_program - self._serial_loss = self._original_serial_loss + # self._serial_main_program = self._original_serial_main_program + # self._serial_startup_program = self._original_serial_startup_program + if self._original_serial_loss: + self._serial_loss = self._serial_main_program.global_block( + ).vars[self._original_serial_loss[0].name] + else: + self._serial_loss = self._original_serial_loss self._serial_optimizer = self._original_serial_optimizer self._init_dist_attr_for_program() self._tensors_ids = list(self._dist_tensors_for_program.keys()) diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/parallelizer_v2.py index 6a94bbd3130b9..4d73632761026 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/parallelizer_v2.py @@ -51,7 +51,7 @@ def parallel(self, rank): serial_optimizer = self._dist_context.serial_optimizer if self._mode == "train" and serial_optimizer: # Generate backward - serial_loss = self._dist_context.serial_fetch_vars["loss"][0] + serial_loss = self._dist_context.serial_loss params_grads = self._generate_backward( serial_main_program, serial_startup_program, serial_loss) # Apply pre optimization passes diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py index 91a31dd1b922e..ce686fd6a5683 100644 --- a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -211,7 +211,7 @@ def partition_block(self, ref_block, target_block): forward_op_id2forward_op = {} for idx in range(len(serial_ops)): if idx <= last_fwd_op_idx: - forward_op_id2forward_op[serial_ops[idx].desc.id( + forward_op_id2forward_op[serial_ops[idx].desc.original_id( )] = serial_ops[idx] appended_grad_times = 0 @@ -408,9 +408,9 @@ def _partition_var(dist_context, src_block, dst_block, src_varname, def _get_dist_op_backward_implement(backward_op, dist_context, forward_op_id2forward_op): dist_op_context = dist_context.dist_op_context - if backward_op.desc.id() in dist_op_context.grad_op_id_to_op_id: - forward_op_id = dist_op_context.grad_op_id_to_op_id[backward_op.desc.id( - )] + if backward_op.desc.original_id() in dist_op_context.grad_op_id_to_op_id: + forward_op_id = dist_op_context.grad_op_id_to_op_id[ + backward_op.desc.original_id()] forward_op = forward_op_id2forward_op[forward_op_id] forward_op_dist_attr = dist_context.get_op_dist_attr_for_program( forward_op) diff --git a/python/paddle/distributed/passes/auto_parallel_amp.py b/python/paddle/distributed/passes/auto_parallel_amp.py index fe94c25e12d2d..3cd04affa29c2 100644 --- a/python/paddle/distributed/passes/auto_parallel_amp.py +++ b/python/paddle/distributed/passes/auto_parallel_amp.py @@ -46,13 +46,13 @@ def _build_stats(self, amp_lists, 
dist_context): if int(op.attr('op_role')) == int(OpRole.Forward): self._mark_black_white_ops(amp_lists) elif int(op.attr('op_role')) == int(OpRole.Backward): - if op.desc.id() in dist_op_context.grad_op_id_to_op_id: - fwd_op_id = dist_op_context.grad_op_id_to_op_id[op.desc.id( - )] + if op.desc.original_id() in dist_op_context.grad_op_id_to_op_id: + fwd_op_id = dist_op_context.grad_op_id_to_op_id[ + op.desc.original_id()] if self._is_fp16_op(fwd_op_id) == True: - self._op_fp16_dict[op.desc.id()] = True + self._op_fp16_dict[op.desc.original_id()] = True elif self._is_fp16_op(fwd_op_id) == False: - self._op_fp16_dict[op.desc.id()] = False + self._op_fp16_dict[op.desc.original_id()] = False elif int(op.attr('op_role')) == int(OpRole.Optimize): break @@ -70,12 +70,12 @@ def _mark_black_white_ops(self, amp_lists): continue if amp_lists.black_varnames is not None and _is_in_black_varnames( op, amp_lists): - self._op_fp16_dict[op.desc.id()] = False + self._op_fp16_dict[op.desc.original_id()] = False continue if op.type in amp_lists.black_list: - self._op_fp16_dict[op.desc.id()] = False + self._op_fp16_dict[op.desc.original_id()] = False elif op.type in amp_lists.white_list: - self._op_fp16_dict[op.desc.id()] = True + self._op_fp16_dict[op.desc.original_id()] = True elif op.type in amp_lists.gray_list: is_black_op = False is_white_op = False @@ -95,22 +95,22 @@ def _mark_black_white_ops(self, amp_lists): else: prev_op = in_var.op # if it's one of inputs - if self._is_fp16_op(prev_op.desc.id()) == False or \ + if self._is_fp16_op(prev_op.desc.original_id()) == False or \ prev_op.type in amp_lists.black_list: is_black_op = True - elif self._is_fp16_op(prev_op.desc.id()) == True or \ + elif self._is_fp16_op(prev_op.desc.original_id()) == True or \ prev_op.type in amp_lists.white_list: is_white_op = True if is_black_op: - self._op_fp16_dict[op.desc.id()] = False + self._op_fp16_dict[op.desc.original_id()] = False elif is_white_op: - self._op_fp16_dict[op.desc.id()] = True + self._op_fp16_dict[op.desc.original_id()] = True else: pass else: # For numerical safe, we apply fp32 computation on ops that # are not determined which list they should stay. 
- self._op_fp16_dict[op.desc.id()] = False + self._op_fp16_dict[op.desc.original_id()] = False def cast_forward_program(self, dist_context): ops = self._block.ops @@ -120,11 +120,11 @@ def cast_forward_program(self, dist_context): num_cast_ops = 0 if int(op.attr('op_role')) == int(OpRole.Backward): break - if self._is_fp16_op(op.desc.id()) == False: + if self._is_fp16_op(op.desc.original_id()) == False: num_cast_ops = self._insert_cast_op_forward( op, idx, core.VarDesc.VarType.FP16, core.VarDesc.VarType.FP32, dist_context) - elif self._is_fp16_op(op.desc.id()) == True: + elif self._is_fp16_op(op.desc.original_id()) == True: num_cast_ops = self._insert_cast_op_forward( op, idx, core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP16, dist_context) @@ -198,7 +198,7 @@ def _insert_cast_op_forward(self, op, idx, src_dtype, dst_dtype, else: if op.has_attr('in_dtype'): op._set_attr('in_dtype', dst_dtype) - self._var_name_dict[op.desc.id()] = var_name_dict + self._var_name_dict[op.desc.original_id()] = var_name_dict if src_dtype == core.VarDesc.VarType.FP32 and dst_dtype == core.VarDesc.VarType.FP16: for out_name in op.output_names: @@ -225,13 +225,14 @@ def cast_backward_program(self, params_grads, dist_context): while idx < len(ops): num_cast_ops = 0 grad_op = ops[idx] + grad_op_orig_id = grad_op.desc.original_id() dist_op_context = dist_context.dist_op_context - if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id: - if self._is_fp16_op(grad_op.desc.id()) == False: # fp32 + if grad_op_orig_id in dist_op_context.grad_op_id_to_op_id: + if self._is_fp16_op(grad_op_orig_id) == False: # fp32 num_cast_ops = self._insert_cast_op_backward( grad_op, idx, core.VarDesc.VarType.FP16, core.VarDesc.VarType.FP32, dist_context) - elif self._is_fp16_op(grad_op.desc.id()) == True: # fp16 + elif self._is_fp16_op(grad_op_orig_id) == True: # fp16 num_cast_ops = self._insert_cast_op_backward( grad_op, idx, core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP16, dist_context) @@ -272,8 +273,9 @@ def _keep_fp32_output(op, out_name): return False num_cast_ops = 0 + original_id = grad_op.desc.original_id() dist_op_context = dist_context.dist_op_context - fwd_op_id = dist_op_context.grad_op_id_to_op_id[grad_op.desc.id()] + fwd_op_id = dist_op_context.grad_op_id_to_op_id[original_id] for in_name in grad_op.input_names: if src_dtype == core.VarDesc.VarType.FP32 and _keep_fp32_input( diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py index 9dda310e5c022..b01f3975aefdd 100644 --- a/python/paddle/distributed/passes/auto_parallel_fp16.py +++ b/python/paddle/distributed/passes/auto_parallel_fp16.py @@ -153,23 +153,24 @@ def _mark_op(self, op): # ernie inference trick if op.type == "assign" and "array_" in op.input_arg_names[0]: - self._op_fp16_dict[op.desc.id()] = False + self._op_fp16_dict[op.desc.original_id()] = False return if _need_keep_fp32(op, self.amp_list.unsupported_list, self.use_fp16_guard): - self._op_fp16_dict[op.desc.id()] = False + self._op_fp16_dict[op.desc.original_id()] = False else: - self._op_fp16_dict[op.desc.id()] = True + self._op_fp16_dict[op.desc.original_id()] = True for var_name in op.output_arg_names: # assert var_name not in self.forward_non_leaf_tensors, "{}".format(var_name) self.forward_non_leaf_tensors[var_name] = op.desc.id() elif is_backward_op(op) == int(OpRole.Backward): - if op.desc.id() in self.grad_op_to_op_map: - fwd_op_id = self.grad_op_to_op_map[op.desc.id()] + if op.desc.original_id() in 
self.grad_op_to_op_map: + fwd_op_id = self.grad_op_to_op_map[op.desc.original_id()] assert fwd_op_id in self._op_fp16_dict, "{}".format(str(op)) - self._op_fp16_dict[op.desc.id()] = self._op_fp16_dict[fwd_op_id] + self._op_fp16_dict[op.desc.original_id()] = self._op_fp16_dict[ + fwd_op_id] if int(op.attr('op_role')) == 257: self.is_train = True @@ -192,10 +193,10 @@ def set_var_to_fp16(self, var_name, block): def resolute_tensor_dtype(self, block): for op in block.ops: - op_id = op.desc.id() if is_forward_op(op): # NOTE (JZ-LIANG) un-expected cast op when user call "+, -, *, /" in python - if self._is_fp16_op(op_id) == True or op.type == "cast": + if self._is_fp16_op(op.desc.original_id()) == True \ + or op.type == "cast": for in_name in op.input_names: if _keep_fp32_input(op, in_name): continue @@ -209,7 +210,7 @@ def resolute_tensor_dtype(self, block): self.set_var_to_fp16(out_var_name, block) set_op_dtype_to_fp16(op) # NOTE (JZ-LIANG) un-expected cast op when user call "+, -, *, /" in python - elif self._is_fp16_op(op_id) == False: + elif self._is_fp16_op(op.desc.original_id()) == False: for out_var_name in op.output_arg_names: out_var = block.vars.get(out_var_name) if out_var is None or out_var.type not in _valid_types: @@ -217,7 +218,7 @@ def resolute_tensor_dtype(self, block): if out_var.dtype == core.VarDesc.VarType.FP16: out_var.desc.set_dtype(core.VarDesc.VarType.FP32) elif is_backward_op(op): - if self._is_fp16_op(op_id) == True: + if self._is_fp16_op(op.desc.original_id()) == True: for out_name in op.output_names: if _keep_fp32_output(op, out_name): continue @@ -225,7 +226,7 @@ def resolute_tensor_dtype(self, block): self.set_var_to_fp16(out_var_name, block) set_op_dtype_to_fp16(op) # NOTE (JZ-LIANG) un-expected cast op when user call "+, -, *, /" in python - elif self._is_fp16_op(op_id) == False: + elif self._is_fp16_op(op.desc.original_id()) == False: for out_var_name in op.output_arg_names: out_var = block.vars.get(out_var_name) if out_var is None or out_var.type not in _valid_types: @@ -238,28 +239,27 @@ def cast_block(self, block): idx = 0 while idx < len(block.ops): op = block.ops[idx] - op_id = op.desc.id() num_cast_ops = 0 if op.type in __amp_skip_ops__: idx += 1 continue elif is_forward_op(op): - if self._is_fp16_op(op_id) == False: + if self._is_fp16_op(op.desc.original_id()) == False: num_cast_ops = self._insert_forward_cast_ops( op, idx, block, core.VarDesc.VarType.FP16, core.VarDesc.VarType.FP32, self.dist_context) - elif self._is_fp16_op(op_id) == True: + elif self._is_fp16_op(op.desc.original_id()) == True: num_cast_ops = self._insert_forward_cast_ops( op, idx, block, core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP16, self.dist_context) elif is_backward_op(op): - if op_id in dist_op_context.grad_op_id_to_op_id: - if self._is_fp16_op(op_id) == False: + if op.desc.original_id() in dist_op_context.grad_op_id_to_op_id: + if self._is_fp16_op(op.desc.original_id()) == False: num_cast_ops = self._insert_backward_cast_ops( op, idx, block, core.VarDesc.VarType.FP16, core.VarDesc.VarType.FP32, self.dist_context) - elif self._is_fp16_op(op_id) == True: + elif self._is_fp16_op(op.desc.original_id()) == True: num_cast_ops = self._insert_backward_cast_ops( op, idx, block, core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP16, self.dist_context) @@ -282,7 +282,6 @@ def _insert_forward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, dist_context): num_cast_ops = 0 - op_id = op.desc.id() for in_name in op.input_names: if src_dtype == core.VarDesc.VarType.FP32 and 
_keep_fp32_input( @@ -300,7 +299,7 @@ def _insert_forward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, cast_name = in_var.name + '.cast_' + _dtype_to_str( dst_dtype) cast_var = block.vars.get(cast_name) - self.forward_input_cast_ops[op_id] += [( + self.forward_input_cast_ops[op.desc.original_id()] += [( cast_name, in_var.name, dst_dtype, src_dtype, in_name)] in_var_dist_attr = consume_op_attr.get_input_dist_attr( @@ -349,8 +348,9 @@ def _insert_backward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, num_cast_ops = 0 op_id = op.desc.id() + original_id = op.desc.original_id() dist_op_context = dist_context.dist_op_context - forward_op_id = dist_op_context.grad_op_id_to_op_id[op_id] + forward_op_id = dist_op_context.grad_op_id_to_op_id[original_id] grad_op_attr = dist_context.get_op_dist_attr_for_program(op) assert grad_op_attr is not None diff --git a/python/paddle/distributed/passes/auto_parallel_recompute.py b/python/paddle/distributed/passes/auto_parallel_recompute.py index 258f46304d189..c6d1685446277 100644 --- a/python/paddle/distributed/passes/auto_parallel_recompute.py +++ b/python/paddle/distributed/passes/auto_parallel_recompute.py @@ -315,7 +315,7 @@ def _apply_single_impl(self, main_programs, startup_programs, context): # When traversing all grad_ops in reverse, need to set a flag to indicate # whether the ckpt and its segment_descs can be used. ckpt_op = op_path[segment[1] - 1] - ckpt_ops_dict[ckpt_op.desc.id()] = [True, segment_descs] + ckpt_ops_dict[ckpt_op.desc.original_id()] = [True, segment_descs] # step 4: insert recomputed fwd ops ops = main_block.ops @@ -339,9 +339,9 @@ def _apply_single_impl(self, main_programs, startup_programs, context): _rename_arg_([grad_op.desc], key, var_name_dict[key]) # insert recomputed ops - if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id: - fwd_op_id = dist_op_context.grad_op_id_to_op_id[grad_op.desc.id( - )] + original_id = grad_op.desc.original_id() + if original_id in dist_op_context.grad_op_id_to_op_id: + fwd_op_id = dist_op_context.grad_op_id_to_op_id[original_id] if fwd_op_id in ckpt_ops_dict and ckpt_ops_dict[fwd_op_id][0]: idx = grad_op.idx while idx - 1 >= 0 and ops[idx - 1].type == "sum": diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 145ecc83cfc26..ed3e0bc98ed6d 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -1107,8 +1107,10 @@ def update_distop_context(distop_context, op_grad_to_var, distop_context.grad_var_to_var[appending_grad_times].update( op_grad_to_var) for op_desc in grad_op_desc: - assert op_desc.id() not in distop_context.grad_op_id_to_op_id - distop_context.grad_op_id_to_op_id[op_desc.id()] = op.desc.id() + assert op_desc.original_id( + ) not in distop_context.grad_op_id_to_op_id + distop_context.grad_op_id_to_op_id[op_desc.original_id( + )] = op.desc.original_id() if callbacks is not None: assert (isinstance(callbacks, (list, tuple))) @@ -1255,12 +1257,6 @@ def update_distop_context(distop_context, op_grad_to_var, for op_desc in grad_op_descs: new_op_desc = target_block.desc.append_op() new_op_desc.copy_from(op_desc) - # Rebuild the mapping because new_op_desc has a differnt id (Only for auto parallel) - if distop_context is not None: - if op_desc.id() in distop_context.grad_op_id_to_op_id: - distop_context.grad_op_id_to_op_id[new_op_desc.id( - )] = distop_context.grad_op_id_to_op_id[op_desc.id()] - distop_context.grad_op_id_to_op_id.pop(op_desc.id()) new_op_desc._set_attr(op_role_attr_name, backward) 
grad_to_var["__current_op_desc__"] = new_op_desc if callbacks is not None: From fdcdbec5330efbe850d648f9444d60ce7881f4dc Mon Sep 17 00:00:00 2001 From: crystal <62974595+Zjq9409@users.noreply.github.com> Date: Mon, 30 May 2022 17:56:51 +0800 Subject: [PATCH 074/109] Implement fused_gate_attention operator for AlphaFold. (#42018) --- paddle/fluid/operators/fused/CMakeLists.txt | 4 +- paddle/fluid/operators/fused/attn_gemm.h | 164 +++-- paddle/fluid/operators/fused/fmha_ref.h | 4 +- .../operators/fused/fused_gate_attention.h | 647 ++++++++++++++++++ .../fused/fused_gate_attention_op.cc | 317 +++++++++ .../fused/fused_gate_attention_op.cu | 488 +++++++++++++ paddle/fluid/platform/device/gpu/gpu_info.cc | 4 +- paddle/fluid/pybind/op_function_generator.h | 7 + paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 42 +- .../fluid/tests/unittests/CMakeLists.txt | 1 + .../unittests/test_fused_gate_attention_op.py | 252 +++++++ 11 files changed, 1821 insertions(+), 109 deletions(-) create mode 100644 paddle/fluid/operators/fused/fused_gate_attention.h create mode 100644 paddle/fluid/operators/fused/fused_gate_attention_op.cc create mode 100644 paddle/fluid/operators/fused/fused_gate_attention_op.cu create mode 100644 python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 03351dbca09e5..a86d26bcd58a7 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -23,7 +23,8 @@ register_operators(EXCLUDES fused_feedforward_op fused_multi_transformer_op resnet_unit_op - fused_gemm_epilogue_op) + fused_gemm_epilogue_op + fused_gate_attention_op) # fusion_gru_op does not have CUDA kernel op_library(fusion_gru_op) @@ -58,6 +59,7 @@ if (WITH_GPU OR WITH_ROCM) op_library(yolo_box_head_op) op_library(yolo_box_post_op) op_library(fused_embedding_eltwise_layernorm_op) + op_library(fused_gate_attention_op) # fusion_group if(NOT APPLE AND NOT WIN32) op_library(fusion_group_op DEPS device_code) diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h index 9542f0742ea34..304aad16ad0c6 100644 --- a/paddle/fluid/operators/fused/attn_gemm.h +++ b/paddle/fluid/operators/fused/attn_gemm.h @@ -1,8 +1,11 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -13,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/platform/float16.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" @@ -21,6 +25,8 @@ limitations under the License. */ namespace paddle { namespace operators { + +using Tensor = framework::Tensor; // support gemm-nt and gemm-nn, which is used in fused_attention_op. template class AttnMatMul { @@ -45,31 +51,21 @@ class AttnMatMul { framework::Tensor* bias_out) { // Note: for blas.GEMM API in Paddle, it treats all inputs as row-major. 
// here: (transa, transb): nt, input * weight. - CBLAS_TRANSPOSE transA = CblasNoTrans; - CBLAS_TRANSPOSE transB = CblasNoTrans; - if (transA_) { - transA = CblasTrans; - } - if (transB_) { - transB = CblasTrans; - } + CBLAS_TRANSPOSE transA = transA_ ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE transB = transB_ ? CblasTrans : CblasNoTrans; T alpha = static_cast(1.0); T beta = static_cast(0.0); - // here: (m, n, k) = bsz_seq, output_size, input_size, (input, weight, out) + // (m, n, k) = bsz_seq, output_size, input_size, (input, weight, out) auto blas = phi::funcs::GetBlas(dev_ctx_); blas.GEMM(transA, transB, bsz_seq_, output_size_, input_size_, alpha, input->data(), weight->data(), beta, output->data()); if (compute_bias_) { - // compute output + bias - std::vector ins; - std::vector outs; - ins.emplace_back(output); - ins.emplace_back(bias); - outs.emplace_back(bias_out); - int elewise_add_axis = -1; + // bias_out = output + bias + std::vector ins = {output, bias}; + std::vector outs = {bias_out}; phi::funcs::BroadcastKernel( - dev_ctx_, ins, &outs, elewise_add_axis, phi::funcs::AddFunctor()); + dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor()); } } @@ -77,82 +73,71 @@ class AttnMatMul { const framework::Tensor* weight, const framework::Tensor* d_output, framework::Tensor* d_input, framework::Tensor* d_weight, - framework::Tensor* d_bias) { + framework::Tensor* d_bias, bool use_addto = false) { T alpha = static_cast(1.0); - T beta = static_cast(0.0); - auto blas = phi::funcs::GetBlas(dev_ctx_); - - CBLAS_TRANSPOSE dB_transA = CblasNoTrans; - CBLAS_TRANSPOSE dB_transB = CblasNoTrans; - CBLAS_TRANSPOSE dA_transA = CblasNoTrans; - CBLAS_TRANSPOSE dA_transB = CblasNoTrans; - int dB_m = 1; - int dB_n = 1; - int dB_k = 1; - int dA_m = 1; - int dA_n = 1; - int dA_k = 1; - - T* dB_input_1_ptr = nullptr; - T* dB_input_2_ptr = nullptr; - T* dB_output_ptr = d_weight->data(); - - T* dA_input_1_ptr = nullptr; - T* dA_input_2_ptr = nullptr; - T* dA_output_ptr = d_input->data(); + T beta_dA = use_addto ? 
static_cast(1.0) : static_cast(0.0); + T beta_dB = static_cast(0.0); + auto blas = phi::funcs::GetBlas(dev_ctx_); if (!transA_) { - // fw: gemm-nt + // forward: gemm-nt if (transB_) { - // bw: gemm-tn, dB = (dC)^t * A - dB_transA = CblasTrans; - dB_transB = CblasNoTrans; - dB_m = output_size_; - dB_n = input_size_; - dB_k = bsz_seq_; - - // bw: gemm-nn, dA = dC * B - dA_transA = CblasNoTrans; - dA_transB = CblasNoTrans; - dA_m = bsz_seq_; - dA_n = input_size_; - dA_k = output_size_; - - blas.GEMM(dB_transA, dB_transB, dB_m, dB_n, dB_k, alpha, - d_output->data(), input->data(), beta, dB_output_ptr); - blas.GEMM(dA_transA, dA_transB, dA_m, dA_n, dA_k, alpha, - d_output->data(), weight->data(), beta, dA_output_ptr); + // backward: gemm-tn, dB = (dC)^T * A + if (d_weight) { + int dB_m = output_size_; + int dB_n = input_size_; + int dB_k = bsz_seq_; + + T* dB_output_ptr = d_weight->data(); + blas.GEMM(CblasTrans, CblasNoTrans, dB_m, dB_n, dB_k, alpha, + d_output->data(), input->data(), beta_dB, + dB_output_ptr); + } + + // backward: gemm-nn, dA = dC * B + if (d_input) { + int dA_m = bsz_seq_; + int dA_n = input_size_; + int dA_k = output_size_; + + T* dA_output_ptr = d_input->data(); + blas.GEMM(CblasNoTrans, CblasNoTrans, dA_m, dA_n, dA_k, alpha, + d_output->data(), weight->data(), beta_dA, + dA_output_ptr); + } } else { // fw: gemm-nn - // bw: gemm-tn, dB = A^t * dC - dB_transA = CblasTrans; - dB_transB = CblasNoTrans; - dB_m = input_size_; - dB_n = output_size_; - dB_k = bsz_seq_; - - // bw: gemm-nt, dA = dC * B^t - dA_transA = CblasNoTrans; - dA_transB = CblasTrans; - dA_m = bsz_seq_; - dA_n = input_size_; - dA_k = output_size_; - - blas.GEMM(dB_transA, dB_transB, dB_m, dB_n, dB_k, alpha, - input->data(), d_output->data(), beta, dB_output_ptr); - blas.GEMM(dA_transA, dA_transB, dA_m, dA_n, dA_k, alpha, - d_output->data(), weight->data(), beta, dA_output_ptr); + // backward: gemm-tn, dB = A^T * dC + if (d_weight) { + int dB_m = input_size_; + int dB_n = output_size_; + int dB_k = bsz_seq_; + + T* dB_output_ptr = d_weight->data(); + blas.GEMM(CblasTrans, CblasNoTrans, dB_m, dB_n, dB_k, alpha, + input->data(), d_output->data(), beta_dB, + dB_output_ptr); + } + + // backward: gemm-nt, dA = dC * B^T + if (d_input) { + int dA_m = bsz_seq_; + int dA_n = input_size_; + int dA_k = output_size_; + + T* dA_output_ptr = d_input->data(); + blas.GEMM(CblasNoTrans, CblasTrans, dA_m, dA_n, dA_k, alpha, + d_output->data(), weight->data(), beta_dA, + dA_output_ptr); + } } - } else if (transB_) { - PADDLE_THROW(platform::errors::InvalidArgument( - "AttnMatMul wrapper do not support (transA=T, transB=T)" - "parameters.")); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "AttnMatMul wrapper do not support (transA=T, transB=N)" + "AttnMatMul wrapper do not support (transA=T, transB=T/N)" "parameters.")); } - if (compute_bias_) { - // reduce: {0, 1, 2, 3, 4} -> {2, 3, 4} or {0, 1, 2} -> {2} + if (compute_bias_ && d_bias) { + // reduce: {0, 1, 2, 3, 4} -> {2, 3, 4} or {0, 1, 2} -> {2} or {0,1,2,3} + // -> {3} or {0,1,2,3,4} -> {3,4} const auto input_dims = d_output->dims(); const auto output_dims = d_bias->dims(); bool support_case_1 = @@ -163,11 +148,22 @@ class AttnMatMul { bool support_case_2 = (input_dims.size() == 3 && output_dims.size() == 1 && (input_dims[2] == output_dims[0])); + bool support_case_3 = + (input_dims.size() == 4 && output_dims.size() == 1 && + input_dims[3] == output_dims[0]); + bool support_case_4 = + (input_dims.size() == 5 && output_dims.size() == 2 && + input_dims[3] == 
output_dims[0] && input_dims[4] == output_dims[1]); + + gpuStream_t stream = dev_ctx_.stream(); if (support_case_1 || support_case_2) { - gpuStream_t stream = dev_ctx_.stream(); TensorReduceImpl>( dev_ctx_, *d_output, d_bias, kps::IdentityFunctor(), {0, 1}, stream); + } else if (support_case_3 || support_case_4) { + TensorReduceImpl>( + dev_ctx_, *d_output, d_bias, kps::IdentityFunctor(), {0, 1, 2}, + stream); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Only support reduce when the input dims are [0,1,2,3,4] and " diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 0e9fba73933b7..38f9aff226ea9 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -1,8 +1,11 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -297,7 +300,6 @@ class FMHARef { phi::SoftmaxBackwardCUDAKernelDriver( dev_ctx_, softmax_out_tensor, *softmax_out_grad_tensor, softmax_axis, src_mask_out_grad_tensor); - // recall LaunchElementwiseCudaKernel fw: src_mask_out = qk_out + // src_mask // Special case when dy is not needed and dx doesn't reduce diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h new file mode 100644 index 0000000000000..cda33987d68ac --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -0,0 +1,647 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/operators/transpose_op.cu.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +inline std::string MemoryDebugString(const Tensor& t) { + std::stringstream ss; + ss << "shape=[" << t.dims() + << "], size=" << static_cast(t.memory_size()) / (1 << 20) + << " MB, ptr=" << t.data(); + + size_t total = 0; + size_t available = 0; + platform::GpuMemoryUsage(&available, &total); + ss << "; memory allocated=" + << static_cast(total - available) / (1 << 20) << " MB"; + return ss.str(); +} + +template +struct TernaryAddFunctor { + inline HOSTDEVICE T operator()(T a, T b, T c) const { return a + b + c; } +}; + +template +struct GateAttentionConfig { + public: + int64_t batch_size; + int64_t seq_len_m; + int64_t seq_len_r; + int64_t q_dim; + int64_t kv_dim; + int64_t key_dim; + int64_t m_size; + int64_t num_heads; + + phi::DDim qkv_out_dims; + phi::DDim qkv_transpose_out_dims; + + phi::DDim q_out_dims; + phi::DDim kv_out_dims; + phi::DDim q_transpose_out_dims; + phi::DDim kv_transpose_out_dims; + + phi::DDim qk_out_dims; + phi::DDim softmax_out_dims; + phi::DDim qktv_out_dims; + phi::DDim gate_out_dims; + + GateAttentionConfig(const Tensor* query, const Tensor* key, + const Tensor* query_weight, const Tensor* qkv_weight, + bool merge_qkv) { + // query: shape=[batch_size, seq_len_m, seq_len_r, q_dim] + batch_size = query->dims()[0]; + seq_len_m = query->dims()[1]; + seq_len_r = query->dims()[2]; + q_dim = query->dims()[3]; + + if (merge_qkv) { + PADDLE_ENFORCE_NOT_NULL( + qkv_weight, + platform::errors::NotFound("The input qkv_weight can not be nullptr " + "when merge_qkv is true.")); + + // When q_dim == kv_dim, QKV matmul can be computed merged. + // qkv_weight: shape=[3, num_heads, key_dim, q_dim] + num_heads = qkv_weight->dims()[1]; + key_dim = qkv_weight->dims()[2]; + m_size = seq_len_r; + kv_dim = q_dim; + + qkv_out_dims = {batch_size, seq_len_m, seq_len_r, 3, num_heads, key_dim}; + qkv_transpose_out_dims = {3, batch_size, seq_len_m, + num_heads, seq_len_r, key_dim}; + } else { + PADDLE_ENFORCE_NOT_NULL( + key, + platform::errors::NotFound( + "The input key can not be nullptr when merge_qkv is false.")); + PADDLE_ENFORCE_NOT_NULL( + query_weight, + platform::errors::NotFound("The input query_weight can not be " + "nullptr when merge_qkv is false.")); + + // When q_dim != kv_dim, QKV matmul must be computed saparately. 
+ // key: shape=[batch_size, seq_len_m, m_size, kv_dim] + // query_w: shape=[q_dim, num_heads, key_dim] + num_heads = query_weight->dims()[1]; + key_dim = query_weight->dims()[2]; + m_size = key->dims()[2]; + kv_dim = key->dims()[3]; + + q_out_dims = {batch_size, seq_len_m, seq_len_r, num_heads, key_dim}; + kv_out_dims = {batch_size, seq_len_m, m_size, num_heads, key_dim}; + q_transpose_out_dims = {batch_size, seq_len_m, num_heads, seq_len_r, + key_dim}; + kv_transpose_out_dims = {batch_size, seq_len_m, num_heads, m_size, + key_dim}; + } + + qk_out_dims = {batch_size, seq_len_m, num_heads, seq_len_r, m_size}; + softmax_out_dims = {batch_size, seq_len_m, num_heads, seq_len_r, m_size}; + qktv_out_dims = {batch_size, seq_len_m, num_heads, seq_len_r, key_dim}; + gate_out_dims = {batch_size, seq_len_m, seq_len_r, num_heads, key_dim}; + } + + int64_t GetQuerySize() const { + return batch_size * seq_len_m * seq_len_r * num_heads * key_dim; + } + + Tensor* GetQKVOut(const platform::CUDADeviceContext& dev_ctx) { + if (!qkv_out.IsInitialized()) { + qkv_out.Resize(qkv_out_dims); + qkv_out.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "qkv_out: " << MemoryDebugString(qkv_out); + } + return &qkv_out; + } + + Tensor* GetQueryOut(const platform::CUDADeviceContext& dev_ctx) { + if (!query_out.IsInitialized()) { + query_out.Resize(q_out_dims); + query_out.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "query_out: " << MemoryDebugString(query_out); + } + return &query_out; + } + + Tensor* GetKeyOut(const platform::CUDADeviceContext& dev_ctx) { + if (!key_out.IsInitialized()) { + key_out.Resize(kv_out_dims); + key_out.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "key_out: " << MemoryDebugString(key_out); + } + return &key_out; + } + + Tensor* GetValueOut(const platform::CUDADeviceContext& dev_ctx) { + if (!value_out.IsInitialized()) { + value_out.Resize(kv_out_dims); + value_out.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "value_out: " << MemoryDebugString(value_out); + } + return &value_out; + } + + Tensor* GetQKOut(const platform::CUDADeviceContext& dev_ctx, + Tensor* softmax_out) { + // softmax_dim = qk_out_dim[-1] = qk_out_dim[rank - 1] + int softmax_dim = m_size; + if (!softmax_out || phi::UseCudnnSoftmax(dev_ctx, softmax_dim, true)) { + // Not sure whether cudnn softmax can execute inplace. + if (!qkv_out.IsInitialized()) { + qk_out.Resize(qk_out_dims); + qk_out.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "qk_out: " << MemoryDebugString(qk_out); + } + return &qk_out; + } else { + return softmax_out; + } + } + + void ClearQKVOut() { + if (qkv_out.IsInitialized()) { + qkv_out.clear(); + } + } + + void ClearQKOut() { + if (qk_out.IsInitialized()) { + qk_out.clear(); + } + } + + protected: + Tensor qkv_out; + // QKV is not merged + Tensor query_out; + Tensor key_out; + Tensor value_out; + // qk_out = BatchedGEMM(Q, K^T) + // qk_out: shape=[batch_size, seq_len_m, num_heads, seq_len_r, m_size] + // softmax_out = softmax(qk_out + nonbatched_bias + src_mask) + // The shape of qk_out, softmax_out is the same, thus can be called inplace. 
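
Editor's note: to make the dimension bookkeeping above concrete, a small NumPy sketch of the merge_qkv path with illustrative sizes (the bias and mask terms of the softmax input are omitted; variable names are not part of the patch):

    import numpy as np

    B, Sm, Sr, H, c, q_dim = 2, 3, 5, 4, 8, 16     # m_size == seq_len_r here
    query = np.random.rand(B, Sm, Sr, q_dim)       # [batch_size, seq_len_m, seq_len_r, q_dim]
    qkv_w = np.random.rand(3, H, c, q_dim)         # [3, num_heads, key_dim, q_dim]

    # qkv_out: [B, Sm, Sr, 3, H, c] -> transposed to [3, B, Sm, H, Sr, c]
    qkv_out = np.einsum('bmrq,thcq->bmrthc', query, qkv_w)
    q, k, v = np.transpose(qkv_out, (3, 0, 1, 4, 2, 5))

    qk = np.einsum('bmhrc,bmhkc->bmhrk', q, k) / np.sqrt(c)   # qk_out_dims
    softmax = np.exp(qk) / np.exp(qk).sum(-1, keepdims=True)  # softmax_out_dims
    qktv = np.einsum('bmhrk,bmhkc->bmhrc', softmax, v)        # qktv_out_dims
    fmha_out = np.transpose(qktv, (0, 1, 3, 2, 4))            # [B, Sm, Sr, H, c]
    assert qk.shape == (B, Sm, H, Sr, Sr) and fmha_out.shape == (B, Sm, Sr, H, c)
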
+ Tensor qk_out; +}; + +template +struct GateAttentionGradConfig : public GateAttentionConfig { + public: + GateAttentionGradConfig(const Tensor* query, const Tensor* key, + const Tensor* query_weight, const Tensor* qkv_weight, + bool merge_qkv) + : GateAttentionConfig(query, key, query_weight, qkv_weight, + merge_qkv) {} + + Tensor* GetQKVOutGrad(const platform::CUDADeviceContext& dev_ctx) { + if (!qkv_out_grad.IsInitialized()) { + qkv_out_grad.Resize(this->qkv_out_dims); + qkv_out_grad.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "qkv_out_grad: " << MemoryDebugString(qkv_out_grad); + } + return &qkv_out_grad; + } + + Tensor* GetQueryOutGrad(const platform::CUDADeviceContext& dev_ctx) { + if (!query_out_grad.IsInitialized()) { + query_out_grad.Resize(this->q_out_dims); + query_out_grad.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "query_out_grad: " << MemoryDebugString(query_out_grad); + } + return &query_out_grad; + } + + Tensor* GetKeyOutGrad(const platform::CUDADeviceContext& dev_ctx) { + if (!key_out_grad.IsInitialized()) { + key_out_grad.Resize(this->kv_out_dims); + key_out_grad.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "key_out_grad: " << MemoryDebugString(key_out_grad); + } + return &key_out_grad; + } + + Tensor* GetValueOutGrad(const platform::CUDADeviceContext& dev_ctx) { + if (!value_out_grad.IsInitialized()) { + value_out_grad.Resize(this->kv_out_dims); + value_out_grad.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "value_out_grad: " << MemoryDebugString(value_out_grad); + } + return &value_out_grad; + } + + Tensor* GetQKOutGrad(const platform::CUDADeviceContext& dev_ctx, + Tensor* softmax_out_grad) { + // softmax_dim = qk_out_dim[-1] = qk_out_dim[rank - 1] + int softmax_dim = this->m_size; + if (!softmax_out_grad || + phi::UseCudnnSoftmax(dev_ctx, softmax_dim, true)) { + if (!qk_out_grad.IsInitialized()) { + qk_out_grad.Resize(this->qk_out_dims); + qk_out_grad.mutable_data(dev_ctx.GetPlace()); + VLOG(4) << "qk_out_grad: " << MemoryDebugString(qk_out_grad); + } + return &qk_out_grad; + } else { + return softmax_out_grad; + } + } + + protected: + Tensor qkv_out_grad; + Tensor query_out_grad; + Tensor key_out_grad; + Tensor value_out_grad; + Tensor qk_out_grad; +}; + +template +class FMHAGateRef { + public: + FMHAGateRef(const platform::CUDADeviceContext& dev_ctx, bool merge_qkv) + : dev_ctx_(dev_ctx), merge_qkv_(merge_qkv) {} + + void ComputeForward(const Tensor* nonbatched_bias, const Tensor* src_mask, + Tensor* q_transpose_out, Tensor* k_transpose_out, + Tensor* v_transpose_out, Tensor* qkv_transpose_out, + Tensor* softmax_out, Tensor* fmha_out, + GateAttentionConfig* config) { + T* q_ptr = nullptr; + T* k_ptr = nullptr; + T* v_ptr = nullptr; + if (merge_qkv_) { + // qkv_transpose_out = transpose(qkv_out) + PADDLE_ENFORCE_NOT_NULL( + qkv_transpose_out, + platform::errors::NotFound("The input qkv_transpose_out can not be " + "nullptr when merge_qkv is true.")); + + Tensor* qkv_out = config->GetQKVOut(dev_ctx_); + ComputeQKVTransposeForward(*qkv_out, qkv_transpose_out); + config->ClearQKVOut(); + + // q_size == k_size + int64_t q_size = config->GetQuerySize(); + q_ptr = qkv_transpose_out->data(); + k_ptr = q_ptr + q_size; + v_ptr = k_ptr + q_size; + } else { + PADDLE_ENFORCE_NOT_NULL( + q_transpose_out, + platform::errors::NotFound("The input q_transpose_out can not be " + "nullptr when merge_qkv is false.")); + PADDLE_ENFORCE_NOT_NULL( + k_transpose_out, + platform::errors::NotFound("The input k_transpose_out can not be " + "nullptr when merge_qkv is false.")); + 
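
Editor's note: for the merge_qkv == false branch handled here, the separate projections produce the q_out_dims / kv_out_dims set up in GateAttentionConfig. A NumPy sketch with illustrative sizes (names and sizes are not part of the patch):

    import numpy as np

    B, Sm, Sr, M, H, c, q_dim, kv_dim = 2, 3, 5, 7, 4, 8, 16, 12
    query = np.random.rand(B, Sm, Sr, q_dim)     # [batch_size, seq_len_m, seq_len_r, q_dim]
    key = np.random.rand(B, Sm, M, kv_dim)       # [batch_size, seq_len_m, m_size, kv_dim]
    query_w = np.random.rand(q_dim, H, c)        # [q_dim, num_heads, key_dim]
    key_w = np.random.rand(kv_dim, H, c)
    value_w = np.random.rand(kv_dim, H, c)

    q_out = np.einsum('bmrq,qhc->bmrhc', query, query_w)   # q_out_dims
    k_out = np.einsum('bmsv,vhc->bmshc', key, key_w)       # kv_out_dims
    v_out = np.einsum('bmsv,vhc->bmshc', key, value_w)     # value projection reuses the Key input

    # transpose to [batch_size, seq_len_m, num_heads, seq_len_r | m_size, key_dim]
    q_t = np.transpose(q_out, (0, 1, 3, 2, 4))
    k_t = np.transpose(k_out, (0, 1, 3, 2, 4))
    v_t = np.transpose(v_out, (0, 1, 3, 2, 4))
    assert q_t.shape == (B, Sm, H, Sr, c) and k_t.shape == (B, Sm, H, M, c)
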
PADDLE_ENFORCE_NOT_NULL( + v_transpose_out, + platform::errors::NotFound("The input v_transpose_out can not be " + "nullptr when merge_qkv is false.")); + + Tensor* query_out = config->GetQueryOut(dev_ctx_); + Tensor* key_out = config->GetKeyOut(dev_ctx_); + Tensor* value_out = config->GetValueOut(dev_ctx_); + ComputeQKVTransposeForward(*query_out, *key_out, *value_out, + q_transpose_out, k_transpose_out, + v_transpose_out); + + // q_size != k_size + q_ptr = q_transpose_out->data(); + k_ptr = k_transpose_out->data(); + v_ptr = v_transpose_out->data(); + } + + // qk_out = BatchedGEMM(Q, K^T) + // [batch_size, seq_len_m, num_heads, seq_len_r, key_dim] * + // [batch_size, seq_len_m, num_heads, m_size, key_dim] + // -> [batch_size, seq_len_m, num_heads, seq_len_r, m_size] + Tensor* qk_out = config->GetQKOut(dev_ctx_, softmax_out); + T* qk_out_ptr = qk_out->data(); + + int64_t gemm_batch_size = + config->batch_size * config->seq_len_m * config->num_heads; + int64_t gemm_m = config->seq_len_r; + int64_t gemm_n = config->m_size; + int64_t gemm_k = config->key_dim; + + T alpha = static_cast(1.0 / sqrt(config->key_dim)); + ComputeBatchedGEMM(q_ptr, k_ptr, qk_out_ptr, false, true, gemm_m, gemm_n, + gemm_k, gemm_batch_size, alpha); + + // softmax_out = softmax(qk_out + nonbatched_bias + src_mask) + ComputeBiasMaskSoftmaxForward(nonbatched_bias, src_mask, qk_out, + softmax_out); + config->ClearQKOut(); + + // qktv_out = BatchedGEMM(softmax_out, V) + // [batch_size, seq_len_m, num_heads, seq_len_r, m_size] * + // [batch_size, seq_len_m, num_heads, m_size, key_dim] + // -> [batch_size, seq_len_m, num_heads, seq_len_r, key_dim] + Tensor qktv_out; + qktv_out.Resize(config->qktv_out_dims); + T* qktv_out_ptr = qktv_out.mutable_data(dev_ctx_.GetPlace()); + + gemm_m = config->seq_len_r; + gemm_n = config->key_dim; + gemm_k = config->m_size; + + T* softmax_out_ptr = softmax_out->data(); + ComputeBatchedGEMM(softmax_out_ptr, v_ptr, qktv_out_ptr, false, false, + gemm_m, gemm_n, gemm_k, gemm_batch_size); + + // fmha_out = transpose(qktv_out) + ComputeQKTVTransposeForward(qktv_out, fmha_out); + } + + void ComputeBackward(const Tensor* q_transpose_out, + const Tensor* k_transpose_out, + const Tensor* v_transpose_out, + const Tensor* qkv_transpose_out, + const Tensor* softmax_out, const Tensor* fmha_out_grad, + Tensor* src_mask_grad, Tensor* nonbatched_bias_grad, + GateAttentionGradConfig* config) { + const T* q_ptr = nullptr; + const T* k_ptr = nullptr; + const T* v_ptr = nullptr; + + T* q_grad_ptr = nullptr; + T* k_grad_ptr = nullptr; + T* v_grad_ptr = nullptr; + + Tensor q_transpose_out_grad; + Tensor k_transpose_out_grad; + Tensor v_transpose_out_grad; + Tensor qkv_transpose_out_grad; + if (merge_qkv_) { + PADDLE_ENFORCE_NOT_NULL( + qkv_transpose_out, + platform::errors::NotFound("The input qkv_transpose_out can not be " + "nullptr when merge_qkv is true.")); + + int64_t q_size = config->GetQuerySize(); + q_ptr = qkv_transpose_out->data(); + k_ptr = q_ptr + q_size; + v_ptr = k_ptr + q_size; + + qkv_transpose_out_grad.Resize(config->qkv_transpose_out_dims); + + q_grad_ptr = qkv_transpose_out_grad.mutable_data(dev_ctx_.GetPlace()); + k_grad_ptr = q_grad_ptr + q_size; + v_grad_ptr = k_grad_ptr + q_size; + } else { + PADDLE_ENFORCE_NOT_NULL( + q_transpose_out, + platform::errors::NotFound("The input q_transpose_out can not be " + "nullptr when merge_qkv is false.")); + PADDLE_ENFORCE_NOT_NULL( + k_transpose_out, + platform::errors::NotFound("The input k_transpose_out can not be " + "nullptr when merge_qkv is 
false.")); + PADDLE_ENFORCE_NOT_NULL( + v_transpose_out, + platform::errors::NotFound("The input v_transpose_out can not be " + "nullptr when merge_qkv is false.")); + + q_ptr = q_transpose_out->data(); + k_ptr = k_transpose_out->data(); + v_ptr = v_transpose_out->data(); + + q_transpose_out_grad.Resize(config->q_transpose_out_dims); + k_transpose_out_grad.Resize(config->kv_transpose_out_dims); + v_transpose_out_grad.Resize(config->kv_transpose_out_dims); + + q_grad_ptr = q_transpose_out_grad.mutable_data(dev_ctx_.GetPlace()); + k_grad_ptr = k_transpose_out_grad.mutable_data(dev_ctx_.GetPlace()); + v_grad_ptr = v_transpose_out_grad.mutable_data(dev_ctx_.GetPlace()); + } + + Tensor softmax_out_grad; + softmax_out_grad.Resize(config->softmax_out_dims); + softmax_out_grad.mutable_data(dev_ctx_.GetPlace()); + + int64_t gemm_batch_size = + config->batch_size * config->seq_len_m * config->num_heads; + { + // Forward: fmha_out = transpose(qktv_out) + Tensor qktv_out_grad; + qktv_out_grad.Resize(config->qktv_out_dims); + T* qktv_out_grad_ptr = qktv_out_grad.mutable_data(dev_ctx_.GetPlace()); + ComputeQKTVTransposeBackward(*fmha_out_grad, &qktv_out_grad); + + // Forward: qktv_out = BatchedGEMM(softmax_out, V) + // Backward: + // V_grad = BatchedGEMM(softmax_out^T, qktv_out_grad) (dy = x^T * dout) + int64_t gemm_m = config->m_size; + int64_t gemm_n = config->key_dim; + int64_t gemm_k = config->seq_len_r; + + const T* softmax_out_ptr = softmax_out->data(); + ComputeBatchedGEMM(softmax_out_ptr, qktv_out_grad_ptr, v_grad_ptr, true, + false, gemm_m, gemm_n, gemm_k, gemm_batch_size); + + // Backward: softmax_out_grad = qktv_out_grad * V^T (dx = dout * y^T) + gemm_m = config->seq_len_r; + gemm_n = config->m_size; + gemm_k = config->key_dim; + + T* softmax_out_grad_ptr = softmax_out_grad.data(); + ComputeBatchedGEMM(qktv_out_grad_ptr, v_ptr, softmax_out_grad_ptr, false, + true, gemm_m, gemm_n, gemm_k, gemm_batch_size); + } + + Tensor* qk_out_grad = config->GetQKOutGrad(dev_ctx_, &softmax_out_grad); + ComputeBiasMaskSoftmaxBackward(&softmax_out_grad, softmax_out, + src_mask_grad, qk_out_grad, + nonbatched_bias_grad); + + // Forward: qk_out = BatchedGEMM(Q, K^T) + // Backward: k_grad = BatchedGEMM(qk_out_grad^T, Q) (dy = dout^t * x) + int64_t gemm_m = config->m_size; + int64_t gemm_n = config->key_dim; + int64_t gemm_k = config->seq_len_r; + T alpha = static_cast(1.0 / sqrt(config->key_dim)); + + T* qk_out_grad_ptr = qk_out_grad->data(); + ComputeBatchedGEMM(qk_out_grad_ptr, q_ptr, k_grad_ptr, true, false, gemm_m, + gemm_n, gemm_k, gemm_batch_size, alpha); + + // Backward: q_grad = BatchedGEMM(qk_out_grad, K) (dx = dout * y) + gemm_m = config->seq_len_r; + gemm_n = config->key_dim; + gemm_k = config->m_size; + ComputeBatchedGEMM(qk_out_grad_ptr, k_ptr, q_grad_ptr, false, false, gemm_m, + gemm_n, gemm_k, gemm_batch_size, alpha); + + if (merge_qkv_) { + Tensor* qkv_out_grad = config->GetQKVOutGrad(dev_ctx_); + ComputeQKVTransposeBackward(qkv_transpose_out_grad, qkv_out_grad); + } else { + Tensor* q_out_grad = config->GetQueryOutGrad(dev_ctx_); + Tensor* k_out_grad = config->GetKeyOutGrad(dev_ctx_); + Tensor* v_out_grad = config->GetValueOutGrad(dev_ctx_); + ComputeQKVTransposeBackward(q_transpose_out_grad, k_transpose_out_grad, + v_transpose_out_grad, q_out_grad, k_out_grad, + v_out_grad); + } + } + + void ComputeQKVTransposeForward(const Tensor& q_out, const Tensor& k_out, + const Tensor& v_out, Tensor* q_transpose_out, + Tensor* k_transpose_out, + Tensor* v_transpose_out) { + int ndims = 5; + std::vector 
perm = {0, 1, 3, 2, 4}; + TransposeGPUKernelDriver(dev_ctx_, ndims, q_out, perm, q_transpose_out); + TransposeGPUKernelDriver(dev_ctx_, ndims, k_out, perm, k_transpose_out); + TransposeGPUKernelDriver(dev_ctx_, ndims, v_out, perm, v_transpose_out); + } + + void ComputeQKVTransposeBackward(const Tensor& q_transpose_out_grad, + const Tensor& k_transpose_out_grad, + const Tensor& v_transpose_out_grad, + Tensor* q_out_grad, Tensor* k_out_grad, + Tensor* v_out_grad) { + int ndims = 5; + std::vector perm = {0, 1, 3, 2, 4}; + TransposeGPUKernelDriver(dev_ctx_, ndims, q_transpose_out_grad, perm, + q_out_grad); + TransposeGPUKernelDriver(dev_ctx_, ndims, k_transpose_out_grad, perm, + k_out_grad); + TransposeGPUKernelDriver(dev_ctx_, ndims, v_transpose_out_grad, perm, + v_out_grad); + } + + // [batch_size, seq_len_m, seq_len_r, 3, num_heads, key_dim] -> + // [3, batch_size, seq_len_m, num_heads, seq_len_r, key_dim] + void ComputeQKVTransposeForward(const Tensor& qkv_out, + Tensor* qkv_transpose_out) { + int ndims = 6; + std::vector perm = {3, 0, 1, 4, 2, 5}; + TransposeGPUKernelDriver(dev_ctx_, ndims, qkv_out, perm, + qkv_transpose_out); + } + + void ComputeQKVTransposeBackward(const Tensor& qkv_transpose_out_grad, + Tensor* qkv_out_grad) { + int ndims = 6; + std::vector perm = {1, 2, 4, 0, 3, 5}; + TransposeGPUKernelDriver(dev_ctx_, ndims, qkv_transpose_out_grad, perm, + qkv_out_grad); + } + + // [batch_size, seq_len_m, num_head, seq_len_r, c] -> + // [batch_size, seq_len_m, seq_len_r, num_head, c] + void ComputeQKTVTransposeForward(const Tensor& qktv_out, Tensor* fmha_out) { + int ndims = 5; + std::vector perm = {0, 1, 3, 2, 4}; + TransposeGPUKernelDriver(dev_ctx_, ndims, qktv_out, perm, fmha_out); + } + + void ComputeQKTVTransposeBackward(const Tensor& fmha_out_grad, + Tensor* qktv_out_grad) { + int ndims = 5; + std::vector perm = {0, 1, 3, 2, 4}; + TransposeGPUKernelDriver(dev_ctx_, ndims, fmha_out_grad, perm, + qktv_out_grad); + } + + // qk_out = qk_out + nonbatched_bias + src_mask + // softmax_out = softmax(src_mask_out) + void ComputeBiasMaskSoftmaxForward(const Tensor* nonbatched_bias, + const Tensor* src_mask, Tensor* qk_out, + Tensor* softmax_out) { + if (nonbatched_bias) { + std::vector ins = {qk_out, nonbatched_bias, src_mask}; + std::vector outs = {qk_out}; + phi::funcs::BroadcastKernel( + dev_ctx_, ins, &outs, -1, TernaryAddFunctor()); + } else { + std::vector ins = {qk_out, src_mask}; + std::vector outs = {qk_out}; + phi::funcs::BroadcastKernel( + dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor()); + } + phi::SoftmaxForwardCUDAKernelDriver(dev_ctx_, *qk_out, -1, softmax_out); + } + + // src_mask_out = qk_out + nonbatched_bias + src_mask + // softmax_out = softmax(src_mask_out) + void ComputeBiasMaskSoftmaxBackward(const Tensor* softmax_out_grad, + const Tensor* softmax_out, + Tensor* src_mask_grad, + Tensor* qk_out_grad, + Tensor* nonbatched_bias_grad) { + PADDLE_ENFORCE_NOT_NULL( + qk_out_grad, + platform::errors::NotFound("The qk_out_grad can not be nullptr.")); + + PADDLE_ENFORCE_EQ(qk_out_grad->dims(), softmax_out->dims(), + platform::errors::InvalidArgument( + "The shape of qk_out_grad and softmax_out is " + "expected to be the same. 
But recieved qk_out_grad's " + "shape = %s, softmax_out's shape = %s.", + qk_out_grad->dims(), softmax_out->dims())); + + PADDLE_ENFORCE_EQ(src_mask_grad, nullptr, + platform::errors::InvalidArgument( + "src_mask_grad is expected to be nullptr.")); + + phi::SoftmaxBackwardCUDAKernelDriver(dev_ctx_, *softmax_out, + *softmax_out_grad, -1, qk_out_grad); + + // [1, bs, num_head, seq_l, seq_l] -> [bs, num_head, seq_l, seq_l] + if (nonbatched_bias_grad) { + gpuStream_t stream = dev_ctx_.stream(); + TensorReduceImpl>( + dev_ctx_, *qk_out_grad, nonbatched_bias_grad, + kps::IdentityFunctor(), {0, 1}, stream); + } + } + + private: + void ComputeBatchedGEMM(const T* a_ptr, const T* b_ptr, T* c_ptr, + bool trans_a, bool trans_b, int64_t m, int64_t n, + int64_t k, int64_t batch_size, + T alpha = static_cast(1.0), + T beta = static_cast(0.0)) { + CBLAS_TRANSPOSE cblas_trans_a = trans_a ? CblasTrans : CblasNoTrans; + CBLAS_TRANSPOSE cblas_trans_b = trans_b ? CblasTrans : CblasNoTrans; + int64_t stride_a = m * k; + int64_t stride_b = k * n; + + auto blas = phi::funcs::GetBlas(dev_ctx_); + blas.BatchedGEMM(cblas_trans_a, cblas_trans_b, m, n, k, alpha, a_ptr, b_ptr, + beta, c_ptr, batch_size, stride_a, stride_b); + } + + const platform::CUDADeviceContext& dev_ctx_; + bool merge_qkv_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cc b/paddle/fluid/operators/fused/fused_gate_attention_op.cc new file mode 100644 index 0000000000000..ba9dbd82e3dcc --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cc @@ -0,0 +1,317 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +class FusedGateAttentionOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Query"), "Input", "Query", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasInput("OutLinearWeight"), "Input", "OutLinearWeight", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasInput("OutLinearBias"), "Input", "OutLinearBias", + "fused_gate_attention"); + + OP_INOUT_CHECK(ctx->HasOutput("SoftmaxOut"), "Output", "SoftmaxOut", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasOutput("FMHAOut"), "Output", "FMHAOut", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + "fused_gate_attention"); + + auto input_q_dims = ctx->GetInputDim("Query"); + int batch_size = input_q_dims[0]; + int seq_len_m = input_q_dims[1]; + int seq_len_r = input_q_dims[2]; + + int num_head, m_size, key_dim; + if (ctx->Attrs().Get("merge_qkv")) { + // QKV's input: [batch_size, seq_len_m, seq_len_r, qkv_dim] + // QKV's weight: [3, num_head, key_dim, qkv_dim] + OP_INOUT_CHECK(ctx->HasInput("QKVWeight"), "Input", "QKVWeight", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasOutput("QKVTransposeOut"), "Output", + "QKVTransposeOut", "fused_gate_attention"); + + auto qkv_w_dims = ctx->GetInputDim("QKVWeight"); + + num_head = qkv_w_dims[1]; + key_dim = qkv_w_dims[2]; + m_size = seq_len_r; + + ctx->SetOutputDim("QKVTransposeOut", {3, batch_size, seq_len_m, num_head, + seq_len_r, key_dim}); + } else { + OP_INOUT_CHECK(ctx->HasInput("QueryWeight"), "Input", "QueryWeight", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasInput("KeyWeight"), "Input", "KeyWeight", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasInput("ValueWeight"), "Input", "ValueWeight", + "fused_gate_attention"); + + auto input_k_dims = ctx->GetInputDim("Key"); + auto q_w_dims = ctx->GetInputDim("QueryWeight"); + + num_head = q_w_dims[1]; + key_dim = q_w_dims[2]; + m_size = input_k_dims[2]; + + ctx->SetOutputDim("QueryTransposeOut", + {batch_size, seq_len_m, num_head, seq_len_r, key_dim}); + ctx->SetOutputDim("KeyTransposeOut", + {batch_size, seq_len_m, num_head, m_size, key_dim}); + ctx->SetOutputDim("ValueTransposeOut", + {batch_size, seq_len_m, num_head, m_size, key_dim}); + } + + ctx->SetOutputDim("SoftmaxOut", + {batch_size, seq_len_m, num_head, seq_len_r, m_size}); + ctx->SetOutputDim("FMHAOut", + {batch_size, seq_len_m, seq_len_r, num_head, key_dim}); + + if (ctx->Attrs().Get("has_gating")) { + OP_INOUT_CHECK(ctx->HasInput("GateWeight"), "Input", "GateWeight", + "fused_gate_attention"); + OP_INOUT_CHECK(ctx->HasInput("GateBias"), "Input", "GateBias", + "fused_gate_attention"); + ctx->SetOutputDim("GateOut", + {batch_size, seq_len_m, seq_len_r, num_head, key_dim}); + } + + ctx->SetOutputDim("Out", ctx->GetInputDim("Query")); + } +}; + +class FusedGateAttentionOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Query", "The query tensor."); + AddInput("Key", "The key tensor.").AsDispensable(); + AddInput("QueryWeight", "(optional) The query weight tensor.") + .AsDispensable(); + AddInput("KeyWeight", "(optional) The key weight tensor.").AsDispensable(); + AddInput("ValueWeight", "(optional) The value weight tensor.") + .AsDispensable(); 
+ AddInput("QKVWeight", "(optional) The qkv weight tensor.").AsDispensable(); + AddInput("NonbatchedBias", "(optional) The nonbatchedBias tensor.") + .AsDispensable(); + AddInput("SrcMask", "The attention mask tensor in fmha."); + AddInput("GateWeight", "(optional) The gate weight tensor.") + .AsDispensable(); + AddInput("GateBias", "(optional) The gate bias tensor.").AsDispensable(); + AddInput("OutLinearWeight", "The out_linear weight tensor."); + AddInput("OutLinearBias", "The out_linear bias tensor."); + AddOutput("QueryTransposeOut", "The transposed result of query matmul.") + .AsIntermediate() + .AsDispensable(); + AddOutput("KeyTransposeOut", "The transposed result of key matmul.") + .AsIntermediate() + .AsDispensable(); + AddOutput("ValueTransposeOut", "The transposed result of value matmul.") + .AsIntermediate() + .AsDispensable(); + AddOutput("QKVTransposeOut", "The transposed result of merged QKV matmul.") + .AsIntermediate() + .AsDispensable(); + AddOutput("SoftmaxOut", "Result in fmha.").AsIntermediate(); + AddOutput("FMHAOut", "Result in fmha.").AsIntermediate(); + AddOutput("GateOut", "Result of the gating module.") + .AsIntermediate() + .AsDispensable(); + AddOutput("Out", "Result after attention."); + AddAttr("has_gating", + "if true, the attention op uses gate architecure, " + "[default true].") + .SetDefault(true); + AddAttr("merge_qkv", + "if true, calculation with merged qkv, " + "[default true].") + .SetDefault(true); + AddComment(R"DOC( + Add fused attention op whose logic is as follows: + { + q = paddle.einsum('nbqa,ahc->nbqhc', q_data, self.query_w) + k = paddle.einsum('nbka,ahc->nbkhc', m_data, self.key_w) + v = paddle.einsum('nbka,ahc->nbkhc', m_data, self.value_w) + + logits = paddle.einsum('nbqhc,nbkhc->nbhqk', q * c , k) + bias + weights = nn.functional.softmax(logits) + weighted_avg = paddle.einsum('nbhqk,nbkhc->nbqhc', weights, v) + if nonbatched_bias is not None: + logits += paddle.unsqueeze(nonbatched_bias, axis=1) + + if self.gating: + gate_values = paddle.einsum('nbqc,chv->nbqhv', q_data, + self.gating_w) + self.gating_b + gate_values_1 = nn.functional.sigmoid(gate_values) + weighted_avg *= gate_values_1 + + output = paddle.einsum('nbqhc,hco->nbqo', weighted_avg, + self.output_w) + self.output_b + + } + )DOC"); + } +}; + +class FusedGateAttentionGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Query"), "Input", "Query", + "fused_gate_attention_grad"); + if (ctx->HasOutput(framework::GradVarName("Query"))) { + ctx->SetOutputDim(framework::GradVarName("Query"), + ctx->GetInputDim("Query")); + } + if (ctx->HasOutput(framework::GradVarName("Key"))) { + ctx->SetOutputDim(framework::GradVarName("Key"), ctx->GetInputDim("Key")); + } + + if (ctx->Attrs().Get("merge_qkv")) { + OP_INOUT_CHECK(ctx->HasInput("QKVWeight"), "Input", "QKVWeight", + "fused_gate_attention_arad"); + ctx->SetOutputDim(framework::GradVarName("QKVWeight"), + ctx->GetInputDim("QKVWeight")); + } else { + OP_INOUT_CHECK(ctx->HasInput("QueryWeight"), "Input", "QueryWeight", + "fused_aate_attention_arad"); + OP_INOUT_CHECK(ctx->HasInput("KeyWeight"), "Input", "KeyWeight", + "fused_aate_attention_arad"); + OP_INOUT_CHECK(ctx->HasInput("ValueWeight"), "Input", "ValueWeight", + "fused_aate_attention_arad"); + + for (auto& name : {"QueryWeight", "KeyWeight", "ValueWeight"}) { + ctx->SetOutputDim(framework::GradVarName(name), 
ctx->GetInputDim(name)); + } + } + + OP_INOUT_CHECK(ctx->HasInput("OutLinearWeight"), "Input", "OutLinearWeight", + "fused_aate_attention_arad"); + + if (ctx->Attrs().Get("has_gating")) { + for (auto& name : {"GateWeight", "GateBias", "GateOut"}) { + ctx->SetOutputDim(framework::GradVarName(name), ctx->GetInputDim(name)); + } + } + + if (ctx->HasOutput(framework::GradVarName("NonbatchedBias"))) { + ctx->SetOutputDim(framework::GradVarName("NonbatchedBias"), + ctx->GetInputDim("NonbatchedBias")); + } + + ctx->SetOutputDim(framework::GradVarName("FMHAOut"), + ctx->GetInputDim("FMHAOut")); + + ctx->SetOutputDim(framework::GradVarName("OutLinearWeight"), + ctx->GetInputDim("OutLinearWeight")); + ctx->SetOutputDim(framework::GradVarName("OutLinearBias"), + ctx->GetInputDim("OutLinearBias")); + } +}; + +template +class FusedGateAttentionGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("fused_gate_attention_grad"); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + + op->SetInput("Query", this->Input("Query")); + op->SetOutput(framework::GradVarName("Query"), this->InputGrad("Query")); + + op->SetAttrMap(this->Attrs()); + bool merge_qkv = BOOST_GET_CONST(bool, op->GetAttr("merge_qkv")); + if (merge_qkv) { + op->SetInput("QKVWeight", this->Input("QKVWeight")); + op->SetOutput(framework::GradVarName("QKVWeight"), + this->InputGrad("QKVWeight")); + op->SetInput("QKVTransposeOut", this->Output("QKVTransposeOut")); + } else { + op->SetInput("Key", this->Input("Key")); + op->SetOutput(framework::GradVarName("Key"), this->InputGrad("Key")); + + for (auto& name : {"QueryWeight", "KeyWeight", "ValueWeight"}) { + op->SetInput(name, this->Input(name)); + op->SetOutput(framework::GradVarName(name), this->InputGrad(name)); + } + + for (auto& name : + {"QueryTransposeOut", "KeyTransposeOut", "ValueTransposeOut"}) { + op->SetInput(name, this->Output(name)); + } + } + + op->SetInput("FMHAOut", this->Output("FMHAOut")); + op->SetOutput(framework::GradVarName("FMHAOut"), + this->OutputGrad("FMHAOut")); + + if (this->HasInput("NonbatchedBias")) { + op->SetInput("NonbatchedBias", this->Input("NonbatchedBias")); + op->SetOutput(framework::GradVarName("NonbatchedBias"), + this->InputGrad("NonbatchedBias")); + } + + op->SetInput("SoftmaxOut", this->Output("SoftmaxOut")); + + bool has_gating = BOOST_GET_CONST(bool, op->GetAttr("has_gating")); + if (has_gating) { + op->SetInput("GateWeight", this->Input("GateWeight")); + op->SetOutput(framework::GradVarName("GateWeight"), + this->InputGrad("GateWeight")); + + op->SetInput("GateBias", this->Input("GateBias")); + op->SetOutput(framework::GradVarName("GateBias"), + this->InputGrad("GateBias")); + + op->SetInput("GateOut", this->Output("GateOut")); + op->SetOutput(framework::GradVarName("GateOut"), + this->OutputGrad("GateOut")); + } + + op->SetInput("OutLinearWeight", this->Input("OutLinearWeight")); + op->SetOutput(framework::GradVarName("OutLinearWeight"), + this->InputGrad("OutLinearWeight")); + + op->SetInput("OutLinearBias", this->Input("OutLinearBias")); + op->SetOutput(framework::GradVarName("OutLinearBias"), + this->InputGrad("OutLinearBias")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + fused_gate_attention, ops::FusedGateAttentionOp, + ops::FusedGateAttentionOpMaker, + ops::FusedGateAttentionGradOpMaker, + 
ops::FusedGateAttentionGradOpMaker); +REGISTER_OPERATOR(fused_gate_attention_grad, ops::FusedGateAttentionGradOp); diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu new file mode 100644 index 0000000000000..b1badf72557ae --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -0,0 +1,488 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/fused/attn_gemm.h" +#include "paddle/fluid/operators/fused/fused_gate_attention.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct SigmoidMultiplyFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // sigmoid(x) = 1 / (1 + exp(-x)) + // out = sigmoid(x) * y + inline HOSTDEVICE T operator()(T x, T y) const { + MPType x_mp = static_cast(x); + T sigmoid_out = static_cast(one / (one + exp(-x_mp))); + return sigmoid_out * y; + } +}; + +template +struct SigmoidMultiplyGradFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // Gradient of Multiply: + // dx = dout * y + // dy = dout * x + // Gradient of Sigmoid: dx = dout * out * (1 - out) + inline HOSTDEVICE phi::Array operator()(const T dout, const T x, + T y) const { + MPType x_mp = static_cast(x); + T sigmoid_out = static_cast(one / (one + exp(-x_mp))); + T d_sigmoid_out = dout * y; + phi::Array outs; + outs[0] = d_sigmoid_out * sigmoid_out * + (static_cast(1.0f) - sigmoid_out); // dx + outs[1] = dout * sigmoid_out; // dy + return outs; + } +}; + +template +void ComputeMergedQKVMatmulForward(const framework::ExecutionContext &ctx, + const GateAttentionConfig &config, + const Tensor *query, Tensor *qkv_out) { + // query: shape=[batch_size, seq_len_m, seq_len_r, qkv_dim] + // qkv_weight: shape=[3, num_heads, key_dim, qkv_dim] + // qkv_out: shape=[batch_size, seq_len_m, seq_len_r, 3, num_heads, key_dim] + auto *qkv_weight = ctx.Input("QKVWeight"); + + // qkv_out = GEMM(query, qkv_weight^T) + int m = config.batch_size * config.seq_len_m * config.seq_len_r; + int n = 3 * config.num_heads * config.key_dim; + int k = config.q_dim; + auto qkv_compute = + AttnMatMul(ctx.cuda_device_context(), false, true, m, n, k, false); + qkv_compute.ComputeForward(qkv_weight, query, nullptr, qkv_out, nullptr); +} + +template +Tensor *ComputeMergedQKVMatmulBackward(const framework::ExecutionContext &ctx, + const GateAttentionGradConfig &config, + const Tensor *query, + const Tensor *qkv_out_grad, + Tensor *query_grad, bool use_addto) { + auto *qkv_weight = ctx.Input("QKVWeight"); + auto *qkv_weight_grad = + ctx.Output(framework::GradVarName("QKVWeight")); + 
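
Editor's note: the SigmoidMultiplyFunctor / SigmoidMultiplyGradFunctor defined above compute the gating and its gradient elementwise. In NumPy terms (illustrative shapes, any matching arrays work):

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    x = np.random.randn(2, 3)        # gate values before the activation
    y = np.random.randn(2, 3)        # fmha_out
    dout = np.random.randn(2, 3)     # gradient flowing into the gated output

    # forward: out = sigmoid(x) * y
    s = sigmoid(x)
    out = s * y

    # backward, matching SigmoidMultiplyGradFunctor:
    #   dx = (dout * y) * s * (1 - s)   (chain rule through the sigmoid)
    #   dy = dout * s
    dx = dout * y * s * (1.0 - s)
    dy = dout * s
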
qkv_weight_grad->mutable_data(ctx.GetPlace()); + + // Gradient of GEMM(query, qkv_weight) + int m = config.batch_size * config.seq_len_m * config.seq_len_r; + int n = 3 * config.num_heads * config.key_dim; + int k = config.q_dim; + auto qkv_compute = + AttnMatMul(ctx.cuda_device_context(), false, true, m, n, k, false); + qkv_compute.ComputeBackward(query, qkv_weight, qkv_out_grad, query_grad, + qkv_weight_grad, nullptr, use_addto); + return query_grad; +} + +template +void ComputeSeparatedQKVMatmulForward(const framework::ExecutionContext &ctx, + const GateAttentionConfig &config, + const Tensor *query, const Tensor *key, + Tensor *query_out, Tensor *key_out, + Tensor *value_out) { + auto *query_weight = ctx.Input("QueryWeight"); + auto *key_weight = ctx.Input("KeyWeight"); + auto *value_weight = ctx.Input("ValueWeight"); + + // query_out = GEMM(query, query_weight) + // query: shape=[batch_size, seq_len_m, seq_len_r, q_dim] + // query_weight: shape=[q_dim, num_heads, key_dim] + // query_out: shape=[batch_size, seq_len_m, seq_len_r, num_heads, key_dim] + int q_m = config.batch_size * config.seq_len_m * config.seq_len_r; + int q_n = config.num_heads * config.key_dim; + int q_k = config.q_dim; + auto q_compute = AttnMatMul(ctx.cuda_device_context(), false, false, q_m, + q_n, q_k, false); + q_compute.ComputeForward(query_weight, query, nullptr, query_out, nullptr); + + // k_out = GEMM(key, key_weight) + // key: shape=[batch_size, seq_len_m, m_size, kv_dim] + // key_weight: shape=[kv_dim, num_heads, key_dim] + // key_out: shape=[batch_size, seq_len_m, m_size, num_heads, key_dim] + int kv_m = config.batch_size * config.seq_len_m * config.m_size; + int kv_n = config.num_heads * config.key_dim; + int kv_k = config.kv_dim; + auto kv_compute = AttnMatMul(ctx.cuda_device_context(), false, false, kv_m, + kv_n, kv_k, false); + kv_compute.ComputeForward(key_weight, key, nullptr, key_out, nullptr); + + // value_out = GEMM(value, value_weight) + kv_compute.ComputeForward(value_weight, key, nullptr, value_out, nullptr); +} + +template +Tensor *ComputeSeparatedQKVMatmulBackward( + const framework::ExecutionContext &ctx, + const GateAttentionGradConfig &config, const Tensor *query, + const Tensor *key, const Tensor *query_out_grad, const Tensor *key_out_grad, + const Tensor *value_out_grad, Tensor *query_grad, Tensor *key_grad, + bool use_addto) { + // Gradient of GEMM(key, k_weight) + const auto *key_weight = ctx.Input("KeyWeight"); + auto *key_weight_grad = + ctx.Output(framework::GradVarName("KeyWeight")); + key_weight_grad->mutable_data(ctx.GetPlace()); + + int kv_m = config.batch_size * config.seq_len_m * config.m_size; + int kv_n = config.num_heads * config.key_dim; + int kv_k = config.kv_dim; + auto kv_compute = AttnMatMul(ctx.cuda_device_context(), false, false, kv_m, + kv_n, kv_k, false); + kv_compute.ComputeBackward(key, key_weight, key_out_grad, key_grad, + key_weight_grad, nullptr, false); + + // Gradient of GEMM(value, v_weight) + auto *value_weight = ctx.Input("ValueWeight"); + auto *value_weight_grad = + ctx.Output(framework::GradVarName("ValueWeight")); + value_weight_grad->mutable_data(ctx.GetPlace()); + + kv_compute.ComputeBackward(key, value_weight, value_out_grad, key_grad, + value_weight_grad, nullptr, true); + + // Gradient of GEMM(query, query_weight) + const auto *query_weight = ctx.Input("QueryWeight"); + auto *query_weight_grad = + ctx.Output(framework::GradVarName("QueryWeight")); + query_weight_grad->mutable_data(ctx.GetPlace()); + + int q_m = config.batch_size * config.seq_len_m 
* config.seq_len_r; + int q_n = config.num_heads * config.key_dim; + int q_k = config.q_dim; + auto q_compute = AttnMatMul(ctx.cuda_device_context(), false, false, q_m, + q_n, q_k, false); + q_compute.ComputeBackward(query, query_weight, query_out_grad, query_grad, + query_weight_grad, nullptr, use_addto); + return query_grad; +} + +template +Tensor *ComputeGatingLinearForward(const framework::ExecutionContext &ctx, + const GateAttentionConfig &config, + const Tensor *query, + const Tensor *fmha_out) { + auto *gate_weight = ctx.Input("GateWeight"); + auto *gate_bias = ctx.Input("GateBias"); + + auto *gate_out = ctx.Output("GateOut"); + gate_out->mutable_data(ctx.GetPlace()); + VLOG(4) << "[ComputeGatingLinearForward] gate_out: " + << MemoryDebugString(*gate_out); + + // The first gate_bias_out stores the result of the multiplication, + // and the second gate_bias_out stores the result of the multiplication + + // bias. + // gate_out = GEMM(query, gate_weight) + gate_bias + int m = config.batch_size * config.seq_len_m * config.seq_len_r; + int n = config.num_heads * config.key_dim; + int k = config.q_dim; + auto gate_attn_compute = + AttnMatMul(ctx.cuda_device_context(), false, false, m, n, k, true); + gate_attn_compute.ComputeForward(gate_weight, query, gate_bias, gate_out, + gate_out); + + // gate_out = sigmoid(gate_out) * fmha_out + std::vector ins = {gate_out, fmha_out}; + std::vector outs = {gate_out}; + phi::funcs::ElementwiseKernel(ctx.cuda_device_context(), ins, &outs, + SigmoidMultiplyFunctor()); + return gate_out; +} + +template +Tensor *ComputeGatingLinearBackward(const framework::ExecutionContext &ctx, + const GateAttentionGradConfig &config, + const Tensor *fmha_out, + const Tensor *gate_out_grad, + Tensor *query_grad, Tensor *fmha_out_grad) { + const auto *query = ctx.Input("Query"); + const auto *gate_weight = ctx.Input("GateWeight"); + const auto *gate_bias = ctx.Input("GateBias"); + + // Re-compute gate_bias_out + Tensor gate_bias_out; + gate_bias_out.Resize(config.gate_out_dims); + gate_bias_out.mutable_data(ctx.GetPlace()); + + int m = config.batch_size * config.seq_len_m * config.seq_len_r; + int n = config.num_heads * config.key_dim; + int k = config.q_dim; + auto gate_attn_compute = + AttnMatMul(ctx.cuda_device_context(), false, false, m, n, k, true); + gate_attn_compute.ComputeForward(gate_weight, query, gate_bias, + &gate_bias_out, &gate_bias_out); + + // Gradient of sigmoid(gate_bias_out) * fmha_out + // Compute inplace and save gate_bias_out_grad to gate_bias_out. 
+ std::vector ins = {gate_out_grad, &gate_bias_out, fmha_out}; + std::vector outs = {&gate_bias_out, fmha_out_grad}; + phi::funcs::ElementwiseKernel, 2>( + ctx.cuda_device_context(), ins, &outs, SigmoidMultiplyGradFunctor()); + + // Gradient of GEMM(query, gate_weight) + gate_bias + auto *gate_weight_grad = + ctx.Output(framework::GradVarName("GateWeight")); + auto *gate_bias_grad = ctx.Output(framework::GradVarName("GateBias")); + gate_weight_grad->mutable_data(ctx.GetPlace()); + gate_bias_grad->mutable_data(ctx.GetPlace()); + + gate_attn_compute.ComputeBackward(query, gate_weight, &gate_bias_out, + query_grad, gate_weight_grad, + gate_bias_grad); + return fmha_out_grad; +} + +template +Tensor *ComputeOutputLinearForward(const framework::ExecutionContext &ctx, + const GateAttentionConfig &config, + const Tensor *fmha_or_gate_out) { + const auto *out_linear_weight = ctx.Input("OutLinearWeight"); + const auto *out_linear_bias = ctx.Input("OutLinearBias"); + + auto *out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + VLOG(4) << "[ComputeOutputLinearForward] out: " << MemoryDebugString(*out); + + // out = GEMM(fmha_or_gate_out, out_linear_weight) + out_linear_bias + int m = config.batch_size * config.seq_len_m * config.seq_len_r; + int n = config.q_dim; + int k = config.num_heads * config.key_dim; + auto out_linear_compute = + AttnMatMul(ctx.cuda_device_context(), false, false, m, n, k, true); + out_linear_compute.ComputeForward(out_linear_weight, fmha_or_gate_out, + out_linear_bias, out, out); + return out; +} + +template +Tensor *ComputeOutputLinearBackward(const framework::ExecutionContext &ctx, + const GateAttentionGradConfig &config, + bool has_gating) { + std::string input_name = has_gating ? "GateOut" : "FMHAOut"; + + const auto *out_grad = ctx.Input(framework::GradVarName("Out")); + const auto *out_linear_weight = ctx.Input("OutLinearWeight"); + const auto *input = ctx.Input(input_name); + + auto *out_linear_weight_grad = + ctx.Output(framework::GradVarName("OutLinearWeight")); + auto *out_linear_bias_grad = + ctx.Output(framework::GradVarName("OutLinearBias")); + auto *input_grad = ctx.Output(framework::GradVarName(input_name)); + + out_linear_weight_grad->mutable_data(ctx.GetPlace()); + out_linear_bias_grad->mutable_data(ctx.GetPlace()); + input_grad->mutable_data(ctx.GetPlace()); + + int m = config.batch_size * config.seq_len_m * config.seq_len_r; + int n = config.q_dim; + int k = config.num_heads * config.key_dim; + auto out_linear_compute = + AttnMatMul(ctx.cuda_device_context(), false, false, m, n, k, true); + out_linear_compute.ComputeBackward(input, out_linear_weight, out_grad, + input_grad, out_linear_weight_grad, + out_linear_bias_grad); + return input_grad; +} + +template +class FusedGateAttentionOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *query = ctx.Input("Query"); + const auto *key = ctx.Input("Key"); + const auto *query_weight = ctx.Input("QueryWeight"); + const auto *qkv_weight = ctx.Input("QKVWeight"); + + const auto *src_mask = ctx.Input("SrcMask"); + const auto *nonbatched_bias = ctx.Input("NonbatchedBias"); + + auto *q_transpose_out = ctx.Output("QueryTransposeOut"); + auto *k_transpose_out = ctx.Output("KeyTransposeOut"); + auto *v_transpose_out = ctx.Output("ValueTransposeOut"); + auto *qkv_transpose_out = ctx.Output("QKVTransposeOut"); + + auto *softmax_out = ctx.Output("SoftmaxOut"); + auto *fmha_out = ctx.Output("FMHAOut"); + + const bool merge_qkv = 
ctx.Attr("merge_qkv"); + const bool has_gating = ctx.Attr("has_gating"); + + // When seq_len_r = m_size, q_dim = kv_dim, QKV matmul can be merged. + auto &dev_ctx = ctx.template device_context(); + GateAttentionConfig config(query, key, query_weight, qkv_weight, + merge_qkv); + + if (merge_qkv) { + // 1. Merged QKV Matmul: einsum(nbhqk,nbkhc -> nbqhc) + Tensor *qkv_out = config.GetQKVOut(dev_ctx); + ComputeMergedQKVMatmulForward(ctx, config, query, qkv_out); + + qkv_transpose_out->mutable_data(ctx.GetPlace()); + VLOG(4) << "qkv_transpose_out:" << MemoryDebugString(*qkv_transpose_out); + } else { + // 1. Separated QKV Matmul + Tensor *query_out = config.GetQueryOut(dev_ctx); + Tensor *key_out = config.GetKeyOut(dev_ctx); + Tensor *value_out = config.GetValueOut(dev_ctx); + ComputeSeparatedQKVMatmulForward(ctx, config, query, key, query_out, + key_out, value_out); + + q_transpose_out->mutable_data(ctx.GetPlace()); + k_transpose_out->mutable_data(ctx.GetPlace()); + v_transpose_out->mutable_data(ctx.GetPlace()); + VLOG(4) << "q_transpose_out: " << MemoryDebugString(*q_transpose_out); + VLOG(4) << "k_transpose_out: " << MemoryDebugString(*k_transpose_out); + VLOG(4) << "v_transpose_out: " << MemoryDebugString(*v_transpose_out); + } + + softmax_out->mutable_data(ctx.GetPlace()); + fmha_out->mutable_data(ctx.GetPlace()); + VLOG(4) << "softmax_out: " << MemoryDebugString(*softmax_out); + VLOG(4) << "fmha_out: " << MemoryDebugString(*fmha_out); + + // 2. FMHA + auto fmha_compute = FMHAGateRef(dev_ctx, merge_qkv); + fmha_compute.ComputeForward( + nonbatched_bias, src_mask, q_transpose_out, k_transpose_out, + v_transpose_out, qkv_transpose_out, softmax_out, fmha_out, &config); + + // 3. Gating Linear + Tensor *fmha_or_gate_out = + !has_gating ? fmha_out : ComputeGatingLinearForward(ctx, config, + query, fmha_out); + + // 4. Output Linear + ComputeOutputLinearForward(ctx, config, fmha_or_gate_out); + } +}; + +template +class FusedGateAttentionGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto has_gating = ctx.Attr("has_gating"); + const auto merge_qkv = ctx.Attr("merge_qkv"); + + // forward input + const auto *query = ctx.Input("Query"); + const auto *key = ctx.Input("Key"); + const auto *query_weight = ctx.Input("QueryWeight"); + const auto *qkv_weight = ctx.Input("QKVWeight"); + + // forward output, backward input + const auto *q_transpose_out = ctx.Input("QueryTransposeOut"); + const auto *k_transpose_out = ctx.Input("KeyTransposeOut"); + const auto *v_transpose_out = ctx.Input("ValueTransposeOut"); + const auto *qkv_transpose_out = ctx.Input("QKVTransposeOut"); + const auto *softmax_out = ctx.Input("SoftmaxOut"); + const auto *fmha_out = ctx.Input("FMHAOut"); + + // backward output + auto *query_grad = ctx.Output(framework::GradVarName("Query")); + query_grad->mutable_data(ctx.GetPlace()); + auto *nonbatched_bias_grad = + ctx.Output(framework::GradVarName("NonbatchedBias")); + auto *fmha_out_grad = ctx.Output(framework::GradVarName("FMHAOut")); + + auto &dev_ctx = ctx.template device_context(); + GateAttentionGradConfig config(query, key, query_weight, qkv_weight, + merge_qkv); + + // 1. Gradient of Output Linear + Tensor *fhma_or_gate_out_grad = + ComputeOutputLinearBackward(ctx, config, has_gating); + + // 2. Gradient of Gating Linear + if (has_gating) { + // fhma_or_gate_out_grad is actually gate_out_grad. 
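+      // The gating backward below consumes d(GateOut) and produces
+      // fmha_out_grad together with the GateWeight/GateBias gradients and a
+      // first contribution to query_grad (accumulated later via use_addto).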
+ fmha_out_grad->mutable_data(ctx.GetPlace()); + ComputeGatingLinearBackward(ctx, config, fmha_out, + fhma_or_gate_out_grad, query_grad, + fmha_out_grad); + } + + // 3. Gradient of FMHA + if (nonbatched_bias_grad) { + nonbatched_bias_grad->mutable_data(ctx.GetPlace()); + } + + auto fmha_compute = FMHAGateRef(dev_ctx, merge_qkv); + fmha_compute.ComputeBackward( + q_transpose_out, k_transpose_out, v_transpose_out, qkv_transpose_out, + softmax_out, fmha_out_grad, nullptr, nonbatched_bias_grad, &config); + + bool use_addto = has_gating ? true : false; + if (merge_qkv) { + // 4. Gradient of Merged QKV Matmul + Tensor *qkv_out_grad = config.GetQKVOutGrad(dev_ctx); + ComputeMergedQKVMatmulBackward(ctx, config, query, qkv_out_grad, + query_grad, use_addto); + } else { + // 4. Gradient of Separated QKV Matmul + auto *key_grad = ctx.Output(framework::GradVarName("Key")); + if (key_grad) { + key_grad->mutable_data(ctx.GetPlace()); + } + Tensor *query_out_grad = config.GetQueryOutGrad(dev_ctx); + Tensor *key_out_grad = config.GetKeyOutGrad(dev_ctx); + Tensor *value_out_grad = config.GetValueOutGrad(dev_ctx); + ComputeSeparatedQKVMatmulBackward( + ctx, config, query, key, query_out_grad, key_out_grad, value_out_grad, + query_grad, key_grad, use_addto); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +#ifdef PADDLE_WITH_HIP +REGISTER_OP_CUDA_KERNEL(fused_gate_attention, + ops::FusedGateAttentionOpKernel, + ops::FusedGateAttentionOpKernel, + ops::FusedGateAttentionOpKernel); +REGISTER_OP_CUDA_KERNEL(fused_gate_attention_grad, + ops::FusedGateAttentionGradKernel, + ops::FusedGateAttentionGradKernel, + ops::FusedGateAttentionGradKernel); +#else +REGISTER_OP_CUDA_KERNEL(fused_gate_attention, + ops::FusedGateAttentionOpKernel, + ops::FusedGateAttentionOpKernel, + ops::FusedGateAttentionOpKernel, + ops::FusedGateAttentionOpKernel); +REGISTER_OP_CUDA_KERNEL(fused_gate_attention_grad, + ops::FusedGateAttentionGradKernel, + ops::FusedGateAttentionGradKernel, + ops::FusedGateAttentionGradKernel, + ops::FusedGateAttentionGradKernel); +#endif diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 5410638ceb39a..8c04e935134c7 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -225,9 +225,9 @@ class RecordedGpuMallocHelper { if (UNLIKELY(malloc_managed_memory)) { result = cudaMallocManaged(ptr, size); } else { - VLOG(10) << "[cudaMalloc] size=" << static_cast(size) / (1 << 20) - << " MB"; result = cudaMalloc(ptr, size); + VLOG(10) << "[cudaMalloc] size=" << static_cast(size) / (1 << 20) + << " MB, result=" << result; } #endif if (result == gpuSuccess) { diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 2b849968c76f9..df32f65a794f3 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -32,6 +32,10 @@ std::map> op_ins_map = { {"fused_attention", {"X", "LnScale", "LnBias", "QKVW", "QKVBias", "CacheKV", "SrcMask", "OutLinearW", "OutLinearBias", "Ln2Scale", "Ln2Bias"}}, + {"fused_gate_attention", + {"Query", "Key", "QueryWeight", "KeyWeight", "ValueWeight", "QKVWeight", + "NonbatchedBias", "SrcMask", "GateWeight", "GateBias", "OutLinearWeight", + "OutLinearBias"}}, {"fused_multi_transformer", {"X", "LnScale", "LnBias", "QKVW", "QKVBias", "CacheKV", "TimeStep", "SrcMask", "OutLinearW", "OutLinearBias", 
"FFNLnScale", "FFNLnBias", @@ -148,6 +152,9 @@ std::map> op_outs_map = { "DropoutMaskOut", "Ln2Mean", "Ln2Variance", "BiasDropoutResidualOut", "CacheKVOut", "Y"}}, + {"fused_gate_attention", + {"QueryTransposeOut", "KeyTransposeOut", "ValueTransposeOut", + "QKVTransposeOut", "SoftmaxOut", "FMHAOut", "GateOut", "Out"}}, {"sync_batch_norm", {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", "ReserveSpace"}}, diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 77159bfc876da..58781e8c6e491 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -888,19 +888,6 @@ void SoftmaxBackwardCudnnKernel(const GPUContext& dev_ctx, #endif } -template -static bool CanUseCudnnSoftmax(const GPUContext& dev_ctx) { - if (dev_ctx.cudnn_handle() != nullptr) { - if (std::is_same::value) { -#if CUDNN_VERSION < 8100 - return false; -#endif - } - return true; - } - return false; -} - #if CUDNN_VERSION < 8100 template <> inline void SoftmaxForwardCudnnKernel( @@ -927,6 +914,25 @@ inline void SoftmaxBackwardCudnnKernel( } #endif +template +bool UseCudnnSoftmax(const GPUContext& ctx, int softmax_dim, bool last_dim) { + bool cudnn_available = ctx.cudnn_handle(); + if (!ctx.cudnn_handle()) { + if (std::is_same::value) { +#if CUDNN_VERSION < 8100 + cudnn_available = false; +#endif + } + } + constexpr int max_dim = 512; + if (!cudnn_available || !last_dim || + (softmax_dim <= max_dim && sizeof(T) <= 4)) { + return false; + } else { + return true; + } +} + template void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx, const DenseTensor& x, @@ -941,10 +947,7 @@ void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx, int dim = tensor_dims[1]; int D = tensor_dims[2]; - constexpr int max_dim = 512; - - if (D == 1 && - (!CanUseCudnnSoftmax(dev_ctx) || (dim <= max_dim && sizeof(T) <= 4))) { + if (D == 1 && !UseCudnnSoftmax(dev_ctx, dim, true)) { int dim_log2 = static_cast(Log2Ceil(dim)); int dim_ceil = 1 << dim_log2; int warp_size = (dim_ceil < 32) ? dim_ceil : 32; @@ -1016,10 +1019,7 @@ void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx, int dim = tensor_dims[1]; int D = tensor_dims[2]; - constexpr int max_dim = 512; - - if (D == 1 && - (!CanUseCudnnSoftmax(dev_ctx) || (dim <= max_dim && sizeof(T) <= 4))) { + if (D == 1 && !UseCudnnSoftmax(dev_ctx, dim, true)) { int dim_log2 = Log2Ceil(dim); int dim_ceil = 1 << dim_log2; int warp_size = (dim_ceil < 32) ? dim_ceil : 32; diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index a78c820e1e66a..e0cd0c4bf4d41 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -327,6 +327,7 @@ if ((NOT WITH_NCCL) AND (NOT WITH_RCCL)) endif() if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) + LIST(REMOVE_ITEM TEST_OPS test_fused_gate_attention_op) LIST(REMOVE_ITEM TEST_OPS test_boxps) endif() list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 diff --git a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py new file mode 100644 index 0000000000000..6f9ba5f5e4e57 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py @@ -0,0 +1,252 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle +import paddle.nn as nn +from paddle import tensor +import unittest +from op_test import OpTest, convert_float_to_uint16 +from test_sparse_attention_op import get_cuda_version +from paddle import _C_ops +from paddle.fluid.framework import default_main_program +from paddle.fluid import core + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "Paddle is not compiled with CUDA") +class TestFusedGateAttentionOp(OpTest): + def setUp(self): + self.__class__.op_type = "fused_gate_attention" + # use autograd to check grad in this unittest. + self.__class__.no_need_check_grad = True + self.config() + self.merge_qkv = self.q_dim == self.kv_dim + self.generate_input_data() + + def config(self): + self.dtype = "float32" + self.has_gating = True + self.batch_size = 1 + self.msa_len = 3 + self.res_len = 5 + self.q_dim = 6 + self.num_heads = 2 + self.key_dim = 4 + self.m_size = self.res_len + self.kv_dim = self.q_dim + self.out_dim = self.q_dim + self.bias_attr = True + + def generate_input_data(self): + def _random(shape): + if self.dtype == "bfloat16": + data = np.random.random(shape).astype("float32") + return convert_float_to_uint16(data) + else: + return np.random.random(shape).astype(self.dtype) + + np.random.seed(123) + self.query = _random( + (self.batch_size, self.msa_len, self.res_len, self.q_dim)) + self.q_weight = _random((self.q_dim, self.num_heads, self.key_dim)) + self.k_weight = _random((self.kv_dim, self.num_heads, self.key_dim)) + self.v_weight = _random((self.kv_dim, self.num_heads, self.key_dim)) + if self.merge_qkv: + self.key = None + # (3, self.num_heads, self.key_dim, self.q_dim) + q_weight_t = np.transpose(self.q_weight, axes=[1, 2, 0]) + k_weight_t = np.transpose(self.k_weight, axes=[1, 2, 0]) + v_weight_t = np.transpose(self.v_weight, axes=[1, 2, 0]) + self.qkv_weight = np.stack([q_weight_t, k_weight_t, v_weight_t]) + else: + self.key = _random( + (self.batch_size, self.msa_len, self.m_size, self.kv_dim)) + self.qkv_weight = None + + self.attn_mask = _random( + (self.batch_size, self.msa_len, 1, 1, self.m_size)) + + if self.bias_attr: + self.nonbatched_bias = _random( + (self.batch_size, 1, self.num_heads, self.res_len, self.m_size)) + + if self.has_gating: + self.gating_w = _random((self.q_dim, self.num_heads, self.key_dim)) + self.gating_b = _random((self.num_heads, self.key_dim)) + + self.output_w = _random((self.num_heads, self.key_dim, self.out_dim)) + self.output_b = _random((self.out_dim)) + + self.dout = _random( + (self.batch_size, self.msa_len, self.res_len, self.q_dim)) + + def get_reference_out(self): + paddle.disable_static(place=paddle.CUDAPlace(0)) + + query = paddle.to_tensor(self.query, stop_gradient=False) + key = query if self.merge_qkv else paddle.to_tensor( + self.key, stop_gradient=False) + q_weight = paddle.to_tensor(self.q_weight, stop_gradient=False) + k_weight = paddle.to_tensor(self.k_weight, stop_gradient=False) + v_weight = paddle.to_tensor(self.v_weight, 
stop_gradient=False) + src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True) + + c = self.key_dim**(-0.5) + # [batch_size, msa_len, num_heads, res_len, key_dim] + q = paddle.einsum('nbqa,ahc->nbqhc', query, q_weight) * c + # [batch_size, msa_len, num_heads, m_size, key_dim] + k = paddle.einsum('nbka,ahc->nbkhc', key, k_weight) + # [batch_size, msa_len, num_heads, m_size, key_dim] + v = paddle.einsum('nbka,ahc->nbkhc', key, v_weight) + + # [batch_size, msa_len, num_heads, res_len, m_size] + logits = paddle.einsum('nbqhc,nbkhc->nbhqk', q, k) # qk_out + logits = logits + src_mask + if self.bias_attr: + nonbatched_bias = paddle.to_tensor( + self.nonbatched_bias, stop_gradient=False) + logits = logits + nonbatched_bias + + weights = nn.functional.softmax(logits) # softmax_out + weighted_avg = paddle.einsum('nbhqk,nbkhc->nbqhc', weights, v) + + if self.has_gating: + gating_w = paddle.to_tensor(self.gating_w, stop_gradient=False) + gating_b = paddle.to_tensor(self.gating_b, stop_gradient=False) + gate_values = paddle.einsum('nbqc,chv->nbqhv', query, + gating_w) + gating_b + gate_values = nn.functional.sigmoid(gate_values) + weighted_avg = weighted_avg * gate_values + + output_b = paddle.to_tensor(self.output_b, stop_gradient=False) + output_w = paddle.to_tensor(self.output_w, stop_gradient=False) + + out = paddle.einsum('nbqhc,hco->nbqo', weighted_avg, + output_w) + output_b + paddle.autograd.backward( + [out], [paddle.to_tensor(self.dout)], retain_graph=True) + if self.merge_qkv: + return out, query.grad, None + else: + return out, query.grad, key.grad + + def get_fused_gate_attention_out(self): + paddle.disable_static(place=paddle.CUDAPlace(0)) + + query = paddle.to_tensor(self.query, stop_gradient=False) + if self.merge_qkv: + key = None + q_weight = None + k_weight = None + v_weight = None + qkv_weight = paddle.to_tensor(self.qkv_weight, stop_gradient=False) + else: + key = paddle.to_tensor(self.key, stop_gradient=False) + q_weight = paddle.to_tensor(self.q_weight, stop_gradient=False) + k_weight = paddle.to_tensor(self.k_weight, stop_gradient=False) + v_weight = paddle.to_tensor(self.v_weight, stop_gradient=False) + qkv_weight = None + + src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True) + + if self.bias_attr: + nonbatched_bias = paddle.to_tensor( + self.nonbatched_bias, stop_gradient=False) + else: + nonbatched_bias = None + if self.has_gating: + gating_w = paddle.to_tensor(self.gating_w, stop_gradient=False) + gating_b = paddle.to_tensor(self.gating_b, stop_gradient=False) + else: + gating_w = None + gating_b = None + + output_w = paddle.to_tensor(self.output_w, stop_gradient=False) + output_b = paddle.to_tensor(self.output_b, stop_gradient=False) + + _, _, _, _, _, _, _, out = _C_ops.fused_gate_attention( + query, key, q_weight, k_weight, v_weight, qkv_weight, + nonbatched_bias, src_mask, gating_w, gating_b, output_w, output_b, + 'has_gating', self.has_gating, 'merge_qkv', self.merge_qkv) + + paddle.autograd.backward( + [out], [paddle.to_tensor(self.dout)], retain_graph=True) + if key is not None: + return out, query.grad, key.grad + else: + return out, query.grad, None + + def check_output_and_grad(self, atol, rtol): + out_ref, query_grad_ref, key_grad_ref = self.get_reference_out() + out, query_grad, key_grad = self.get_fused_gate_attention_out() + np.testing.assert_allclose(out_ref, out.numpy(), atol=atol, rtol=rtol) + np.testing.assert_allclose( + query_grad_ref, query_grad.numpy(), atol=atol, rtol=rtol) + if key_grad_ref is not None and key_grad is not None: + 
np.testing.assert_allclose( + key_grad_ref, key_grad.numpy(), atol=atol, rtol=rtol) + + def test_output_and_grad(self): + self.check_output_and_grad(atol=1e-5, rtol=1e-5) + + +class TestSeparatedQKVCase(TestFusedGateAttentionOp): + def config(self): + self.dtype = "float32" + self.has_gating = False + self.batch_size = 1 + self.msa_len = 3 + self.res_len = 5 + self.q_dim = 6 + self.num_heads = 2 + self.key_dim = 4 + self.m_size = 4 + self.kv_dim = 2 + self.out_dim = self.q_dim + self.bias_attr = False + + +class TestMergeQKVNoBiasGatingCase(TestFusedGateAttentionOp): + def config(self): + super().config() + self.has_gating = False + self.bias_attr = False + + +class TestMergeQKVFp16Case(TestFusedGateAttentionOp): + def config(self): + super().config() + self.dtype = "float16" + + def test_output_and_grad(self): + self.check_output_and_grad(atol=1e-1, rtol=1e-5) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() or get_cuda_version() < 11000, + "core is not compiled with CUDA and cuda version need larger than or equal to 11.3" +) +class TestMergeQKVBF16Case(TestFusedGateAttentionOp): + def config(self): + super().config() + self.dtype = "bfloat16" + + def test_output_and_grad(self): + self.check_output_and_grad(atol=1e-1, rtol=1e-3) + + +if __name__ == "__main__": + unittest.main() From 3d56d41918f2d58e0dcb190b450318228b04afcb Mon Sep 17 00:00:00 2001 From: pangyoki Date: Mon, 30 May 2022 19:08:09 +0800 Subject: [PATCH 075/109] add backward inplace api (#42965) --- python/paddle/utils/code_gen/backward.yaml | 47 ++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 360425a30ccad..b27c3aab6bb37 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -32,6 +32,7 @@ param : [x] kernel : func : acos_grad + inplace : (out_grad -> x_grad) - backward_api : acosh_grad forward : acosh (Tensor x) -> Tensor(out) @@ -42,6 +43,7 @@ param : [x] kernel : func : acosh_grad + inplace : (out_grad -> x_grad) - backward_api : add_double_grad forward : add_grad (Tensor x, Tensor y, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) @@ -115,6 +117,7 @@ param : [x] kernel : func : asin_grad + inplace : (out_grad -> x_grad) - backward_api : asinh_grad forward : asinh (Tensor x) -> Tensor(out) @@ -125,6 +128,7 @@ param : [x] kernel : func : asinh_grad + inplace : (out_grad -> x_grad) - backward_api : assign_grad forward : assign (Tensor x) -> Tensor(out) @@ -134,6 +138,7 @@ func : UnchangedInferMeta kernel : func : assign + inplace : (out_grad -> x_grad) - backward_api : assign_out__grad forward : assign_out_ (Tensor x, Tensor output) -> Tensor(out) @@ -143,6 +148,7 @@ func : UnchangedInferMeta kernel : func : assign + inplace : (out_grad -> x_grad) - backward_api : atan2_grad forward : atan2 (Tensor x, Tensor y) -> Tensor(out) @@ -163,6 +169,7 @@ param : [x] kernel : func : atan_grad + inplace : (out_grad -> x_grad) - backward_api : atanh_grad forward : atanh (Tensor x) -> Tensor(out) @@ -173,6 +180,7 @@ param : [x] kernel : func : atanh_grad + inplace : (out_grad -> x_grad) - backward_api : batch_norm_double_grad forward : batch_norm_grad (Tensor x, Tensor scale, Tensor bias, Tensor out_mean, Tensor out_variance, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor grad_out, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> 
Tensor(grad_x), Tensor(grad_scale), Tensor(grad_bias) @@ -208,6 +216,7 @@ param : [input] kernel : func : bce_loss_grad + inplace : (out_grad -> input_grad) - backward_api : brelu_grad forward : brelu (Tensor x, float t_min, float t_max) -> Tensor(out) @@ -218,6 +227,7 @@ param : [x] kernel : func : brelu_grad + inplace : (out_grad -> x_grad) - backward_api : cast_grad forward : cast (Tensor x, DataType out_dtype) -> Tensor(out) @@ -240,6 +250,7 @@ param: [out_grad] kernel : func : ceil_grad + inplace : (out_grad -> x_grad) - backward_api : celu_double_grad forward : celu_grad(Tensor x, Tensor grad_out, float alpha) -> Tensor(grad_x) @@ -261,6 +272,7 @@ kernel : func : celu_grad backward : celu_double_grad + inplace : (out_grad -> x_grad) - backward_api : cholesky_grad forward : cholesky (Tensor x, bool upper) -> Tensor(out) @@ -302,6 +314,7 @@ kernel : func : clip_grad backward : clip_double_grad + inplace : (out_grad -> x_grad) - backward_api : concat_double_grad forward : concat_grad (Tensor[] x, Tensor grad_out, Scalar axis) -> Tensor[](grad_x) @@ -394,6 +407,7 @@ param : [x] kernel : func : cos_grad + inplace : (out_grad -> x_grad) - backward_api : cosh_grad forward : cosh (Tensor x) -> Tensor(out) @@ -404,6 +418,7 @@ param : [x] kernel : func : cosh_grad + inplace : (out_grad -> x_grad) - backward_api : cross_entropy_with_softmax_grad forward : cross_entropy_with_softmax (Tensor input, Tensor label, bool soft_label, bool use_softmax, bool numeric_stable_mode, int ignore_index, int axis) -> Tensor(softmax), Tensor(loss) @@ -592,6 +607,7 @@ kernel : func : elu_grad backward : elu_double_grad + inplace : (out_grad -> x_grad) - backward_api : erf_grad forward : erf (Tensor x) -> Tensor(out) @@ -623,6 +639,7 @@ param : [out] kernel : func : exp_grad + inplace : (out_grad -> x_grad) - backward_api : expand_as_grad forward : expand_as (Tensor x, Tensor y, int[] target_shape) -> Tensor(out) @@ -665,6 +682,7 @@ param : [out] kernel : func : expm1_grad + inplace : (out_grad -> x_grad) - backward_api : flatten_grad forward : flatten(Tensor x, int start_axis, int stop_axis) -> Tensor(out), Tensor(xshape) @@ -699,6 +717,7 @@ param: [out_grad] kernel : func : floor_grad + inplace : (out_grad -> x_grad) - backward_api : fmax_grad forward : fmax(Tensor x, Tensor y, int axis) -> Tensor(out) @@ -794,6 +813,7 @@ param : [x] kernel : func : hard_shrink_grad + inplace : (out_grad -> x_grad) - backward_api : hard_sigmoid_grad forward : hard_sigmoid (Tensor x, float slope, float offset) -> Tensor(out) @@ -804,6 +824,7 @@ param : [out] kernel : func : hard_sigmoid_grad + inplace : (out_grad -> x_grad) - backward_api : hard_swish_grad forward : hard_swish (Tensor x, float threshold = 6.0, float scale = 6.0, float offset = 3.0) -> Tensor(out) @@ -814,6 +835,7 @@ param : [x] kernel : func : hard_swish_grad + inplace : (out_grad -> x_grad) - backward_api : huber_loss_grad forward : huber_loss (Tensor input, Tensor label, float delta) -> Tensor(out), Tensor(residual) @@ -930,6 +952,7 @@ kernel : func : leaky_relu_grad backward : leaky_relu_double_grad + inplace : (out_grad -> x_grad) - backward_api : lerp_grad forward : lerp (Tensor x, Tensor y, Tensor weight) -> Tensor(out) @@ -960,6 +983,7 @@ param : [x] kernel : func : log10_grad + inplace : (out_grad -> x_grad) - backward_api : log1p_grad forward : log1p (Tensor x) -> Tensor(out) @@ -970,6 +994,7 @@ param : [x] kernel : func : log1p_grad + inplace : (out_grad -> x_grad) - backward_api : log2_grad forward : log2 (Tensor x) -> Tensor(out) @@ -980,6 +1005,7 @@ 
param : [x] kernel : func : log2_grad + inplace : (out_grad -> x_grad) - backward_api : log_double_grad forward : log_grad (Tensor x, Tensor grad_out) -> Tensor(grad_x) @@ -1001,6 +1027,7 @@ kernel : func : log_grad backward : log_double_grad + inplace : (out_grad -> x_grad) - backward_api : log_loss_grad forward : log_loss (Tensor input, Tensor label, float epsilon) -> Tensor(out) @@ -1041,6 +1068,7 @@ param : [x] kernel : func : logsigmoid_grad + inplace : (out_grad -> x_grad) - backward_api : logsumexp_grad forward : logsumexp(Tensor x, int64_t[] axis, bool keepdim, bool reduce_all) -> Tensor(out) @@ -1222,6 +1250,7 @@ param : [x] kernel : func : mish_grad + inplace : (out_grad -> x_grad) - backward_api : mode_grad forward : mode(Tensor x, int axis, bool keepdim) -> Tensor(out), Tensor(indices) @@ -1451,6 +1480,7 @@ param: [x] kernel : func : pow_grad + inplace : (out_grad -> x_grad) - backward_api : prelu_grad forward : prelu(Tensor x, Tensor alpha, str data_format, str mode) -> Tensor(out) @@ -1500,6 +1530,7 @@ param : [out] kernel : func : reciprocal_grad + inplace : (out_grad -> x_grad) - backward_api : reduce_prod_grad forward : reduce_prod (Tensor x, int64_t[] dims, bool keep_dim, bool reduce_all) -> Tensor(out) @@ -1531,6 +1562,7 @@ kernel : func : relu_grad backward: relu_double_grad + inplace : (out_grad -> x_grad) - backward_api : reshape_double_grad forward : reshape_grad (Tensor xshape, Tensor grad_out) -> Tensor(grad_x) @@ -1605,6 +1637,7 @@ param: [out_grad] kernel : func : round_grad + inplace : (out_grad -> x_grad) - backward_api : rsqrt_double_grad forward : rsqrt_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) @@ -1626,6 +1659,7 @@ kernel : func : rsqrt_grad backward : rsqrt_double_grad + inplace : (out_grad -> x_grad) - backward_api : scale_double_grad forward : scale_grad (Tensor grad_out, Scalar scale, float bias, bool bias_after_scale) -> Tensor(grad_x) @@ -1701,6 +1735,7 @@ param : [x] kernel : func : sigmoid_cross_entropy_with_logits_grad + inplace : (out_grad -> x_grad) - backward_api : sigmoid_double_grad forward : sigmoid_grad (Tensor out, Tensor fwd_grad_out) -> Tensor(grad_x) @@ -1723,6 +1758,7 @@ kernel : func : sigmoid_grad backward : sigmoid_double_grad + inplace : (out_grad -> x_grad) - backward_api : sigmoid_triple_grad forward : sigmoid_double_grad (Tensor out, Tensor fwd_grad_out, Tensor grad_grad_x) -> Tensor(grad_out), Tensor(grad_grad_out) @@ -1744,6 +1780,7 @@ param : [x] kernel : func : silu_grad + inplace : (out_grad -> x_grad) - backward_api : sin_grad forward : sin (Tensor x) -> Tensor(out) @@ -1754,6 +1791,7 @@ param : [x] kernel : func : sin_grad + inplace : (out_grad -> x_grad) - backward_api : sinh_grad forward : sinh (Tensor x) -> Tensor(out) @@ -1764,6 +1802,7 @@ param : [x] kernel : func : sinh_grad + inplace : (out_grad -> x_grad) - backward_api : slice_grad forward : slice (Tensor input, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(out) @@ -1785,6 +1824,7 @@ param : [x] kernel : func : soft_shrink_grad + inplace : (out_grad -> x_grad) - backward_api : softmax_grad forward : softmax (Tensor x, int axis) -> Tensor(out) @@ -1824,6 +1864,7 @@ kernel : func : sqrt_grad backward : sqrt_double_grad + inplace : (out_grad -> x_grad) - backward_api : square_double_grad forward : square_grad (Tensor x, Tensor grad_out) -> Tensor(grad_x) @@ -1845,6 +1886,7 @@ kernel : func : square_grad backward : square_double_grad + inplace : (out_grad -> x_grad) - backward_api : 
squeeze_double_grad forward : squeeze_grad(Tensor xshape, Tensor grad_out, int[] axes) -> Tensor(grad_x) @@ -1946,6 +1988,7 @@ param : [x] kernel : func : swish_grad + inplace : (out_grad -> x_grad) - backward_api : take_along_axis_grad forward : take_along_axis (Tensor x, Tensor index, int axis) -> Tensor(out) @@ -1966,6 +2009,7 @@ param : [x] kernel : func : tan_grad + inplace : (out_grad -> x_grad) - backward_api : tanh_double_grad forward : tanh_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) @@ -1988,6 +2032,7 @@ kernel : func : tanh_grad backward : tanh_double_grad + inplace : (out_grad -> x_grad) - backward_api : tanh_shrink_grad forward : tanh_shrink (Tensor x) -> Tensor(out) @@ -1998,6 +2043,7 @@ param : [x] kernel : func : tanh_shrink_grad + inplace : (out_grad -> x_grad) - backward_api : tanh_triple_grad forward : tanh_double_grad (Tensor out, Tensor grad_out_forward, Tensor grad_x_grad_forward) -> Tensor(grad_out_new), Tensor(grad_out_grad) @@ -2018,6 +2064,7 @@ param : [x] kernel : func : thresholded_relu_grad + inplace : (out_grad -> x_grad) - backward_api : tile_double_grad forward : tile_grad (Tensor x, Tensor grad_out, IntArray repeat_times) -> Tensor(grad_x) From ed2886de81de7fd4457a6e69bed435212c15404d Mon Sep 17 00:00:00 2001 From: pangyoki Date: Mon, 30 May 2022 20:36:01 +0800 Subject: [PATCH 076/109] support backward inplace in eager fluid dygraph mode (#43054) * support backward inplace in eager fluid mode * fix * fix * optimize format * little change --- .../auto_code_generator/eager_generator.cc | 159 ++++++++++++++---- 1 file changed, 130 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 3edd13ccd597f..521b952a4dfcd 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -231,6 +231,15 @@ class GradNodeGenerationInfo { return &no_need_buffer_ins_; } + const std::unordered_map& GetBackwardInplaceMap() + const { + return backward_inplace_map_; + } + std::unordered_map* + GetMutableBackwardInplaceMap() { + return &backward_inplace_map_; + } + private: std::string op_base_type_; std::map grad_outs_slotname_map_; @@ -244,6 +253,7 @@ class GradNodeGenerationInfo { grad_outs_; paddle::framework::AttributeMap grad_attrs_; std::unordered_set no_need_buffer_ins_; + std::unordered_map backward_inplace_map_; }; public: @@ -979,6 +989,12 @@ static bool CollectGradInformationFromOpInfo( *(*op_base_infos)[index].GetMutableNoNeedBufferInputs() = inferer(g_ins, g_outs, *op_base_grad_attrs); } + + auto& infer_backward_inplace = op_base.Info().infer_inplace_; + if (infer_backward_inplace) { + *(*op_base_infos)[index].GetMutableBackwardInplaceMap() = + infer_backward_inplace(true); + } } /* ------ Slot Name Matching ---- */ @@ -1005,7 +1021,7 @@ static std::string GenerateGradNodeCreationContent( const ForwardGenerationInfo& fwd_info, const GradNodeGenerationInfo& bwd_info, const std::string& trace_op_body_str, - std::map inplace_map = {}) { + std::map forward_inplace_map = {}) { VLOG(6) << "Generating GradNode Creation codes"; const std::string& op_type = fwd_info.GetOpType(); @@ -1045,8 +1061,10 @@ static std::string GenerateGradNodeCreationContent( } else { // In inplace op, the case where output is duplicable is not considered. // Replace output directly with input in inplace op. 
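      // For an inplace op the output reuses the input's Variable, so its
      // autograd meta is looked up under the forward inplace input's name.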
- if (!inplace_map.empty() && inplace_map.count(output_name)) { - auto inplace_input_name = LegalizeVarName(inplace_map[output_name]); + if (!forward_inplace_map.empty() && + forward_inplace_map.count(output_name)) { + auto inplace_input_name = + LegalizeVarName(forward_inplace_map[output_name]); const std::string& inplace_input_autograd_name = "p_autograd_" + inplace_input_name; const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = @@ -1103,12 +1121,12 @@ static std::string GenerateGradNodeCreationContent( // check inplace input to avoid inplace operations on leaf nodes with // stop_gradient=False. std::string check_inplace_str = ""; - if (!inplace_map.empty()) { + if (!forward_inplace_map.empty()) { const char* CHECKING_INPLACE_TEMPLATE = " // Check Inplace\n" " egr::EagerUtils::CheckInplace(%s, p_autograd_%s, " "require_any_grad);\n"; - for (auto& inplace_pair : inplace_map) { + for (auto& inplace_pair : forward_inplace_map) { std::string inplace_name = LegalizeVarName(inplace_pair.second); check_inplace_str += paddle::string::Sprintf(CHECKING_INPLACE_TEMPLATE, inplace_name, inplace_name); @@ -1161,8 +1179,9 @@ static std::string GenerateGradNodeCreationContent( const char* SET_TENSOR_WRAPPER_TEMPLATE = " grad_node->SetTensorWrapper%s(%s);\n"; // Replace output directly with input in inplace op. - if (!inplace_map.empty() && inplace_map.count(tensor_wrapper_name)) { - auto inplace_input_name = inplace_map[tensor_wrapper_name]; + if (!forward_inplace_map.empty() && + forward_inplace_map.count(tensor_wrapper_name)) { + auto inplace_input_name = forward_inplace_map[tensor_wrapper_name]; grad_node_creation_str += paddle::string::Sprintf( SET_TENSOR_WRAPPER_TEMPLATE, LegalizeVarName(tensor_wrapper_name), LegalizeVarName(inplace_input_name)); @@ -1213,8 +1232,9 @@ static std::string GenerateGradNodeCreationContent( for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); // Replace output directly with input in inplace op. 
- if (!inplace_map.empty() && inplace_map.count(output_name)) { - auto inplace_input_name = inplace_map[output_name]; + if (!forward_inplace_map.empty() && + forward_inplace_map.count(output_name)) { + auto inplace_input_name = forward_inplace_map[output_name]; const std::string& inplace_input_autograd_name = "p_autograd_" + LegalizeVarName(inplace_input_name); size_t output_position = fwd_outputs_name_pos_map.at(output_name); @@ -1345,7 +1365,7 @@ static std::string GenerateGradNodeCreationContent( static std::pair GenerateForwardFunctionContents( const ForwardGenerationInfo& fwd_info, const GradNodeGenerationInfo& bwd_info, - std::map inplace_map = {}) { + std::map forward_inplace_map = {}) { /* --- Process Forward Info ---*/ const std::string& op_type = fwd_info.GetOpType(); const std::unordered_map& fwd_inputs_name_pos_map = @@ -1434,8 +1454,8 @@ static std::pair GenerateForwardFunctionContents( // inplace tensor can't be const const char* FWD_INS_ARG_TEMPLATE; bool flag_find_input_name = false; - if (!inplace_map.empty()) { - for (auto& inplace_pair : inplace_map) { + if (!forward_inplace_map.empty()) { + for (auto& inplace_pair : forward_inplace_map) { if (inplace_pair.second == input_name) { flag_find_input_name = true; FWD_INS_ARG_TEMPLATE = "paddle::experimental::Tensor& %s"; @@ -1605,15 +1625,16 @@ static std::pair GenerateForwardFunctionContents( } core_ops_args_info[op_type].push_back(output_name); - } else if (!inplace_map.empty() && inplace_map.count(output_name)) { + } else if (!forward_inplace_map.empty() && + forward_inplace_map.count(output_name)) { // In inplace op, replace the output with the input directly. PADDLE_ENFORCE_NE( - inplace_map[output_name], "", + forward_inplace_map[output_name], "", paddle::platform::errors::InvalidArgument( "Inplace op %s has no input corresponding to output %s.", op_type, output_name)); const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", ins[\"%s\"] },"; - auto inplace_input_name = inplace_map[output_name]; + auto inplace_input_name = forward_inplace_map[output_name]; outs_contents_str += paddle::string::Sprintf( FWD_OUTS_CONTENT_TEMPLATE, output_name, inplace_input_name); @@ -1651,7 +1672,7 @@ static std::pair GenerateForwardFunctionContents( if (inplace_mapping_str.size() > 0) inplace_mapping_str.pop_back(); // Remove trailing "," - if ((op_type != "cast") && (inplace_map.empty())) { + if ((op_type != "cast") && (forward_inplace_map.empty())) { VLOG(6) << "Generating Dygraph Forward AMP"; const char* AMP_LOGIC_CONTEXT = " if (egr::Controller::Instance().GetAMPLevel() != " @@ -1743,7 +1764,7 @@ static std::pair GenerateForwardFunctionContents( VLOG(6) << "Generated Outs Map"; // [Generation] Apply View Strategy (Tensor) - if (inplace_map.empty() && view_op_map.count(op_type)) { + if (forward_inplace_map.empty() && view_op_map.count(op_type)) { const char* HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT = " if (ins.count(\"%s\") && outs.count(\"%s\")) {\n" " egr::EagerUtils::HandleViewBetweenInputAndOutput(ins[\"%s\"][0], " @@ -1852,10 +1873,11 @@ static std::pair GenerateForwardFunctionContents( output_varname, output_var_args_name); } } else { - if (!inplace_map.empty() && inplace_map.count(output_name)) { + if (!forward_inplace_map.empty() && + forward_inplace_map.count(output_name)) { // Modify meta info of inplace tensor. // Bump inplace version of inplace tensor. 
- auto inplace_input_name = inplace_map[output_name]; + auto inplace_input_name = forward_inplace_map[output_name]; const char* FWD_OUT_TENSOR_TEMPLATE = " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n" " %s.bump_inplace_version();\n" @@ -1878,10 +1900,11 @@ static std::pair GenerateForwardFunctionContents( return_types[return_position] = "paddle::experimental::Tensor"; } - if (!inplace_map.empty() && inplace_map.count(output_name)) { + if (!forward_inplace_map.empty() && + forward_inplace_map.count(output_name)) { // Replace output directly with input in inplace op. return_contents[return_position] = - LegalizeVarName(inplace_map[output_name]); + LegalizeVarName(forward_inplace_map[output_name]); } else { return_contents[return_position] = output_varname; } @@ -1903,7 +1926,7 @@ static std::pair GenerateForwardFunctionContents( // If GradNode needs to be generated, pass `trace_op_body_str` // into `GenerateGradNodeCreationContent`. std::string grad_node_creation_body_str = GenerateGradNodeCreationContent( - fwd_info, bwd_info, trace_op_body_str, inplace_map); + fwd_info, bwd_info, trace_op_body_str, forward_inplace_map); generated_function_body += grad_node_creation_body_str; generated_function_body += "\n"; @@ -1960,7 +1983,7 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Full Function std::string function_name; - if (inplace_map.empty()) { + if (forward_inplace_map.empty()) { function_name = op_type + "_dygraph_function"; } else { // change function_name for inplace op. @@ -2013,6 +2036,7 @@ static std::string GenerateSingleOpBase( std::vector>>& grad_outs, const paddle::framework::AttributeMap& grad_attrs, + const std::unordered_map& backward_inplace_map, bool is_op_base_per_duplicable_input, size_t* outs_size) { std::string generated_grad_function_body = ""; @@ -2029,6 +2053,23 @@ static std::string GenerateSingleOpBase( for (const auto& in : in_vars) { if (in.duplicable()) duplicable_input_name_set.insert(in.name()); } + const char* CHECK_BACKWARD_INPLACE_TEMPLATE = + " // Check backward inplace info\n" + " bool %s = false;\n" + " %s\n" + " if (%s.initialized()) {\n" + " VLOG(10) << %s.name() << \"(%s) use_count: \" << " + "%s.impl().use_count();\n" + " if (%s.impl().use_count() == 1 || (%s.impl().use_count() == 2 && " + "%s.impl().get() == %s.impl().get())) {\n" + " %s = true;\n" + " }\n" + " }\n"; + const std::string& can_be_inplaced_name = + "can_be_inplaced" + std::to_string(*outs_size); + const std::string& bwd_inplace_input_name = + "backward_inplace_tensor" + std::to_string(*outs_size); + bool process_backward_inplace = false; std::string ins_contents_str = ""; for (auto iter : grad_ins) { const std::string& grad_input_name = iter.first; @@ -2051,7 +2092,26 @@ static std::string GenerateSingleOpBase( ins_contents_str += paddle::string::Sprintf(GRAD_INS_FWD_CONTENT_TEMPLATE, grad_input_name, struct_fwd_input_name); - + if (!backward_inplace_map.empty() && + backward_inplace_map.count(grad_input_name)) { + process_backward_inplace = true; + const char* GRAD_INS_FWD_TENSOR_WRAPPER_TEMPLATE = + "auto %s = egr::EagerUtils::RecoverTensorWrapper(&this->%s);"; + std::string tensor_wrapper_str = paddle::string::Sprintf( + GRAD_INS_FWD_TENSOR_WRAPPER_TEMPLATE, bwd_inplace_input_name, + struct_fwd_input_name); + const char* GRAD_INS_FWD_TENSOR_TEMPLATE = + "(&this->%s)->get_intermidiate_tensor()"; + std::string tensor_wrapper_intermidiate_tensor_str = + paddle::string::Sprintf(GRAD_INS_FWD_TENSOR_TEMPLATE, + struct_fwd_input_name); + 
generated_grad_function_body += paddle::string::Sprintf( + CHECK_BACKWARD_INPLACE_TEMPLATE, can_be_inplaced_name, + tensor_wrapper_str, bwd_inplace_input_name, bwd_inplace_input_name, + grad_input_name, bwd_inplace_input_name, bwd_inplace_input_name, + bwd_inplace_input_name, bwd_inplace_input_name, + tensor_wrapper_intermidiate_tensor_str, can_be_inplaced_name); + } } else if (grad_ins_grad_slotname_map.count(grad_input_name)) { // Fwd Tensor's Grad size_t fwd_output_position = fwd_outputs_name_pos_map.at( @@ -2060,7 +2120,24 @@ static std::string GenerateSingleOpBase( "{ \"%s\", egr::EagerUtils::TrySyncToVars(hooked_grads[%d]) },"; ins_contents_str += paddle::string::Sprintf( GRAD_INS_GRAD_CONTENT_TEMPLATE, grad_input_name, fwd_output_position); - + if (!backward_inplace_map.empty() && + backward_inplace_map.count(grad_input_name)) { + process_backward_inplace = true; + const char* GRAD_INS_HOOKED_GRAD_TEMPLATE = + "auto& %s = hooked_grads[%d][0];"; + std::string hooked_grads_tensor_str = paddle::string::Sprintf( + GRAD_INS_HOOKED_GRAD_TEMPLATE, bwd_inplace_input_name, + fwd_output_position); + const char* GRAD_INS_GRAD_TENSOR_TEMPLATE = "grads[%d][0]"; + std::string grads_tensor_str = paddle::string::Sprintf( + GRAD_INS_GRAD_TENSOR_TEMPLATE, fwd_output_position); + generated_grad_function_body += paddle::string::Sprintf( + CHECK_BACKWARD_INPLACE_TEMPLATE, can_be_inplaced_name, + hooked_grads_tensor_str, bwd_inplace_input_name, + bwd_inplace_input_name, grad_input_name, bwd_inplace_input_name, + bwd_inplace_input_name, bwd_inplace_input_name, + bwd_inplace_input_name, grads_tensor_str, can_be_inplaced_name); + } } else { PADDLE_THROW(platform::errors::Fatal( "Detected mismatched slot names." @@ -2245,6 +2322,27 @@ static std::string GenerateSingleOpBase( VLOG(6) << "Generated Outs Map"; + // [Generation] Process Backward Inplace + if (process_backward_inplace) { + const char* HANDLE_BACKWARD_INPLACE_BETWEEN_INPUT_AND_OUTPUT = + " if (%s && %s.count(\"%s\") && %s.count(\"%s\")) {\n" + " egr::EagerUtils::HandleViewBetweenInputAndOutput(%s[\"%s\"][0], " + "%s[\"%s\"][0]);\n" + " };\n"; + std::string backward_inplace_map_str = ""; + for (auto iter : backward_inplace_map) { + std::string backward_inplace_input_name = iter.first; + std::string backward_inplace_output_name = iter.second; + backward_inplace_map_str += paddle::string::Sprintf( + HANDLE_BACKWARD_INPLACE_BETWEEN_INPUT_AND_OUTPUT, + can_be_inplaced_name, ins_name, backward_inplace_input_name, + outs_name, backward_inplace_output_name, ins_name, + backward_inplace_input_name, outs_name, backward_inplace_output_name); + } + generated_grad_function_body += backward_inplace_map_str; + VLOG(6) << "Process Backward Inplace"; + } + // [Generation] Get Attrs Map const char* ATTRS_TEMPLATE = " auto& %s = this->attr_map_;\n"; std::string grad_attrs_str = @@ -2428,13 +2526,15 @@ static std::string GenerateGradNodeCCContents( const auto& grad_ins = op_base_info.GetGradIns(); const auto& grad_outs = op_base_info.GetGradOuts(); const auto& grad_attrs = op_base_info.GetGradAttrs(); + const auto& backward_inplace_map = op_base_info.GetBackwardInplaceMap(); const std::string& op_base_type = op_base_info.GetOpBaseType(); generated_grad_function_body += GenerateSingleOpBase( fwd_op_type, op_base_type, fwd_inputs_name_pos_map, fwd_outputs_name_pos_map, in_vars, grad_ins_fwd_slotname_map, grad_ins_grad_slotname_map, grad_outs_slotname_map, grad_ins, grad_outs, - grad_attrs, is_op_base_per_duplicable_input, &outs_size); + grad_attrs, 
backward_inplace_map, is_op_base_per_duplicable_input, + &outs_size); } if (is_op_base_per_duplicable_input) { @@ -2847,19 +2947,20 @@ static void DygraphCodeGeneration(const std::string& output_dir) { auto& infer_inplace = paddle::framework::OpInfoMap::Instance().Get(op_type).infer_inplace_; - std::map inplace_map; + std::map forward_inplace_map; // Inplace Function Generator. // `sum` op has duplicate input. Don't consider adding inplace strategy // for `sum` in temporary. if (infer_inplace && !special_inplace_op_set.count(op_type)) { auto in_to_outs = infer_inplace(true); for (auto& inplace_pair : in_to_outs) { - inplace_map[inplace_pair.second] = inplace_pair.first; + forward_inplace_map[inplace_pair.second] = inplace_pair.first; } VLOG(6) << "-------- GenerateInplaceForwardFunctionContents -------"; std::pair inplace_body_and_declaration = - GenerateForwardFunctionContents(fwd_info, bwd_info, inplace_map); + GenerateForwardFunctionContents(fwd_info, bwd_info, + forward_inplace_map); fwd_function_str += inplace_body_and_declaration.first + "\n"; From 1448520d45d18c7272332f1d10247ab1c287b234 Mon Sep 17 00:00:00 2001 From: shentanyue <34421038+shentanyue@users.noreply.github.com> Date: Mon, 30 May 2022 21:39:23 +0800 Subject: [PATCH 077/109] [TensorRT] Fix delete fill_constant pass (#43053) * update lite compile cmake * Update delete_fill_constant_op_pass.cc * Update analysis_config.cc --- .../ir/delete_fill_constant_op_pass.cc | 20 ++++++++++++------- .../inference/analysis/ir_pass_manager.cc | 5 +++++ paddle/fluid/inference/api/analysis_config.cc | 5 ----- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc b/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc index e86bb2926b640..79a06572d1427 100644 --- a/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc @@ -30,13 +30,19 @@ void FillConstData(LoDTensor* out_t, T value) { void DeleteFillConstantOpPass::ApplyImpl(ir::Graph* graph) const { FusePassBase::Init("delete_fill_constant_op_pass", graph); GraphPatternDetector detector; - auto fill_constant_op = detector.mutable_pattern() - ->NewNode("fill_constant") - ->assert_is_op("fill_constant") - ->assert_is_not_op_input("ValueTensor") - ->assert_is_not_op_input("str_value") - ->assert_is_not_op_input("ShapeTensor") - ->assert_is_not_op_input("ShapeTensorList"); + auto fill_constant_op = + detector.mutable_pattern() + ->NewNode("fill_constant") + ->assert_is_op("fill_constant") + ->assert_is_not_op_input("ValueTensor") + ->assert_is_not_op_input("str_value") + ->assert_is_not_op_input("ShapeTensor") + ->assert_is_not_op_input("ShapeTensorList") + ->assert_more([&](Node* node) { + return node->Op() + ->GetAttrIfExists>("shape") + .size() == 1; + }); auto fill_constant_out = detector.mutable_pattern() ->NewNode("fill_constant_out") diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index b2d8afaa7b49c..aafbe57e05ff2 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -273,6 +273,11 @@ std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) { if (pass->Type() != "graph_viz_pass" && !disable_logs_) { PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type()); } + // delete_fill_constant_op_pass is not apply under trt dynamic shape + if (pass->Type() == "delete_fill_constant_op_pass") { + bool 
use_dynamic = pass->Get("with_dynamic_shape"); + if (use_dynamic) continue; + } graph.reset(pass->Apply(graph.release())); } return graph; diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index adc3fc46f72ac..735e1b7be4c1f 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -633,11 +633,6 @@ void AnalysisConfig::Update() { (pass == "conv_bn_fuse_pass")) { continue; } - // delete_fill_constant_op_pass is not used under trt dynamic shape - if ((!min_input_shape_.empty() || trt_tuned_dynamic_shape_) && - pass == "delete_fill_constant_op_pass") { - continue; - } pass_builder()->AppendPass(pass); } } From e1e0deed64bd879357b9fc28ff68770f8eae87a6 Mon Sep 17 00:00:00 2001 From: heliqi <1101791222@qq.com> Date: Mon, 30 May 2022 08:48:10 -0500 Subject: [PATCH 078/109] fix scale_matmul fuse pass (#43089) --- paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc index 0fc458723ffe4..60d661f7740d0 100644 --- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc @@ -91,6 +91,10 @@ void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const { GET_IR_NODE_FROM_SUBGRAPH(scale_out, scale_out, scale_matmul_pattern); GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, scale_matmul_pattern); + if ((scale_out->outputs).size() != 1) { + return; + } + if (scale_op->Op()->GetAttrIfExists("bias") == 0.0) { auto matmul_alpha = matmul_op->Op()->GetAttrIfExists("alpha"); auto scale_scale = scale_op->Op()->GetAttrIfExists("scale"); From dceccd9d1b9ccc8e0f352932401f18864dc49f47 Mon Sep 17 00:00:00 2001 From: Li Min <11663212+limin2021@users.noreply.github.com> Date: Mon, 30 May 2022 22:02:21 +0800 Subject: [PATCH 079/109] Add fused_bias_dropout_residual_ln op and layer. (#43062) * add fused_bias_dropout_residual_ln op and layer. 
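A minimal usage sketch of the new layer, for reference while reading the diff
below. The class name, constructor arguments, and the CUDA-only availability
are inferred from the files this patch touches and should be treated as
assumptions rather than a finalized public API.

    # Sketch only: assumes the layer is exported as
    # paddle.incubate.nn.FusedBiasDropoutResidualLayerNorm and that the fused
    # op is available (CUDA build of Paddle).
    import paddle
    from paddle.incubate.nn import FusedBiasDropoutResidualLayerNorm

    paddle.set_device('gpu')
    x = paddle.rand((2, 4, 128))          # e.g. the attention out-linear output
    residual = paddle.rand((2, 4, 128))   # the residual branch
    layer = FusedBiasDropoutResidualLayerNorm(embed_dim=128, dropout_rate=0.5)
    # Single fused kernel for y = layer_norm(residual + dropout(bias + x)).
    y = layer(x, residual)
    print(y.shape)  # [2, 4, 128]

Judging by the functional module touched below, the same computation also
appears to be exposed as
paddle.incubate.nn.functional.fused_bias_dropout_residual_layer_norm; the
exact functional signature is likewise an assumption here.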
--- paddle/fluid/operators/fused/CMakeLists.txt | 2 + ...sed_bias_dropout_residual_layer_norm_op.cc | 240 ++++++++++++++++++ ...sed_bias_dropout_residual_layer_norm_op.cu | 148 +++++++++++ paddle/fluid/pybind/op_function_generator.h | 4 + .../fluid/tests/unittests/CMakeLists.txt | 2 + ...sed_bias_dropout_residual_layer_norm_op.py | 151 +++++++++++ ...bias_dropout_residual_layer_norm_op_api.py | 175 +++++++++++++ python/paddle/incubate/nn/__init__.py | 2 + .../paddle/incubate/nn/functional/__init__.py | 2 + .../nn/functional/fused_transformer.py | 145 +++++++++++ .../incubate/nn/layer/fused_transformer.py | 97 +++++++ tools/parallel_UT_rule.py | 2 + 12 files changed, 970 insertions(+) mode change 100644 => 100755 paddle/fluid/operators/fused/CMakeLists.txt create mode 100644 paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc create mode 100644 paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu create mode 100644 python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op_api.py diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt old mode 100644 new mode 100755 index a86d26bcd58a7..e23891d899de6 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -22,6 +22,7 @@ register_operators(EXCLUDES fused_transformer_op fused_feedforward_op fused_multi_transformer_op + fused_bias_dropout_residual_layer_norm_op resnet_unit_op fused_gemm_epilogue_op fused_gate_attention_op) @@ -81,6 +82,7 @@ if (WITH_GPU OR WITH_ROCM) # fused_attention_op op_library(fused_attention_op) op_library(fused_multi_transformer_op) + op_library(fused_bias_dropout_residual_layer_norm_op) endif() # resnet_unit needs cudnn 8.0 above if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000)) diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc new file mode 100644 index 0000000000000..6187544456b37 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc @@ -0,0 +1,240 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class FusedBiasDropoutResidualLnOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", + "FusedBiasDropoutResidualLnOp"); + OP_INOUT_CHECK(ctx->HasOutput("LnMean"), "Output", "LnMean", + "FusedBiasDropoutResidualLnOp"); + OP_INOUT_CHECK(ctx->HasOutput("LnVariance"), "Output", "LnVariance", + "FusedBiasDropoutResidualLnOp"); + OP_INOUT_CHECK(ctx->HasOutput("BiasDropoutResidualOut"), "Output", + "BiasDropoutResidualOut", "FusedBiasDropoutResidualLnOp"); + OP_INOUT_CHECK(ctx->HasOutput("DropoutMaskOut"), "Output", "DropoutMaskOut", + "FusedBiasDropoutResidualLnOp"); + OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", + "FusedBiasDropoutResidualLnOp"); + auto x_dim = ctx->GetInputDim("X"); + int left = 1; + for (int i = 0; i < x_dim.size() - 1; i++) { + left *= x_dim[i]; + } + ctx->SetOutputDim("BiasDropoutResidualOut", ctx->GetInputDim("X")); + if (ctx->Attrs().Get("dropout_is_test") == false) { + ctx->SetOutputDim("DropoutMaskOut", ctx->GetInputDim("X")); + } + ctx->SetOutputDim("LnMean", {left}); + ctx->SetOutputDim("LnVariance", {left}); + ctx->SetOutputDim("Y", ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input = ctx.Input("X"); + auto input_data_type = framework::TransToProtoVarType(input->dtype()); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; + +class FusedBiasDropoutResidualLnOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensor."); + AddInput("Residual", "The residual tensor."); + AddInput("Bias", "The linear bias tensor.").AsDispensable(); + AddInput("LnScale", + "(optional) Scale is a 1-dimensional tensor of size " + "H. Here, H represents the last dimension of its input tensor.") + .AsDispensable(); + AddInput("LnBias", + "(optional) Bias is a 1-dimensional tensor of size " + "H. Here, H represents the last dimension of its input tensor.") + .AsDispensable(); + AddOutput("BiasDropoutResidualOut", "Output of bias + dropout + residual.") + .AsIntermediate(); + AddOutput("DropoutMaskOut", "The random sampled dropout mask.") + .AsIntermediate(); + AddOutput("LnMean", "Mean of the current mini batch.").AsIntermediate(); + AddOutput("LnVariance", "Variance of the current mini batch.") + .AsIntermediate(); + AddOutput("Y", "Result."); + AddAttr("dropout_rate", "Probability of setting units to zero.") + .SetDefault(.5f) + .AddCustomChecker([](const float &drop_p) { + PADDLE_ENFORCE_EQ(drop_p >= 0.0f && drop_p <= 1.0f, true, + platform::errors::InvalidArgument( + "'dropout_rate' must be between 0.0 and 1.0.")); + }); + AddAttr("dropout_is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + AddAttr("dropout_fix_seed", + "A flag indicating whether to use a fixed seed to generate " + "random mask. NOTE: DO NOT set this flag to true in " + "training. 
Setting this flag to true is only useful in " + "unittest or for debug that always the same output units " + "will be dropped.") + .SetDefault(true); + AddAttr("dropout_seed", "Dropout random seed.").SetDefault(0); + AddAttr( + "dropout_implementation", + "[\"downgrade_in_infer\"|\"upscale_in_train\"]" + "The meaning is the same as 'attn_dropout_implementation'.") + .SetDefault("downgrade_in_infer") + .AddCustomChecker([](const std::string &type) { + PADDLE_ENFORCE_EQ( + type == "downgrade_in_infer" || type == "upscale_in_train", true, + platform::errors::InvalidArgument( + "dropout_implementation can only be downgrade_in_infer or " + "upscale_in_train")); + }); + AddAttr("ln_epsilon", + "Constant for numerical stability [default 1e-5].") + .SetDefault(1e-5) + .AddCustomChecker([](const float &ln_epsilon) { + PADDLE_ENFORCE_EQ(ln_epsilon >= 0.0f && ln_epsilon <= 0.001f, true, + platform::errors::InvalidArgument( + "'epsilon' of the LayerNorm should be between " + "0.0 and 0.001, But received [%s].", + ln_epsilon)); + }); + + AddComment(R"DOC( + Add fused bias_dropout_residual_layer_norm op whose logic is as follows: + // @input: [batch_size, seq_len, embed_dim] + // @final_out: [batch_size, seq_len, embed_dim] + y = layer_norm(residual + dropout(bias + x)); + )DOC"); + } +}; + +class FusedBiasDropoutResidualLnGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ( + ctx->Attrs().Get("dropout_is_test"), false, + platform::errors::InvalidArgument( + "GradOp is only callable when dropout_is_test is false")); + OP_INOUT_CHECK(ctx->HasInput("LnMean"), "Input", "LnMean", + "FusedBiasDropoutResidualLnGrad"); + OP_INOUT_CHECK(ctx->HasInput("LnVariance"), "Input", "LnVariance", + "FusedBiasDropoutResidualLnGrad"); + OP_INOUT_CHECK(ctx->HasInput("BiasDropoutResidualOut"), "Input", + "BiasDropoutResidualOut", "FusedBiasDropoutResidualLnGrad"); + if (ctx->HasOutput(framework::GradVarName("LnScale"))) { + ctx->SetOutputDim(framework::GradVarName("LnScale"), + ctx->GetInputDim("LnScale")); + } + if (ctx->HasOutput(framework::GradVarName("LnBias"))) { + ctx->SetOutputDim(framework::GradVarName("LnBias"), + ctx->GetInputDim("LnBias")); + } + if (ctx->HasOutput(framework::GradVarName("Residual"))) { + ctx->SetOutputDim(framework::GradVarName("Residual"), + ctx->GetInputDim("Residual")); + } + if (ctx->HasOutput(framework::GradVarName("Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Bias"), + ctx->GetInputDim("Bias")); + } + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + ctx->SetOutputDim(framework::GradVarName("BiasDropoutResidualOut"), + ctx->GetInputDim("BiasDropoutResidualOut")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input = ctx.Input("X"); + auto input_data_type = framework::TransToProtoVarType(input->dtype()); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; + +template +class FusedBiasDropoutResidualLnGradOpMaker + : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("fused_bias_dropout_residual_layer_norm_grad"); + op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); + op->SetInput("X", 
this->Input("X")); + op->SetInput("Residual", this->Input("Residual")); + if (this->HasInput("Bias")) { + op->SetInput("Bias", this->Input("Bias")); + op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias")); + } + if (this->HasInput("LnScale")) { + op->SetInput("LnScale", this->Input("LnScale")); + op->SetOutput(framework::GradVarName("LnScale"), + this->InputGrad("LnScale")); + } + if (this->HasInput("LnBias")) { + op->SetInput("LnBias", this->Input("LnBias")); + op->SetOutput(framework::GradVarName("LnBias"), + this->InputGrad("LnBias")); + } + if (this->HasOutput("LnMean")) { + op->SetInput("LnMean", this->Output("LnMean")); + } + if (this->HasOutput("LnVariance")) { + op->SetInput("LnVariance", this->Output("LnVariance")); + } + if (this->HasOutput("BiasDropoutResidualOut")) { + op->SetInput("BiasDropoutResidualOut", + this->Output("BiasDropoutResidualOut")); + } + op->SetInput("DropoutMaskOut", this->Output("DropoutMaskOut")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetOutput(framework::GradVarName("Residual"), + this->InputGrad("Residual")); + op->SetOutput(framework::GradVarName("BiasDropoutResidualOut"), + this->OutputGrad("BiasDropoutResidualOut")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + fused_bias_dropout_residual_layer_norm, ops::FusedBiasDropoutResidualLnOp, + ops::FusedBiasDropoutResidualLnOpMaker, + ops::FusedBiasDropoutResidualLnGradOpMaker, + ops::FusedBiasDropoutResidualLnGradOpMaker); +REGISTER_OPERATOR(fused_bias_dropout_residual_layer_norm_grad, + ops::FusedBiasDropoutResidualLnGradOp); diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu new file mode 100644 index 0000000000000..71a2c9728cc6b --- /dev/null +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu @@ -0,0 +1,148 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/fused/fused_dropout_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FusedBiasDropoutResidualLnOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + using U = LayerNormParamType; + auto *input_x = ctx.Input("X"); + auto *bias = ctx.Input("Bias"); + auto *residual = ctx.Input("Residual"); + const float ln_epsilon = ctx.Attr("ln_epsilon"); + auto *ln_scale = ctx.Input("LnScale"); + auto *ln_bias = ctx.Input("LnBias"); + auto *dropout_mask_out = ctx.Output("DropoutMaskOut"); + auto *bias_dropout_residual_out = + ctx.Output("BiasDropoutResidualOut"); + auto *ln_mean = ctx.Output("LnMean"); + auto *ln_var = ctx.Output("LnVariance"); + auto *y = ctx.Output("Y"); + auto *x_data = input_x->data(); + auto *bias_data = (bias == nullptr) ? nullptr : bias->data(); + auto *residual_data = (residual == nullptr) ? nullptr : residual->data(); + auto *ln_scale_data = (ln_scale == nullptr ? nullptr : ln_scale->data()); + auto *ln_bias_data = (ln_bias == nullptr ? nullptr : ln_bias->data()); + auto *bias_dropout_residual_out_data = + bias_dropout_residual_out->mutable_data(ctx.GetPlace()); + auto *ln_mean_data = ln_mean->mutable_data(ctx.GetPlace()); + auto *ln_var_data = ln_var->mutable_data(ctx.GetPlace()); + auto *dropout_mask_out_data = + dropout_mask_out->mutable_data(ctx.GetPlace()); + auto *y_data = y->mutable_data(ctx.GetPlace()); + + const auto input_x_dims = input_x->dims(); + int bsz_seq = 1; + for (int i = 0; i < input_x_dims.size() - 1; i++) { + bsz_seq *= input_x_dims[i]; + } + int dim_embed = input_x_dims[input_x_dims.size() - 1]; + DropoutParam dropout_param(ctx, 0); + FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( + ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param, + ln_epsilon); + // output = layernorm(residual + dropout(input + bias)) + fused_dropout_layernorm_helper.LayernormResidualDropoutBias( + ctx.cuda_device_context(), x_data, residual_data, bias_data, + ln_scale_data, ln_bias_data, bias_dropout_residual_out_data, + dropout_mask_out_data, y_data, ln_mean_data, ln_var_data); + } +}; + +template +class FusedBiasDropoutResidualLnGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + using U = LayerNormParamType; + const float ln_epsilon = ctx.Attr("ln_epsilon"); + + auto *d_y = ctx.Input(framework::GradVarName("Y")); + auto *ln_scale = ctx.Input("LnScale"); + auto *dropout_mask_out = ctx.Input("DropoutMaskOut"); + auto *bias_dropout_residual_out = + ctx.Input("BiasDropoutResidualOut"); + auto *ln_mean = ctx.Input("LnMean"); + auto *ln_var = ctx.Input("LnVariance"); + auto *d_y_data = d_y->data(); + auto *ln_scale_data = (ln_scale == nullptr ? 
nullptr : ln_scale->data()); + auto *dropout_mask_out_data = dropout_mask_out->data(); + auto *bias_dropout_residual_out_data = bias_dropout_residual_out->data(); + auto *ln_mean_data = ln_mean->data(); + auto *ln_var_data = ln_var->data(); + + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_residual = ctx.Output(framework::GradVarName("Residual")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + auto *d_bias_dropout_residual_out = + ctx.Output(framework::GradVarName("BiasDropoutResidualOut")); + auto *d_ln_scale = ctx.Output(framework::GradVarName("LnScale")); + auto *d_ln_bias = ctx.Output(framework::GradVarName("LnBias")); + auto *d_x_data = d_x->mutable_data(ctx.GetPlace()); + auto *d_residual_data = d_residual->mutable_data(ctx.GetPlace()); + auto *d_bias_dropout_residual_out_data = + d_bias_dropout_residual_out->mutable_data(ctx.GetPlace()); + auto *d_bias_data = + (d_bias == nullptr ? nullptr : d_bias->mutable_data(ctx.GetPlace())); + auto *d_ln_scale_data = + (d_ln_scale == nullptr ? nullptr + : d_ln_scale->mutable_data(ctx.GetPlace())); + auto *d_ln_bias_data = + (d_ln_bias == nullptr ? nullptr + : d_ln_bias->mutable_data(ctx.GetPlace())); + + const auto input_x_dims = d_y->dims(); + int bsz_seq = 1; + for (int i = 0; i < input_x_dims.size() - 1; i++) { + bsz_seq *= input_x_dims[i]; + } + int dim_embed = input_x_dims[input_x_dims.size() - 1]; + DropoutParam dropout_param(ctx, 0); + FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( + ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param, + ln_epsilon); + fused_dropout_layernorm_helper.LayernormResidualDropoutBiasGrad( + ctx.cuda_device_context(), d_y_data, bias_dropout_residual_out_data, + dropout_mask_out_data, ln_scale_data, ln_mean_data, ln_var_data, + d_bias_dropout_residual_out_data, d_ln_scale_data, d_ln_bias_data, + d_x_data, d_bias_data, d_residual_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL(fused_bias_dropout_residual_layer_norm, + ops::FusedBiasDropoutResidualLnOpKernel, + ops::FusedBiasDropoutResidualLnOpKernel, + ops::FusedBiasDropoutResidualLnOpKernel); +REGISTER_OP_CUDA_KERNEL( + fused_bias_dropout_residual_layer_norm_grad, + ops::FusedBiasDropoutResidualLnGradKernel, + ops::FusedBiasDropoutResidualLnGradKernel, + ops::FusedBiasDropoutResidualLnGradKernel); diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index df32f65a794f3..bc84863d7d607 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -40,6 +40,8 @@ std::map> op_ins_map = { {"X", "LnScale", "LnBias", "QKVW", "QKVBias", "CacheKV", "TimeStep", "SrcMask", "OutLinearW", "OutLinearBias", "FFNLnScale", "FFNLnBias", "FFN1Weight", "FFN1Bias", "FFN2Weight", "FFN2Bias"}}, + {"fused_bias_dropout_residual_layer_norm", + {"X", "Residual", "Bias", "LnScale", "LnBias"}}, {"instance_norm", {"X", "Scale", "Bias"}}, {"gru_unit", {"Input", "HiddenPrev", "Weight", "Bias"}}, {"label_smooth", {"X", "PriorDist"}}, @@ -152,6 +154,8 @@ std::map> op_outs_map = { "DropoutMaskOut", "Ln2Mean", "Ln2Variance", "BiasDropoutResidualOut", "CacheKVOut", "Y"}}, + {"fused_bias_dropout_residual_layer_norm", + {"BiasDropoutResidualOut", "DropoutMaskOut", "LnMean", "LnVariance", "Y"}}, {"fused_gate_attention", {"QueryTransposeOut", "KeyTransposeOut", "ValueTransposeOut", "QKVTransposeOut", "SoftmaxOut", "FMHAOut", "GateOut", 
"Out"}}, diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index e0cd0c4bf4d41..34237d47a5659 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -131,6 +131,8 @@ if(NOT WITH_GPU) LIST(REMOVE_ITEM TEST_OPS test_fused_attention_op_api) LIST(REMOVE_ITEM TEST_OPS test_fused_multi_transformer_op) LIST(REMOVE_ITEM TEST_OPS test_fused_transformer_encoder_layer) + LIST(REMOVE_ITEM TEST_OPS test_fused_bias_dropout_residual_layer_norm_op) + LIST(REMOVE_ITEM TEST_OPS test_fused_bias_dropout_residual_layer_norm_op_api) endif() LIST(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_op) diff --git a/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py new file mode 100644 index 0000000000000..d47450837a455 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.fluid.core as core +import paddle.nn.functional as F +import paddle.incubate.nn.functional as incubate_f +from paddle.nn.layer.norm import LayerNorm +from paddle.nn.layer.common import Linear, Dropout +from paddle.nn.layer.transformer import _convert_attention_mask +from paddle import tensor +from paddle.fluid import layers +import unittest +from op_test import OpTest +from paddle.fluid.framework import default_main_program + +default_main_program().random_seed = 42 + + +class TestFusedBiasDropoutResidualLayerNormOp(OpTest): + def setUp(self): + self.config() + self.generate_input_data() + paddle.set_default_dtype(self.x_type) + self.__class__.op_type = "fused_bias_dropout_residual_layer_norm" + # use autograd to check grad in this unittest. 
+        self.__class__.no_need_check_grad = True
+        paddle.set_default_dtype(np.float32)
+        self.norm1 = LayerNorm(self.embed_dim)
+        paddle.set_default_dtype(self.x_type)
+        self.dropout = Dropout(self.dropout_prob, mode="upscale_in_train")
+
+    def config(self):
+        self.x_type = np.float32
+        self.atol = 1e-4
+        self.training = True
+        self.batch_size = 8
+        self.query_length = 128
+        self.embed_dim = 1024
+        self.dropout_prob = 0.0
+        self.weight_attr = None
+        self.bias_attr = None
+
+    def generate_input_data(self):
+        self.x = np.random.rand(self.batch_size, self.query_length,
+                                self.embed_dim).astype(self.x_type)
+        self.residual = np.random.rand(self.batch_size, self.query_length,
+                                       self.embed_dim).astype(self.x_type)
+        self.linear_bias = np.random.rand(self.embed_dim).astype(self.x_type)
+        self.dout = np.random.random((self.batch_size, self.query_length,
+                                      self.embed_dim)).astype(self.x_type)
+
+        if self.bias_attr is False:
+            self.tensor_linear_bias = None
+        else:
+            self.tensor_linear_bias = paddle.to_tensor(
+                self.linear_bias, stop_gradient=False)
+
+        self.tensor_x = paddle.to_tensor(self.x, stop_gradient=False)
+        self.tensor_residual = paddle.to_tensor(
+            self.residual, stop_gradient=False)
+
+    def GetBaselineOut(self):
+        paddle.disable_static(place=paddle.CUDAPlace(0))
+
+        if self.tensor_linear_bias is not None:
+            out = self.tensor_x + self.tensor_linear_bias
+        else:
+            out = self.tensor_x
+
+        residual_out = self.tensor_residual + self.dropout(out)
+        final_out = self.norm1(residual_out)
+
+        paddle.autograd.backward(
+            [final_out], [paddle.to_tensor(self.dout)], retain_graph=True)
+
+        if self.tensor_linear_bias is not None:
+            tensor_linear_bias_grad = self.tensor_linear_bias.grad
+        else:
+            tensor_linear_bias_grad = None
+        return final_out, self.tensor_x.grad, self.tensor_residual.grad, tensor_linear_bias_grad
+
+    def GetFusedBiasDropoutResidualLayerNormOut(self):
+        paddle.disable_static(place=paddle.CUDAPlace(0))
+
+        ln_scale = paddle.to_tensor(self.norm1.weight, stop_gradient=False)
+        ln_bias = paddle.to_tensor(self.norm1.bias, stop_gradient=False)
+        epsilon = 1e-05
+
+        final_out = incubate_f.fused_bias_dropout_residual_layer_norm(
+            self.tensor_x, self.tensor_residual, self.tensor_linear_bias,
+            ln_scale, ln_bias, self.dropout_prob, epsilon)
+
+        paddle.autograd.backward(
+            [final_out], [paddle.to_tensor(self.dout)], retain_graph=True)
+        if self.tensor_linear_bias is not None:
+            tensor_linear_bias_grad = self.tensor_linear_bias.grad
+        else:
+            tensor_linear_bias_grad = None
+        return final_out, self.tensor_x.grad, self.tensor_residual.grad, tensor_linear_bias_grad
+
+    def test_fused_op(self):
+        out_ref, x_grad_ref, residual_grad_ref, linear_bias_grad_ref = self.GetBaselineOut(
+        )
+        out, x_grad, residual_grad, linear_bias_grad = self.GetFusedBiasDropoutResidualLayerNormOut(
+        )
+        np.testing.assert_allclose(
+            out_ref, out.numpy(), rtol=1e-5, atol=self.atol)
+        np.testing.assert_allclose(
+            x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=self.atol)
+        np.testing.assert_allclose(
+            residual_grad_ref, residual_grad.numpy(), rtol=1e-5, atol=self.atol)
+        if linear_bias_grad_ref is not None:
+            np.testing.assert_allclose(
+                linear_bias_grad_ref,
+                linear_bias_grad.numpy(),
+                rtol=1e-5,
+                atol=self.atol)
+
+
+class TestFusedBiasDropoutResidualLayerNormOpBiasIsNone(
+        TestFusedBiasDropoutResidualLayerNormOp):
+    def config(self):
+        super().config()
+        self.bias_attr = False
+
+
+class TestFusedBiasDropoutResidualLayerNormOpFp16(
+        TestFusedBiasDropoutResidualLayerNormOp):
+    def config(self):
+        super().config()
+        self.x_type = np.float16
+        self.atol = 1e-1
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op_api.py b/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op_api.py
new file mode 100644
index 0000000000000..19fc3972e58d4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op_api.py
@@ -0,0 +1,175 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+import paddle.fluid.core as core
+import paddle.nn.functional as F
+from paddle.incubate.nn.layer.fused_transformer import FusedBiasDropoutResidualLayerNorm
+from paddle import tensor
+from paddle.fluid import layers
+from paddle.static import Program, program_guard
+import unittest
+
+
+def layer_norm(x, has_scale, has_bias, weight, bias, epsilon=1e-05):
+    batch_size, src_len, d_model = x.shape
+    x = x.reshape((batch_size * src_len, d_model))
+    mu = np.mean(x, axis=1, keepdims=True)
+    sigma_squar = np.sum(np.square(x - mu), axis=1) / d_model
+    x1_up = (x - mu)
+    x1_down_1 = sigma_squar + epsilon
+    x1_down = np.sqrt(x1_down_1)
+    x1_down = x1_down.reshape((x1_down.shape[0], 1))
+    x1 = x1_up / x1_down
+    x_scaled = x1
+    if (has_scale):
+        x_scaled = weight * x1
+    x_scaled_bias = x_scaled
+    if (has_bias):
+        x_scaled_bias = x_scaled + bias
+    x_scaled_bias = x_scaled_bias.reshape((batch_size, src_len, d_model))
+    return x_scaled_bias
+
+
+def compute_reference(x, residual, ln_scale, ln_bias, linear_bias):
+    batch_size = x.shape[0]
+    seq_len = x.shape[1]
+    embed_dim = x.shape[2]
+
+    has_bias = True
+    if ln_bias is None:
+        has_bias = False
+    # bias add, dropout, residual add, layer_norm.
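+    # Note: dropout is treated as an identity mapping in this reference;
+    # the API tests below run with dropout_prob = 0.0, so in
+    # 'upscale_in_train' mode out = input * mask / (1 - p) reduces to the
+    # unmodified input.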
+ if linear_bias is not None: + linear_bias_out = x + linear_bias + else: + linear_bias_out = x + linear_bias_dropout_out = linear_bias_out + linear_bias_dropout_residual_out = residual + linear_bias_dropout_out + linear_bias_dropout_residual_ln_out = layer_norm( + linear_bias_dropout_residual_out, True, has_bias, ln_scale, ln_bias) + return linear_bias_dropout_residual_ln_out + + +class TestFusedBiasDropoutResidualLayerNormAPI(unittest.TestCase): + def setUp(self): + self.setXType() + self.setBiasAttr() + self.config() + self.generate_input_data() + + def setBiasAttr(self): + self.bias_attr = None + + def setXType(self): + self.x_type = np.float32 + self.atol = 1e-4 + + def config(self): + self.training = True + self.batch_size = 1 + self.query_length = 2 + self.embed_dim = 4 + self.dropout_prob = 0.0 + self.weight_attr = None + + def generate_input_data(self): + self.x = np.random.rand(self.batch_size, self.query_length, + self.embed_dim).astype(self.x_type) + self.residual = np.random.rand(self.batch_size, self.query_length, + self.embed_dim).astype(self.x_type) + + def run_imperative(self): + fused_bias_dropout_residual_ln = FusedBiasDropoutResidualLayerNorm( + self.embed_dim, self.dropout_prob, self.weight_attr, self.bias_attr) + + linear_bias = None + if self.bias_attr is not False: + linear_bias = np.random.random(fused_bias_dropout_residual_ln. + linear_bias.shape).astype('float32') + fused_bias_dropout_residual_ln.linear_bias.set_value( + paddle.to_tensor(linear_bias)) + out = fused_bias_dropout_residual_ln( + paddle.to_tensor(self.x), paddle.to_tensor(self.residual)) + + ln_bias = None + if self.bias_attr is not False: + ln_bias = fused_bias_dropout_residual_ln.ln_bias.numpy() + ln_scale = fused_bias_dropout_residual_ln.ln_scale.numpy(), + ref_out = compute_reference(self.x, self.residual, ln_scale, ln_bias, + linear_bias) + np.testing.assert_allclose( + ref_out, out.numpy(), rtol=1e-5, atol=self.atol) + + def run_static(self): + fused_op = FusedBiasDropoutResidualLayerNorm( + self.embed_dim, self.dropout_prob, self.weight_attr, self.bias_attr) + + x = paddle.static.data( + name='X', + shape=[self.batch_size, self.query_length, self.embed_dim], + dtype=self.x_type) + residual = paddle.static.data( + name='Residual', + shape=[self.batch_size, self.query_length, self.embed_dim], + dtype=self.x_type) + final_out = fused_op(x, residual) + + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + + linear_bias = None + ln_bias = None + if self.bias_attr is False: + out, ln_scale = exe.run( + paddle.static.default_main_program(), + feed={"X": self.x, + "Residual": self.residual}, + fetch_list=[final_out, fused_op.ln_scale]) + else: + out, linear_bias, ln_scale, ln_bias = exe.run( + paddle.static.default_main_program(), + feed={"X": self.x, + "Residual": self.residual}, + fetch_list=[ + final_out, fused_op.linear_bias, fused_op.ln_scale, + fused_op.ln_bias + ]) + return out, linear_bias, ln_scale, ln_bias + + def test_static_api(self): + paddle.enable_static() + with paddle.static.program_guard(Program()): + out, linear_bias, ln_scale, ln_bias = self.run_static() + ref_out = compute_reference(self.x, self.residual, ln_scale, ln_bias, + linear_bias) + np.testing.assert_allclose(ref_out, out, rtol=1e-5, atol=self.atol) + + def test_dynamic_api(self): + paddle.disable_static(place=paddle.CUDAPlace(0)) + self.run_imperative() + + +class TestFusedBiasDropoutResidualLayerNormAPIBiasIsNone( + 
TestFusedBiasDropoutResidualLayerNormAPI): + def setBiasAttr(self): + self.bias_attr = False + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/incubate/nn/__init__.py b/python/paddle/incubate/nn/__init__.py index 43fcabf97317e..3c806aa646ebe 100644 --- a/python/paddle/incubate/nn/__init__.py +++ b/python/paddle/incubate/nn/__init__.py @@ -16,10 +16,12 @@ from .layer.fused_transformer import FusedFeedForward # noqa: F401 from .layer.fused_transformer import FusedTransformerEncoderLayer # noqa: F401 from .layer.fused_transformer import FusedMultiTransformer # noqa: F401 +from .layer.fused_transformer import FusedBiasDropoutResidualLayerNorm # noqa: F401 __all__ = [ #noqa 'FusedMultiHeadAttention', 'FusedFeedForward', 'FusedTransformerEncoderLayer', 'FusedMultiTransformer', + 'FusedBiasDropoutResidualLayerNorm', ] diff --git a/python/paddle/incubate/nn/functional/__init__.py b/python/paddle/incubate/nn/functional/__init__.py index 4da090487785b..02e44548ce5d8 100644 --- a/python/paddle/incubate/nn/functional/__init__.py +++ b/python/paddle/incubate/nn/functional/__init__.py @@ -15,9 +15,11 @@ from .fused_transformer import fused_multi_head_attention from .fused_transformer import fused_feedforward from .fused_transformer import fused_multi_transformer +from .fused_transformer import fused_bias_dropout_residual_layer_norm __all__ = [ 'fused_multi_head_attention', 'fused_feedforward', 'fused_multi_transformer', + 'fused_bias_dropout_residual_layer_norm', ] diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index 3e263f1c6d3ae..ee85642d41664 100644 --- a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -212,6 +212,151 @@ def fused_feedforward(x, return out +def fused_bias_dropout_residual_layer_norm(x, + residual, + bias=None, + ln_scale=None, + ln_bias=None, + dropout_rate=0.5, + ln_epsilon=1e-5, + training=True, + mode='upscale_in_train', + name=None): + r""" + The fused_bias_dropout_residual_layer_norm operator. The pseudo code is as follows: + + .. code-block:: python + y = layer_norm(residual + dropout(bias + x)) + + Parameters: + x (Tensor): The input tensor. The shape is `[*, embed\_dim]`. + residual (Tensor): The residual tensor. The shape is same as x. + bias (Tensor, optional): The bias of linear. The shape is `[embed_dim]`. Default None. + ln_scale (Tensor, optional): The weight tensor of layernorm. The shape is `[embed_dim]`. Default None. + ln_bias (Tensor, optional): The bias tensor of layernorm. The shape is `[embed_dim]`. Default None. + dropout_rate (float, optional): The dropout probability used on attention + weights to drop some attention targets for the dropout after attention. + 0 for no dropout. Default 0.5. + ln_epsilon (float, optional): Small float value added to denominator of layer_norm + to avoid dividing by zero. Default is 1e-5. + training (bool, optional): A flag indicating whether it is in train phrase or not. Default True. + mode (str, optional): ['upscale_in_train'(default) | 'downscale_in_infer'] + + 1. upscale_in_train(default), upscale the output at training time + + - train: out = input * mask / ( 1.0 - p ) + - inference: out = input + + 2. downscale_in_infer, downscale the output at inference + + - train: out = input * mask + - inference: out = input * (1.0 - p) + name (str, optional): Name for the operation (optional, default is None). 
For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: The output Tensor, the data type and shape is same as `x`. + + Examples: + + .. code-block:: python + + # required: gpu + import paddle + import paddle.incubate.nn.functional as F + + # input: [batch_size, seq_len, embed_dim] + x = paddle.rand(shape=(2, 4, 128), dtype="float32") + # residual: [batch_size, seq_len, embed_dim] + residual = paddle.rand(shape=(2, 4, 128), dtype="float32") + # linear bias: [embed_dim] + bias = paddle.rand(shape=[128], dtype="float32") + # output: [batch_size, seq_len, embed_dim] + output = F.fused_bias_dropout_residual_layer_norm( + x, residual, bias) + # [2, 4, 128] + print(output.shape) + """ + seed = None + if mode not in ('downscale_in_infer', 'upscale_in_train'): + raise ValueError( + "mode argument should be 'downscale_in_infer' or 'upscale_in_train'") + mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer + + if ln_scale is not None: + assert len(ln_scale. + shape) == 1, "The dims of the shape of ln_scale should be 1." + assert x.shape[len(x.shape) - 1] == ln_scale.shape[ + 0], "The dim of ln_scale must equal to the last dim of x." + if ln_bias is not None: + assert len( + ln_bias.shape) == 1, "The dims of the shape of ln_bias should be 1." + assert x.shape[len(x.shape) - 1] == ln_bias.shape[ + 0], "The dim of ln_bias must equal to the last dim of x." + + if _non_static_mode(): + if default_main_program().random_seed != 0: + seed = default_main_program().random_seed + _, _, _, _, final_out = _C_ops.fused_bias_dropout_residual_layer_norm( + x, residual, bias, ln_scale, ln_bias, 'dropout_rate', dropout_rate, + 'ln_epsilon', ln_epsilon, 'dropout_is_test', not training, + 'dropout_fix_seed', seed is not None, 'dropout_seed', seed + if seed is not None else 0, 'dropout_implementation', mode) + return final_out + else: + helper = LayerHelper('fused_bias_dropout_residual_layer_norm', + **locals()) + dtype = x.dtype + # check dtypes + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'fused_bias_dropout_residual_layer_norm') + check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], + 'fused_bias_dropout_residual_layer_norm') + # set inputs + inputs = dict() + inputs['X'] = [x] + inputs['Residual'] = [residual] + if bias is not None: + inputs['Bias'] = [bias] + if ln_scale: + inputs['LnScale'] = [ln_scale] + if ln_bias: + inputs['LnBias'] = [ln_bias] + if (seed is None or seed == 0) and helper.main_program.random_seed != 0: + seed = helper.main_program.random_seed + # set attrs + attrs = { + 'ln_epsilon': ln_epsilon, + 'dropout_rate': dropout_rate, + 'dropout_is_test': not training, + 'dropout_fix_seed': seed is not None, + 'dropout_seed': seed if seed is not None else 0, + 'dropout_implementation': mode, + } + # set outputs + dropout_mask_out = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.UINT8, stop_gradient=True) + ln_mean_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + ln_variance_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True) + bias_dropout_residual_out = helper.create_variable_for_type_inference( + dtype=dtype) + final_out = helper.create_variable_for_type_inference(dtype=dtype) + + helper.append_op( + type='fused_bias_dropout_residual_layer_norm', + inputs=inputs, + outputs={ + "BiasDropoutResidualOut": bias_dropout_residual_out, + "DropoutMaskOut": dropout_mask_out, + "LnMean": ln_mean_out, + "LnVariance": 
ln_variance_out,
+                'Y': final_out,
+            },
+            attrs=attrs)
+        return final_out
+
+
 def fused_multi_head_attention(x,
                                qkv_weight,
                                linear_weight,
diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py
index 072c7d9fccade..a64b7e506021c 100644
--- a/python/paddle/incubate/nn/layer/fused_transformer.py
+++ b/python/paddle/incubate/nn/layer/fused_transformer.py
@@ -36,6 +36,103 @@ def _set_var_distributed(var):
         main_block._find_var_recursive(var.name).is_distributed = True
 
 
+class FusedBiasDropoutResidualLayerNorm(Layer):
+    """
+    Applies fused_bias_dropout_residual_layer_norm operation.
+
+    Parameters:
+        embed_dim (int): The expected feature size in the input and output.
+        dropout_rate (float, optional): The dropout probability used on the
+            bias-added input (``bias + x``) before it is added to the residual.
+            0 for no dropout. Default 0.5.
+        bias_attr (ParamAttr|bool, optional): To specify the bias parameter property.
+            Default: None, which means the default bias parameter property is used.
+            If it is set to False, this layer will not have trainable bias parameter.
+            See usage for details in :code:`ParamAttr`.
+        epsilon (float, optional): The small value added to the variance to prevent
+            division by zero. Default: 1e-05.
+
+    Examples:
+
+        .. code-block:: python
+
+            # required: gpu
+            import paddle
+            # input: [batch_size, seq_len, embed_dim]
+            x = paddle.rand((2, 4, 128))
+            # residual: [batch_size, seq_len, embed_dim]
+            residual = paddle.rand((2, 4, 128))
+            fused_bias_dropout_residual_ln = paddle.incubate.nn.FusedBiasDropoutResidualLayerNorm(128)
+            output = fused_bias_dropout_residual_ln(x, residual)  # [2, 4, 128]
+    """
+
+    def __init__(self,
+                 embed_dim,
+                 dropout_rate=0.5,
+                 weight_attr=None,
+                 bias_attr=None,
+                 epsilon=1e-5,
+                 name=None):
+        super(FusedBiasDropoutResidualLayerNorm, self).__init__()
+        assert embed_dim > 0, ("Expected embed_dim to be greater than 0, "
+                               "but received {}".format(embed_dim))
+        self._dtype = self._helper.get_default_dtype()
+        self._bias_attr = bias_attr
+        self._weight_attr = weight_attr
+        self.embed_dim = embed_dim
+        self.linear_bias = self.create_parameter(
+            shape=[embed_dim],
+            attr=self._bias_attr,
+            dtype=self._dtype,
+            is_bias=True)
+        self.ln_scale = self.create_parameter(
+            attr=self._weight_attr,
+            shape=[embed_dim],
+            default_initializer=Constant(value=1.0))
+        self.ln_bias = self.create_parameter(
+            attr=self._bias_attr, shape=[embed_dim], is_bias=True)
+        self.dropout_rate = dropout_rate
+        self._epsilon = epsilon
+
+        self.name = name
+
+    def forward(self, x, residual):
+        """
+        Applies fused_bias_dropout_residual_layer_norm operation.
+
+        Parameters:
+            x (Tensor): The input tensor. It is a tensor with shape
+                `[batch_size, seq_len, embed_dim]`. The data type should be
+                float32 or float64.
+            residual (Tensor): The residual tensor. It is a tensor with the
+                same shape as `x`, i.e. `[batch_size, seq_len, embed_dim]`.
+                The data type should be float32 or float64.
+
+        Returns:
+            Tensor: It is a tensor that has the same shape and data type \
+                as `x`.
+        """
+
+        out = incubate_f.fused_bias_dropout_residual_layer_norm(
+            x=x,
+            residual=residual,
+            bias=self.linear_bias,
+            ln_scale=self.ln_scale,
+            ln_bias=self.ln_bias,
+            dropout_rate=self.dropout_rate,
+            ln_epsilon=self._epsilon,
+            training=self.training,
+            mode='upscale_in_train',
+            name=self.name)
+        return out
+
+    def extra_repr(self):
+        name_str = ', name={}'.format(self.name) if self.name else ''
+        return 'embed_dim={}, dropout_rate={}, epsilon={}, dtype={}{}'.format(
+            self.embed_dim, self.dropout_rate, self._epsilon, self._dtype,
+            name_str)
+
+
 class FusedMultiHeadAttention(Layer):
     """
     Attention mapps queries and a set of key-value pairs to outputs, and
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py
index 7702e8be9c958..7c43ef1a6d2e3 100755
--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -2047,6 +2047,8 @@
     'test_lambda',
     'test_prod_op',
     'test_fused_attention_op_api',
+    'test_fused_bias_dropout_residual_layer_norm_op',
+    'test_fused_bias_dropout_residual_layer_norm_op_api',
     'test_complex_grad_accumulated',
     'test_deg2rad',
     'test_lgamma_op',

From 13a21cf7a45f4b740b010b57b309fee5357ff32b Mon Sep 17 00:00:00 2001
From: Chenxiao Niu
Date: Mon, 30 May 2022 22:36:02 +0800
Subject: [PATCH 080/109] [mlu] add one_hot_v2 mlu kernel (#43025)

---
 paddle/fluid/operators/one_hot_v2_op_mlu.cc  |  86 +++++++
 .../unittests/mlu/test_one_hot_v2_op_mlu.py  | 235 ++++++++++++++++++
 2 files changed, 321 insertions(+)
 create mode 100644 paddle/fluid/operators/one_hot_v2_op_mlu.cc
 create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_one_hot_v2_op_mlu.py

diff --git a/paddle/fluid/operators/one_hot_v2_op_mlu.cc b/paddle/fluid/operators/one_hot_v2_op_mlu.cc
new file mode 100644
index 0000000000000..855cdda963cb6
--- /dev/null
+++ b/paddle/fluid/operators/one_hot_v2_op_mlu.cc
@@ -0,0 +1,86 @@
+
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/utils.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class OneHotV2MLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = + ctx.template device_context(); + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + int depth = ctx.Attr("depth"); + if (ctx.HasInput("depth_tensor")) { + std::vector depth_data; + depth_data = GetDataFromTensor(ctx.Input("depth_tensor")); + depth = depth_data[0]; + + auto out_dims = out->dims(); + out_dims[out_dims.size() - 1] = depth; + out->Resize(out_dims); + } + out->mutable_data(ctx.GetPlace()); + + float on_value = 1.0f, off_value = 0.0f; + const int in_off_dim[1] = {1}; + Tensor on_value_tensor = ctx.AllocateTmpTensor( + framework::DDim(in_off_dim, 1), dev_ctx); + Tensor off_value_tensor = ctx.AllocateTmpTensor( + framework::DDim(in_off_dim, 1), dev_ctx); + FillMLUTensorWithHostValue(ctx, on_value, &on_value_tensor); + FillMLUTensorWithHostValue(ctx, off_value, &off_value_tensor); + + if (framework::TransToProtoVarType(in->dtype()) == + framework::proto::VarType::INT32) { + MLUCnnlTensorDesc desc_indices(*in); + MLUCnnl::OneHot(ctx, desc_indices.get(), GetBasePtr(in), depth, + GetBasePtr(&on_value_tensor), + GetBasePtr(&off_value_tensor), -1, + ToCnnlDataType(out->dtype()), GetBasePtr(out)); + } else { + Tensor transformed_in; + transformed_in.mutable_data(in->dims(), dev_ctx.GetPlace()); + // use cnnlCast to cast int64_t to int32_t then do one_hot + MLUCnnlTensorDesc in_desc(*in); + MLUCnnlTensorDesc transformed_in_desc(transformed_in); + cnnlCastDataType_t cast_type = GetCastDataType( + framework::TransToProtoVarType(in->dtype()), + framework::TransToProtoVarType(transformed_in.dtype())); + MLUCnnl::Cast(ctx, cast_type, in_desc.get(), GetBasePtr(in), + transformed_in_desc.get(), GetBasePtr(&transformed_in)); + MLUCnnl::OneHot( + ctx, transformed_in_desc.get(), GetBasePtr(&transformed_in), depth, + GetBasePtr(&on_value_tensor), GetBasePtr(&off_value_tensor), -1, + ToCnnlDataType(out->dtype()), GetBasePtr(out)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(one_hot_v2, ops::OneHotV2MLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_one_hot_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_one_hot_v2_op_mlu.py new file mode 100644 index 0000000000000..a56e9ff7558f6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_one_hot_v2_op_mlu.py @@ -0,0 +1,235 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
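+
+# These tests cover the MLU one_hot_v2 kernel: depth may come either from the
+# 'depth' attribute or from the 'depth_tensor' input, and int64 indices are
+# cast to int32 before MLUCnnl::OneHot is invoked (see one_hot_v2_op_mlu.cc).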
+ +from __future__ import print_function + +import unittest +import numpy as np +import math +import sys +sys.path.append('..') +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +from paddle.fluid.framework import Program, program_guard, _test_eager_guard + +paddle.enable_static() + + +class TestOneHotOp(OpTest): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.op_type = 'one_hot_v2' + depth = 10 + depth_np = np.array(10).astype('int32') + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) + + out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} + self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestOneHotOp_attr(OpTest): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.op_type = 'one_hot_v2' + depth = 10 + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), 1, + depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, 0, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'dtype': int(core.VarDesc.VarType.FP32), 'depth': depth} + self.outputs = {'Out': (out, x_lod)} + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestOneHotOp_default_dtype(OpTest): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.op_type = 'one_hot_v2' + depth = 10 + depth_np = np.array(10).astype('int32') + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) + + out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} + self.attrs = {} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestOneHotOp_default_dtype_attr(OpTest): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.op_type = 'one_hot_v2' + depth = 10 + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), 1, + depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, 0, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'depth': depth} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestOneHotOp_exception(unittest.TestCase): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.op_type = 'one_hot_v2' + self.depth = 10 + self.place = 
core.CPUPlace() + self.dimension = 12 + self.x = core.LoDTensor() + x_lod = [[4, 1, 3, 3]] + data = [np.random.randint(11, 20) for i in range(sum(x_lod[0]))] + data = np.array(data).astype('int').reshape([sum(x_lod[0]), 1]) + self.x.set(data, self.place) + self.x.set_recursive_sequence_lengths(x_lod) + + def test_check_output(self): + program = Program() + with program_guard(program): + x = fluid.layers.data( + name='x', shape=[self.dimension], dtype='float32', lod_level=1) + block = program.current_block() + one_hot_out = block.create_var( + name="one_hot_out", + type=core.VarDesc.VarType.LOD_TENSOR, + dtype='float32') + block.append_op( + type='one_hot', + inputs={'X': x}, + attrs={'depth': self.depth}, + outputs={'Out': one_hot_out}) + exe = fluid.Executor(self.place) + + def run(): + exe.run(feed={'x': self.x}, + fetch_list=[one_hot_out], + return_numpy=False) + + self.assertRaises(ValueError, run) + + +class TestOneHotOpApi(unittest.TestCase): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def test_api(self): + depth = 10 + self._run(depth) + + def test_api_with_depthTensor(self): + depth = fluid.layers.assign(input=np.array([10], dtype=np.int32)) + self._run(depth) + + def test_api_with_dygraph(self): + depth = 10 + label = np.array([np.random.randint(0, depth - 1) + for i in range(6)]).reshape([6, 1]) + with fluid.dygraph.guard(): + one_hot_label = fluid.one_hot( + input=fluid.dygraph.to_variable(label), depth=depth) + + one_hot_label = paddle.nn.functional.one_hot( + fluid.dygraph.to_variable(label), depth) + # with _test_eager_guard(): + # one_hot_label = paddle.nn.functional.one_hot( + # paddle.to_tensor(label), depth) + + def _run(self, depth): + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + one_hot_label = fluid.one_hot(input=label, depth=depth) + + label_data = np.array([np.random.randint(0, 10 - 1) + for i in range(6)]).reshape([6, 1]) + + exe = fluid.Executor(self.place) + exe.run(fluid.default_startup_program()) + ret = exe.run(feed={'label': label_data, }, + fetch_list=[one_hot_label], + return_numpy=False) + + +class BadInputTestOnehotV2(unittest.TestCase): + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def test_error(self): + with fluid.program_guard(fluid.Program()): + + def test_bad_x(): + label = fluid.layers.data( + name="label", + shape=[4], + append_batch_size=False, + dtype="float32") + one_hot_label = fluid.one_hot(input=label, depth=4) + + self.assertRaises(TypeError, test_bad_x) + + +if __name__ == '__main__': + unittest.main() From 12d8a567b5bfecd284ff856f7471699ed3da0af7 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Mon, 30 May 2022 19:25:19 +0200 Subject: [PATCH 081/109] OneDNN md-in-tensor refactoring part 5: Memory descriptor enabled for elementwises, reductions and expand_v2 ops (#43036) * enabled md in elementwises, reductions and expand_v2 * CI fix for invalid numpy copy * fixed formatting * CI rerun * changes after review --- .../mkldnn/elementwise_mkldnn_op.h | 19 +++--- .../operators/mkldnn/expand_v2_mkldnn_op.cc | 21 +++--- .../reduce_ops/mkldnn/reduce_mkldnn_op.h | 65 ++++++++---------- paddle/fluid/platform/mkldnn_reuse.h | 68 +++++++------------ .../unittests/mkldnn/test_reduce_mkldnn_op.py | 13 +++- 5 files changed, 82 insertions(+), 104 deletions(-) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index 
d1a1aa3008c8b..070bf9511a9fe 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -145,8 +145,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { binary_prim->execute(astream, args); astream.wait(); - z->set_layout(DataLayout::kMKLDNN); - z->set_format(platform::GetMKLDNNFormat(*dst_memory)); + z->set_mem_desc(dst_memory->get_desc()); } }; @@ -179,7 +178,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - dout->format(), platform::to_void_cast(dout->data())); + dout->mem_desc(), platform::to_void_cast(dout->data())); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); @@ -189,7 +188,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { // elementwise_add & elementwise_sub if (BINARY_OP == dnnl::algorithm::binary_add || BINARY_OP == dnnl::algorithm::binary_sub) { - dst_memory = reorder_handler.AcquireDstMemory(dx, dout->format(), + dst_memory = reorder_handler.AcquireDstMemory(dx, dout->mem_desc(), ctx.GetPlace()); auto reorder_p = reorder_handler.AcquireReorder(dst_memory, reorder_src_memory_p); @@ -218,8 +217,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { } astream.wait(); - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*dst_memory)); + dx->set_mem_desc(dst_memory->get_desc()); } if (dy) { @@ -232,7 +230,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { BINARY_OP == dnnl::algorithm::binary_sub) { if (dout->dims() == dy->dims()) { auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - dy, dout->format(), ctx.GetPlace()); + dy, dout->mem_desc(), ctx.GetPlace()); dnnl::primitive_attr reorder_attr; std::vector scales(1); @@ -301,7 +299,6 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { dst_memory = dst_dy_memory; } astream.wait(); - dy->set_layout(DataLayout::kMKLDNN); if (dout->dims() != dy->dims()) { // Broadcasting @@ -324,10 +321,10 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { {DNNL_ARG_DST, *dst_memory}, }); astream.wait(); - dy->set_format(platform::GetMKLDNNFormat(dst_memory->get_desc().reshape( - phi::vectorize(dy->dims())))); + dy->set_mem_desc(dst_memory->get_desc().reshape( + phi::vectorize(dy->dims()))); } else { - dy->set_format(platform::GetMKLDNNFormat(*dst_memory)); + dy->set_mem_desc(dst_memory->get_desc()); } } } diff --git a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc index 05d6bae5f719a..91dccbee0aef2 100644 --- a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc @@ -45,19 +45,17 @@ class ExpandMKLDNNKernel : public paddle::framework::OpKernel { out_new_dims[i] = out_new_dims[i] > 0 ? 
out_new_dims[i] : x_vec_dims[i]; } - dnnl::memory::desc x_mem_desc = x->mem_desc(); if (x_vec_dims.size() != out_new_dims.size()) { - x_mem_desc = GetExtendedMemoryDescriptor(x_mem_desc, x_vec_dims, - out_new_dims.size()); + x_vec_dims = GetExtendedXDims(x_vec_dims, out_new_dims.size()); } out->Resize(phi::make_ddim(out_new_dims)); paddle::platform::BroadcastDataMKLDNNHandler handler( - dnnl::algorithm::binary_add, onednn_engine, ctx.GetPlace(), out, x, - 0.0f, 1.0f, x_mem_desc); + dnnl::algorithm::binary_add, onednn_engine, ctx.GetPlace(), x, out, + 0.0f, 1.0f, x_vec_dims); auto src_memory_p = handler.AcquireSrcMemory(x); - auto dst_memory_p = handler.AcquireDstMemory(out); // acquires zeroed mem + auto dst_memory_p = handler.AcquireZeroedDstMemory(out); auto binary_p = handler.AcquireForwardPrimitive(); const std::unordered_map args = { @@ -73,14 +71,13 @@ class ExpandMKLDNNKernel : public paddle::framework::OpKernel { } private: - dnnl::memory::desc GetExtendedMemoryDescriptor( - const dnnl::memory::desc& x_mem_desc, - const std::vector& x_vec_dims, int new_size) const { - std::vector new_dims(new_size, 1); + std::vector GetExtendedXDims(const std::vector& x_vec_dims, + int new_size) const { + std::vector extended_x_dims(new_size, 1); std::copy(x_vec_dims.begin(), x_vec_dims.end(), - new_dims.begin() + new_size - x_vec_dims.size()); + extended_x_dims.begin() + new_size - x_vec_dims.size()); - return x_mem_desc.reshape(new_dims); + return extended_x_dims; } }; diff --git a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h index 0c174b0825c9f..94d8cc41d3f31 100644 --- a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h +++ b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h @@ -29,11 +29,11 @@ inline std::vector CalculateReducedDims( bool reduce_all, bool keep_dim) { if (keep_dim) return phi::vectorize(output->dims()); - if (reduce_all) - return std::vector(phi::vectorize(input->dims()).size(), 1); + if (reduce_all) return std::vector(input->dims().size(), 1); std::vector output_dims(phi::vectorize(input->dims())); for (size_t i = 0; i < reduce_dims.size(); ++i) { + // handle negative dims, f.e. "-1" means rightmost dimension reduce_dims[i] = (reduce_dims[i] >= 0) ? reduce_dims[i] : input->dims().size() + reduce_dims[i]; @@ -52,16 +52,16 @@ class ReduceMKLDNNKernel : public framework::OpKernel { ctx.template device_context(); const auto& onednn_engine = dev_ctx.GetEngine(); - const auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); + const auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); auto reduce_dims = ctx.Attr>("dim"); bool reduce_all = ctx.Attr("reduce_all"); bool keep_dim = ctx.Attr("keep_dim"); - auto output_dims = - CalculateReducedDims(input, output, reduce_dims, reduce_all, keep_dim); - auto input_dims = phi::vectorize(input->dims()); + auto x_tz = phi::vectorize(x->dims()); + auto out_tz = + CalculateReducedDims(x, out, reduce_dims, reduce_all, keep_dim); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); @@ -69,18 +69,19 @@ class ReduceMKLDNNKernel : public framework::OpKernel { // copied without actual reduction. 
// In that case reorder must be executed to maintain compatibility with // PaddlePaddle reduce op - if (input_dims == output_dims) { - dnnl::memory::data_type input_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(input->dtype())); + if (x_tz == out_tz) { + dnnl::memory::data_type x_type = framework::ToMKLDNNDataType( + framework::TransToProtoVarType(x->dtype())); platform::ReorderMKLDNNHandler reorder_handler( - input_dims, framework::TransToProtoVarType(input->dtype()), - input_type, onednn_engine); + x_tz, framework::TransToProtoVarType(x->dtype()), x_type, + onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - input->mem_desc(), platform::to_void_cast(input->data())); + x->mem_desc(), platform::to_void_cast(x->data())); - auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - output, input->mem_desc(), ctx.GetPlace()); + // reuse mem desc since it is a simple copy + auto reorder_dst_memory_p = + reorder_handler.AcquireDstMemory(out, x->mem_desc(), ctx.GetPlace()); auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, reorder_dst_memory_p); @@ -88,15 +89,15 @@ class ReduceMKLDNNKernel : public framework::OpKernel { reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); - output->set_mem_desc(reorder_dst_memory_p->get_desc().reshape( - phi::vectorize(output->dims()))); + out->set_mem_desc(reorder_dst_memory_p->get_desc().reshape( + phi::vectorize(out->dims()))); } else { platform::ReductionMKLDNNHandler handler(reduction_type, 0.0f, 0.0f, onednn_engine, ctx.GetPlace(), - input, output, output_dims); + x, out, out_tz); - auto src_memory_p = handler.AcquireSrcMemory(input); - auto dst_memory_p = handler.AcquireDstMemory(output); + auto src_memory_p = handler.AcquireSrcMemory(x); + auto dst_memory_p = handler.AcquireDstMemory(out); std::unordered_map reduction_args = { {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}; @@ -105,8 +106,9 @@ class ReduceMKLDNNKernel : public framework::OpKernel { reduction_p->execute(astream, reduction_args); astream.wait(); - output->set_mem_desc(dst_memory_p->get_desc().reshape( - phi::vectorize(output->dims()))); + + out->set_mem_desc(dst_memory_p->get_desc().reshape( + phi::vectorize(out->dims()))); } } }; @@ -127,22 +129,15 @@ class ReduceGradMKLDNNKernel : public framework::OpKernel { const auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); - const auto input_dims = - CalculateReducedDims(dx, dout, dims, reduce_all, keep_dim); - const auto output_dims = phi::vectorize(dx->dims()); - - auto dout_mem_desc = dout->mem_desc(); - - if (input_dims != output_dims) { - dout_mem_desc = dout_mem_desc.reshape(input_dims); - } + auto dout_tz = CalculateReducedDims(dx, dout, dims, reduce_all, keep_dim); + auto dx_tz = phi::vectorize(dx->dims()); - platform::BroadcastDataMKLDNNHandler handler( - binary_type, onednn_engine, ctx.GetPlace(), dx, dout, scale_x, scale_y, - dout_mem_desc); + platform::BroadcastDataMKLDNNHandler handler(binary_type, onednn_engine, + ctx.GetPlace(), dout, dx, + scale_x, scale_y, dout_tz); const auto src_memory_p = handler.AcquireSrcMemory(dout); - const auto dst_memory_p = handler.AcquireDstMemory(dx); + const auto dst_memory_p = handler.AcquireZeroedDstMemory(dx); const auto binary_prim = handler.AcquireForwardPrimitive(); const std::unordered_map args = { diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 
13b5005a30fa0..5476d244f6035 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -616,29 +616,17 @@ class BinaryMKLDNNHandler public: BinaryMKLDNNHandler(const dnnl::algorithm algo, const int axis, const dnnl::engine engine, platform::Place cpu_place, - const Tensor* x, const Tensor* y, Tensor* z, - float scale_x, float scale_y, float scale_z, + const Tensor* x, const Tensor* y, Tensor* out, + float scale_x, float scale_y, float scale_out, const dnnl::post_ops& post_ops = dnnl::post_ops{}) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "Wrong layout set for X tensor. Expected: %d (kMKLDNN), Actual: %d", - DataLayout::kMKLDNN, x->layout())); - - PADDLE_ENFORCE_EQ( - y->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "Wrong layout set for Y tensor. Expected: %d (kMKLDNN), Actual: %d", - DataLayout::kMKLDNN, y->layout())); - const auto src_x_tz = phi::vectorize(x->dims()); const auto src_y_tz = phi::vectorize(y->dims()); // if output tensor(z) is nullptr then we are computing into oneDNN // managed buffer auto rankdiff = x->dims().size() - y->dims().size(); - const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) - : phi::vectorize(z->dims()); + const auto dst_tz = (out == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) + : phi::vectorize(out->dims()); auto src0_md = x->mem_desc(); auto src1_md = y->mem_desc(); @@ -667,7 +655,7 @@ class BinaryMKLDNNHandler MKLDNNMemoryFormat::any); auto attributes = - CreateAttributes(algo, scale_x, scale_y, scale_z, post_ops); + CreateAttributes(algo, scale_x, scale_y, scale_out, post_ops); this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md, dst_md); @@ -681,7 +669,7 @@ class BinaryMKLDNNHandler private: static inline dnnl::primitive_attr CreateAttributes( - dnnl::algorithm op, float scale_x, float scale_y, float scale_z, + dnnl::algorithm op, float scale_x, float scale_y, float scale_out, dnnl::post_ops post_ops = dnnl::post_ops{}) { // Scales set in attributes for inputs contibute to the output equation // in the following way (assuming no broadcasting takes place): @@ -699,9 +687,9 @@ class BinaryMKLDNNHandler // For mul operation on the other hand // output = (scale_out / scale_x) * x * (1.0 / scale_y) * y // - float scale_0 = scale_z / scale_x; + float scale_0 = scale_out / scale_x; float scale_1 = - op == dnnl::algorithm::binary_add ? scale_z / scale_y : 1.0 / scale_y; + op == dnnl::algorithm::binary_add ? 
scale_out / scale_y : 1.0 / scale_y; dnnl::primitive_attr attributes; attributes.set_scales(/* input_x_id = */ DNNL_ARG_SRC_0, /* mask = */ 0, {scale_0}); @@ -718,21 +706,15 @@ class BroadcastDataMKLDNNHandler public: BroadcastDataMKLDNNHandler(const dnnl::algorithm algo, const dnnl::engine engine, - platform::Place cpu_place, const Tensor* out, - const Tensor* x, float scale_x, float scale_y, - const dnnl::memory::desc& x_mem_desc) + platform::Place cpu_place, const Tensor* x, + Tensor* out, float scale_x, float scale_y, + const std::vector& extended_x_dims) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for X tensor.")); - const auto src0_tz = phi::vectorize(out->dims()); - const auto src0_md = dnnl::memory::desc(src0_tz, platform::MKLDNNGetDataType(), platform::GetPlainMKLDNNFormat(src0_tz.size())); - - const auto src1_md = x_mem_desc; + const auto src1_md = x->mem_desc().reshape(extended_x_dims); dnnl::primitive_attr attributes; attributes.set_scales(DNNL_ARG_SRC_0, 0, {scale_x}); @@ -743,9 +725,9 @@ class BroadcastDataMKLDNNHandler } template - std::shared_ptr AcquireDstMemory(framework::Tensor* output) { - T_out* ptr = output->mutable_data( - this->place_, this->fwd_pd_->dst_desc().get_size()); + std::shared_ptr AcquireZeroedDstMemory(framework::Tensor* out) { + T_out* ptr = out->mutable_data(this->place_, + this->fwd_pd_->dst_desc().get_size()); memset(ptr, 0, this->fwd_pd_->dst_desc().get_size()); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); } @@ -758,22 +740,18 @@ class ReductionMKLDNNHandler ReductionMKLDNNHandler(const dnnl::algorithm algo, const float p, const float eps, const dnnl::engine engine, platform::Place cpu_place, const Tensor* x, - const Tensor* y, std::vector y_tz, - const dnnl::primitive_attr& attr = NULL) + const Tensor* out, std::vector out_tz, + const dnnl::primitive_attr& attrs = NULL) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for X tensor.")); - - const auto y_md = memory::desc(y_tz, platform::MKLDNNGetDataType(), - dnnl::memory::format_tag::any); + const auto out_md = memory::desc(out_tz, platform::MKLDNNGetDataType(), + dnnl::memory::format_tag::any); - if (attr) - this->AcquireForwardPrimitiveDescriptor(attr, algo, x->mem_desc(), y_md, - p, eps); + if (attrs) + this->AcquireForwardPrimitiveDescriptor(attrs, algo, x->mem_desc(), + out_md, p, eps); else - this->AcquireForwardPrimitiveDescriptor(algo, x->mem_desc(), y_md, p, + this->AcquireForwardPrimitiveDescriptor(algo, x->mem_desc(), out_md, p, eps); } }; diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_mkldnn_op.py index 46ee2a14a2018..7b0bb706aece9 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_reduce_mkldnn_op.py @@ -14,7 +14,7 @@ import unittest import numpy as np -from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, skip_check_grad_ci import paddle.fluid as fluid import paddle @@ -92,6 +92,17 @@ def setUp(self): self.outputs = {'Out': self.inputs['X'].sum()} +@OpTestTool.skip_if_not_cpu() +class TestReduceSum4DNoReduceSimpleCopyOneDNNOp( + 
TestReduceDefaultWithGradOneDNNOp): + def setUp(self): + self.op_type = "reduce_sum" + self.use_mkldnn = True + self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")} + self.attrs = {'dim': tuple(), 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': np.copy(self.inputs['X'])} + + @skip_check_grad_ci( reason="reduce_max is discontinuous non-derivable function," " its gradient check is not supported by unittest framework.") From b779d2b8bb2dbe17987f7c490c487f3a430ea582 Mon Sep 17 00:00:00 2001 From: Wilber Date: Tue, 31 May 2022 11:27:12 +0800 Subject: [PATCH 082/109] fix slice plugin (#43110) --- .../tensorrt/plugin/slice_op_plugin.cu | 46 ++++++------------- .../tensorrt/plugin/slice_op_plugin.h | 6 +-- 2 files changed, 16 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu index 4e6b82d2dc146..0a6d24f90722e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu @@ -56,8 +56,6 @@ SlicePlugin::SlicePlugin(std::vector starts, std::vector ends, std::vector axes, bool with_fp16) : starts_(starts), ends_(ends), axes_(axes) { with_fp16_ = with_fp16; - cudaEventCreate(©_event_); - cudaStreamCreate(©_stream_); } SlicePlugin::SlicePlugin(void const *serial_data, size_t serial_length) { @@ -66,15 +64,10 @@ SlicePlugin::SlicePlugin(void const *serial_data, size_t serial_length) { DeserializeValue(&serial_data, &serial_length, &ends_); DeserializeValue(&serial_data, &serial_length, &axes_); DeserializeValue(&serial_data, &serial_length, &with_fp16_); - cudaEventCreate(©_event_); - cudaStreamCreate(©_stream_); + DeserializeValue(&serial_data, &serial_length, &offset_info_); } -SlicePlugin::~SlicePlugin() { - cudaStreamDestroy(copy_stream_); - cudaEventDestroy(copy_event_); - cudaFree(offset_temp_data_); -} +SlicePlugin::~SlicePlugin() { cudaFree(offset_temp_data_); } SlicePlugin *SlicePlugin::clone() const TRT_NOEXCEPT { return new SlicePlugin(starts_, ends_, axes_, with_fp16_); @@ -159,11 +152,7 @@ int SlicePlugin::enqueue(int batch_size, const void *const *inputs, } cudaMemcpyAsync(offset_temp_data_, offset_info.data(), - sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice, - copy_stream_); - - cudaEventRecord(copy_event_, copy_stream_); - cudaStreamWaitEvent(stream, copy_event_, 0); + sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice, stream); int threads = 256; int blocks = (out_num + threads - 1) / threads; @@ -190,7 +179,7 @@ int SlicePlugin::enqueue(int batch_size, const void *const *inputs, size_t SlicePlugin::getSerializationSize() const TRT_NOEXCEPT { return getBaseSerializationSize() + SerializedSize(starts_) + SerializedSize(ends_) + SerializedSize(axes_) + - SerializedSize(with_fp16_); + SerializedSize(with_fp16_) + SerializedSize(offset_info_); } void SlicePlugin::serialize(void *buffer) const TRT_NOEXCEPT { @@ -199,6 +188,7 @@ void SlicePlugin::serialize(void *buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, ends_); SerializeValue(&buffer, axes_); SerializeValue(&buffer, with_fp16_); + SerializeValue(&buffer, offset_info_); } // Dynamic Plugin below. 
@@ -209,8 +199,6 @@ SlicePluginDynamic::SlicePluginDynamic(std::vector starts, bool with_fp16) : starts_(starts), ends_(ends), axes_(axes), decrease_axis_(decrease_axis) { with_fp16_ = with_fp16; - cudaEventCreate(©_event_); - cudaStreamCreate(©_stream_); } SlicePluginDynamic::SlicePluginDynamic(void const *serialData, @@ -220,13 +208,10 @@ SlicePluginDynamic::SlicePluginDynamic(void const *serialData, DeserializeValue(&serialData, &serialLength, &axes_); DeserializeValue(&serialData, &serialLength, &decrease_axis_); DeserializeValue(&serialData, &serialLength, &with_fp16_); - cudaEventCreate(©_event_); - cudaStreamCreate(©_stream_); + DeserializeValue(&serialData, &serialLength, &offset_info_); } void SlicePluginDynamic::destroy() TRT_NOEXCEPT { - cudaStreamDestroy(copy_stream_); - cudaEventDestroy(copy_event_); cudaFree(offset_temp_data_); delete this; } @@ -236,7 +221,7 @@ int SlicePluginDynamic::initialize() TRT_NOEXCEPT { return 0; } size_t SlicePluginDynamic::getSerializationSize() const TRT_NOEXCEPT { size_t size = SerializedSize(starts_) + SerializedSize(ends_) + SerializedSize(axes_) + SerializedSize(decrease_axis_) + - SerializedSize(with_fp16_); + SerializedSize(with_fp16_) + SerializedSize(offset_info_); return size; } @@ -247,6 +232,7 @@ void SlicePluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, axes_); SerializeValue(&buffer, decrease_axis_); SerializeValue(&buffer, with_fp16_); + SerializeValue(&buffer, offset_info_); } nvinfer1::DimsExprs SlicePluginDynamic::getOutputDimensions( @@ -361,23 +347,19 @@ int SlicePluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, offsets[axes_[i]] = starts_[i]; } - std::vector offset_info; + offset_info_.resize(num_dims * 3); for (size_t i = 0; i < num_dims; ++i) { - offset_info.push_back(offsets[i]); - offset_info.push_back(extends[i]); - offset_info.push_back(seg_offsets[i]); + offset_info_[i * 3 + 0] = offsets[i]; + offset_info_[i * 3 + 1] = extends[i]; + offset_info_[i * 3 + 2] = seg_offsets[i]; } if (offset_temp_data_ == nullptr) { cudaMalloc(&offset_temp_data_, 3 * num_dims * sizeof(int)); } - cudaMemcpyAsync(offset_temp_data_, offset_info.data(), - sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice, - copy_stream_); - - cudaEventRecord(copy_event_, copy_stream_); - cudaStreamWaitEvent(stream, copy_event_, 0); + cudaMemcpyAsync(offset_temp_data_, offset_info_.data(), + sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice, stream); int threads = 256; int blocks = (out_num + threads - 1) / threads; diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h index 4c07f0be36864..6b50a52df1fe5 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h @@ -64,8 +64,7 @@ class SlicePlugin : public PluginTensorRT { std::vector ends_; std::vector axes_; int* offset_temp_data_{nullptr}; - cudaEvent_t copy_event_; - cudaStream_t copy_stream_; + std::vector offset_info_; }; class SlicePluginCreator : public TensorRTPluginCreator { @@ -144,8 +143,7 @@ class SlicePluginDynamic : public DynamicPluginTensorRT { std::vector axes_; int decrease_axis_; int* offset_temp_data_{nullptr}; - cudaEvent_t copy_event_; - cudaStream_t copy_stream_; + std::vector offset_info_; }; class SlicePluginDynamicCreator : public TensorRTPluginCreator { From 2785f8762ed24316b71e9ae0dab4a639b01b19fe Mon Sep 17 00:00:00 2001 From: zyfncg Date: Tue, 31 May 2022 11:40:02 +0800 Subject: 
[PATCH 083/109] add embedding yaml (#43029) * add embedding yaml * fix infermeta bug * fix bug of selected_rows infer_meta * fix selected_rows * add unittest --- paddle/phi/api/lib/api_custom_impl.cc | 193 ++++++++++++++++++ paddle/phi/api/lib/api_custom_impl.h | 12 ++ paddle/phi/infermeta/binary.cc | 26 +++ paddle/phi/infermeta/binary.h | 6 + paddle/phi/tests/api/CMakeLists.txt | 1 + paddle/phi/tests/api/test_embedding_api.cc | 119 +++++++++++ .../unittests/test_lookup_table_v2_op.py | 5 +- python/paddle/nn/functional/input.py | 4 +- python/paddle/utils/code_gen/api.yaml | 6 + python/paddle/utils/code_gen/backward.yaml | 6 + 10 files changed, 375 insertions(+), 3 deletions(-) create mode 100644 paddle/phi/tests/api/test_embedding_api.cc diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index b6431fcbe690e..14746abf59494 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -638,6 +638,80 @@ Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) { return out; } +Tensor embedding_impl(const Tensor& x, + const Tensor& weight, + int64_t padding_idx, + bool sparse) { + DataType kernel_data_type = ParseDataType(weight); + auto kernel_key_set = ParseKernelKeyByInputArgs(weight); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + VLOG(6) << "embedding API kernel key: [" << kernel_key.backend() << ", " + << kernel_key.layout() << ", " << kernel_data_type << "]"; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + + Tensor api_output; + + if (phi::DenseTensor::classof(weight.impl().get())) { + const auto& kernel = + phi::KernelFactory::Instance().SelectKernelOrThrowError( + "embedding", + {kernel_key.backend(), kernel_key.layout(), kernel_data_type}); + VLOG(6) << "embedding API kernel: " << kernel; + + auto input_x = PrepareData(x, kernel.InputAt(0), {}); + auto input_weight = PrepareData(weight, kernel.InputAt(1), {}); + + auto* kernel_out = SetKernelOutput(kernel_key.backend(), &api_output); + phi::MetaTensor meta_out(kernel_out); + + phi::EmbeddingInferMeta(MakeMetaTensor(*input_x), + MakeMetaTensor(*input_weight), + padding_idx, + sparse, + &meta_out); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + int64_t, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + { + (*kernel_fn)(*dev_ctx, *input_x, *input_weight, padding_idx, kernel_out); + } + } else { + const auto& kernel = + phi::KernelFactory::Instance().SelectKernelOrThrowError( + "sparse_weight_embedding", + {kernel_key.backend(), kernel_key.layout(), kernel_data_type}); + VLOG(6) << "sparse_weight_embedding API kernel: " << kernel; + + auto input_x = PrepareData(x, kernel.InputAt(0), {}); + auto input_weight = TensorToSelectedRows(weight); + + auto* kernel_out = SetKernelOutput(kernel_key.backend(), &api_output); + phi::MetaTensor meta_out(kernel_out); + + phi::EmbeddingInferMeta(MakeMetaTensor(*input_x), + MakeMetaTensor(*input_weight), + padding_idx, + sparse, + &meta_out); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::SelectedRows&, + int64_t, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + { + (*kernel_fn)(*dev_ctx, *input_x, *input_weight, padding_idx, kernel_out); + } + } + return api_output; +} + std::vector split_impl(const Tensor& x, const IntArray& num_or_sections, const Scalar& axis) { @@ -1176,6 +1250,125 @@ void 
imag_grad_impl(const Tensor& out_grad, Tensor* x_grad) { (*kernel_fn)(*dev_ctx, *dense_out_grad, kernel_out); } +void embedding_grad_impl(const Tensor& x, + const Tensor& weight, + const Tensor& out_grad, + int64_t padding_idx, + bool sparse, + Tensor* weight_grad) { + DataType kernel_data_type = ParseDataType(weight); + auto kernel_key_set = ParseKernelKeyByInputArgs(weight); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + VLOG(6) << "embedding_grad API kernel key: [" << kernel_key.backend() << ", " + << kernel_key.layout() << ", " << kernel_data_type << "]"; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + + if (phi::DenseTensor::classof(weight.impl().get())) { + std::string kernel_name = + sparse ? "embedding_sparse_grad" : "embedding_grad"; + const auto& kernel = + phi::KernelFactory::Instance().SelectKernelOrThrowError( + kernel_name, + {kernel_key.backend(), kernel_key.layout(), kernel_data_type}); + VLOG(6) << kernel_name << " API kernel: " << kernel; + + auto input_x = PrepareData(x, kernel.InputAt(0), {}); + auto input_weight = PrepareData(weight, kernel.InputAt(1), {}); + auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {}); + + if (sparse) { + auto* kernel_out = + SetSelectedRowsKernelOutput(kernel_key.backend(), weight_grad); + phi::MetaTensor meta_out(kernel_out); + meta_out.set_dims(input_weight->dims()); + meta_out.set_dtype(input_weight->dtype()); + kernel_out->set_height(input_weight->dims()[0]); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + int64_t, + phi::SelectedRows*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + *input_x, + *input_weight, + *input_out_grad, + padding_idx, + kernel_out); + } else { + auto* kernel_out = SetKernelOutput(kernel_key.backend(), weight_grad); + phi::MetaTensor meta_out(kernel_out); + phi::UnchangedInferMeta(MakeMetaTensor(*input_weight), &meta_out); + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + int64_t, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + *input_x, + *input_weight, + *input_out_grad, + padding_idx, + kernel_out); + } + } else { + std::string kernel_name = sparse ? 
"sparse_weight_embedding_sparse_grad" + : "sparse_weight_embedding_grad"; + const auto& kernel = + phi::KernelFactory::Instance().SelectKernelOrThrowError( + kernel_name, + {kernel_key.backend(), kernel_key.layout(), kernel_data_type}); + VLOG(6) << kernel_name << " API kernel: " << kernel; + + auto input_x = PrepareData(x, kernel.InputAt(0), {}); + auto input_weight = TensorToSelectedRows(weight); + auto input_out_grad = PrepareData(out_grad, kernel.InputAt(2), {}); + + if (sparse) { + auto* kernel_out = + SetSelectedRowsKernelOutput(kernel_key.backend(), weight_grad); + phi::MetaTensor meta_out(kernel_out); + phi::UnchangedInferMeta(MakeMetaTensor(*input_weight), &meta_out); + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::SelectedRows&, + const phi::DenseTensor&, + int64_t, + phi::SelectedRows*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + *input_x, + *input_weight, + *input_out_grad, + padding_idx, + kernel_out); + } else { + auto* kernel_out = SetKernelOutput(kernel_key.backend(), weight_grad); + phi::MetaTensor meta_out(kernel_out); + meta_out.set_dims(input_weight->GetCompleteDims()); + meta_out.set_dtype(input_weight->dtype()); + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::SelectedRows&, + const phi::DenseTensor&, + int64_t, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + *input_x, + *input_weight, + *input_out_grad, + padding_idx, + kernel_out); + } + } +} + void real_grad_impl(const Tensor& out_grad, Tensor* x_grad) { phi::KernelKey kernel_key{ParseBackend(out_grad), out_grad.layout(), diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index f8ccbb36c5ca7..f700345f4699d 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -98,6 +98,11 @@ Tensor conv2d_impl(const Tensor& input, Tensor copy_to_impl(const Tensor& x, Place place, bool blocking); +Tensor embedding_impl(const Tensor& x, + const Tensor& weight, + int64_t padding_idx, + bool sparse); + std::vector split_impl(const Tensor& x, const IntArray& num_or_sections, const Scalar& axis); @@ -145,6 +150,13 @@ void conv2d_grad_impl(const Tensor& input, void imag_grad_impl(const Tensor& out_grad, Tensor* x_grad); +void embedding_grad_impl(const Tensor& x, + const Tensor& weight, + const Tensor& out_grad, + int64_t padding_idx, + bool sparse, + Tensor* weight_grad); + void real_grad_impl(const Tensor& out_grad, Tensor* x_grad); } // namespace experimental diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 76b6fcdd52efc..a8d5ad564fe9b 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -983,6 +983,32 @@ void ElementwiseRawInferMeta(const MetaTensor& x, out->share_lod(x); } +void EmbeddingInferMeta(const MetaTensor& x, + const MetaTensor& weight, + int64_t padding_idx, + bool sparse, + MetaTensor* out) { + const auto& table_dims = weight.dims(); + const auto& ids_dims = x.dims(); + int ids_rank = ids_dims.size(); + VLOG(5) << "ids rank is " << ids_rank << std::endl; + PADDLE_ENFORCE_EQ( + table_dims.size(), + 2, + phi::errors::InvalidArgument( + "ShapeError: The dimensions of the 'lookup table' must be 2. 
" + "But received lookup table's dimensions = %d, " + "lookup table's shape = [%s].", + table_dims.size(), + table_dims)); + + auto output_dims = phi::vectorize(ids_dims); + output_dims.push_back(table_dims[1]); + out->set_dims(phi::make_ddim(output_dims)); + out->set_dtype(weight.dtype()); + out->share_lod(x); +} + void ExpandAsInferMeta(const MetaTensor& x, const MetaTensor& y, const std::vector& target_shape, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 0c86e5389c4b4..2cd34406fc2d2 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -154,6 +154,12 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta, int axis, MetaTensor* out); +void EmbeddingInferMeta(const MetaTensor& x, + const MetaTensor& weight, + int64_t padding_idx, + bool sparse, + MetaTensor* out); + void ExpandAsInferMeta(const MetaTensor& x, const MetaTensor& y, const std::vector& target_shape, diff --git a/paddle/phi/tests/api/CMakeLists.txt b/paddle/phi/tests/api/CMakeLists.txt index 5c1d0989629dc..2333f82d626c4 100644 --- a/paddle/phi/tests/api/CMakeLists.txt +++ b/paddle/phi/tests/api/CMakeLists.txt @@ -15,6 +15,7 @@ cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_empty_api SRCS test_empty_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_fill_api SRCS test_fill_api.cc DEPS ${COMMON_API_TEST_DEPS} api_scalar) cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS ${COMMON_API_TEST_DEPS}) +cc_test(test_embedding_api SRCS test_embedding_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_cast_api SRCS test_cast_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_reshape_api SRCS test_reshape_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_to_api SRCS test_to_api.cc DEPS ${COMMON_API_TEST_DEPS}) diff --git a/paddle/phi/tests/api/test_embedding_api.cc b/paddle/phi/tests/api/test_embedding_api.cc new file mode 100644 index 0000000000000..6ccd382786bd1 --- /dev/null +++ b/paddle/phi/tests/api/test_embedding_api.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/phi/api/backward/backward_api.h" +#include "paddle/phi/api/include/api.h" + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(sparse_weight_embedding, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_weight_embedding_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_weight_embedding_sparse_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(empty, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + +namespace paddle { +namespace tests { + +TEST(API, sparse_weight_embedding) { + auto x = paddle::experimental::empty({4}, DataType::INT32); + auto* x_data = x.data(); + x_data[0] = 0; + x_data[1] = 4; + x_data[2] = 3; + x_data[3] = 1; + + auto weight_sr = std::make_shared( + std::vector{0, 1, 2, 3, 4, 5, 6}, 16); + *weight_sr->mutable_value() = *static_cast( + paddle::experimental::full({7, 3}, 2, DataType::FLOAT32).impl().get()); + paddle::experimental::Tensor weight; + weight.set_impl(weight_sr); + + auto out = paddle::experimental::embedding(x, weight); + + // 3. check result + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 4); + ASSERT_EQ(out.numel(), 12); + ASSERT_EQ(out.type(), phi::DataType::FLOAT32); + ASSERT_EQ(out.layout(), phi::DataLayout::NCHW); +} + +TEST(API, sparse_weight_embedding_grad) { + auto x = paddle::experimental::empty({4}, DataType::INT32); + auto* x_data = x.data(); + x_data[0] = 0; + x_data[1] = 4; + x_data[2] = 3; + x_data[3] = 1; + + auto weight_sr = std::make_shared( + std::vector{0, 1, 2, 3, 4, 5, 6}, 16); + *weight_sr->mutable_value() = *static_cast( + paddle::experimental::full({7, 3}, 2, DataType::FLOAT32).impl().get()); + paddle::experimental::Tensor weight; + weight.set_impl(weight_sr); + + auto out_grad = paddle::experimental::full({4, 3}, 1, DataType::FLOAT32); + + paddle::experimental::Tensor weight_grad; + + paddle::experimental::embedding_grad( + x, weight, out_grad, -1, false, &weight_grad); + + // 3. check result + ASSERT_EQ(weight_grad.dims().size(), 2); + ASSERT_EQ(weight_grad.dims()[0], 16); + ASSERT_EQ(weight_grad.numel(), 48); + ASSERT_EQ(weight_grad.type(), phi::DataType::FLOAT32); + ASSERT_EQ(weight_grad.layout(), phi::DataLayout::NCHW); +} + +TEST(API, sparse_weight_embedding_sparse_grad) { + auto x = paddle::experimental::empty({4}, DataType::INT32); + auto* x_data = x.data(); + x_data[0] = 0; + x_data[1] = 4; + x_data[2] = 3; + x_data[3] = 1; + + auto weight_sr = std::make_shared( + std::vector{0, 1, 2, 3, 4, 5, 6}, 16); + *weight_sr->mutable_value() = *static_cast( + paddle::experimental::full({7, 3}, 2, DataType::FLOAT32).impl().get()); + paddle::experimental::Tensor weight; + weight.set_impl(weight_sr); + + auto out_grad = paddle::experimental::full({4, 3}, 1, DataType::FLOAT32); + + paddle::experimental::Tensor weight_grad; + + paddle::experimental::embedding_grad( + x, weight, out_grad, -1, true, &weight_grad); + + // 3. 
check result + ASSERT_EQ(weight_grad.dims().size(), 2); + ASSERT_EQ(weight_grad.dims()[0], 4); + ASSERT_EQ(weight_grad.numel(), 12); + ASSERT_EQ(weight_grad.type(), phi::DataType::FLOAT32); + ASSERT_EQ(weight_grad.layout(), phi::DataLayout::NCHW); +} + +} // namespace tests +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py index cad6437d1d3e3..21844c9e402ad 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py @@ -48,6 +48,7 @@ def test_main(self): class TestLookupTableOp(OpTest): def setUp(self): self.op_type = "lookup_table_v2" + self.python_api = paddle.nn.functional.embedding table = np.random.random((17, 31)).astype("float64") ids = np.random.randint(0, 17, 4).astype(self.id_dtype()) self.inputs = {'W': table, 'Ids': ids} @@ -57,10 +58,10 @@ def id_dtype(self): return "int64" def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['W'], 'Out', no_grad_set=set('Ids')) + self.check_grad(['W'], 'Out', no_grad_set=set('Ids'), check_eager=True) class TestLookupTableOpInt16(OpTest): diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index cfbf015ffa05f..92b3a7054d467 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -200,7 +200,9 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None): raise ValueError("padding_idx must be within [-{}, {})".format( weight.shape[0], weight.shape[0])) - if in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_embedding(x, weight, padding_idx, sparse) + elif _in_legacy_dygraph(): return _C_ops.lookup_table_v2( weight, x, 'is_sparse', sparse, 'is_distributed', False, 'remote_prefetch', False, 'padding_idx', padding_idx) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index c541891662864..c3a8e68ca7b0b 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -613,6 +613,12 @@ func : elu backward : elu_grad +- api : embedding + args : (Tensor x, Tensor weight, int64_t padding_idx=-1, bool sparse=false) + output : Tensor + invoke : embedding_impl(x, weight, padding_idx, sparse) + backward : embedding_grad + - api : empty args : (IntArray shape, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) output: Tensor diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index b27c3aab6bb37..7183d822e15c0 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -609,6 +609,12 @@ backward : elu_double_grad inplace : (out_grad -> x_grad) +- backward_api : embedding_grad + forward : embedding (Tensor x, Tensor weight, int64_t padding_idx=-1, bool sparse=false) -> Tensor(out) + args : (Tensor x, Tensor weight, Tensor out_grad, int64_t padding_idx=-1, bool sparse=false) + output : Tensor(weight_grad) + invoke : embedding_grad_impl(x, weight, out_grad, padding_idx, sparse, weight_grad) + - backward_api : erf_grad forward : erf (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) From ae45d981181b44783c61a21d808b54cc5148dc02 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Tue, 31 May 2022 12:01:09 +0800 Subject: [PATCH 084/109] [Eager] fix collective_global_gather (#43090) * [Eager] fix 
collective_global_gather * fix eager_ode = 1 --- .../paddle/fluid/tests/unittests/collective_global_gather.py | 4 ++++ .../paddle/fluid/tests/unittests/test_collective_api_base.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/collective_global_gather.py b/python/paddle/fluid/tests/unittests/collective_global_gather.py index d3a6071ed04df..164abe0593491 100644 --- a/python/paddle/fluid/tests/unittests/collective_global_gather.py +++ b/python/paddle/fluid/tests/unittests/collective_global_gather.py @@ -23,6 +23,7 @@ import paddle.fluid.layers as layers from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main import pickle +from paddle.fluid.framework import _enable_legacy_dygraph paddle.enable_static() @@ -74,6 +75,9 @@ def run_trainer(self, args): world_size = 2 tot_expert = n_expert * world_size paddle.disable_static() + + # Call paddle.distributed.alltoall() under legacy dygraph + _enable_legacy_dygraph() np.random.seed(os.getpid()) local_expert_count = np.random.randint( 1, 4, size=tot_expert).astype("int64") diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index dbd982947265f..a4e71db3d3850 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -219,9 +219,9 @@ def check_with_place(self, required_envs["GLOO_LOG_LEVEL"] = "TRACE" if eager_mode: - required_envs["FLAGS_enable_eager_mode"] = "%d" % 0 - else: required_envs["FLAGS_enable_eager_mode"] = "%d" % 1 + else: + required_envs["FLAGS_enable_eager_mode"] = "%d" % 0 tr0_out, tr1_out, pid0, pid1 = self._run_cluster(model_file, required_envs) From 6749711976817c1df3d57733b8699a5b6855e933 Mon Sep 17 00:00:00 2001 From: Li Min <11663212+limin2021@users.noreply.github.com> Date: Tue, 31 May 2022 12:44:48 +0800 Subject: [PATCH 085/109] Rename dropout is test (#43098) * replace dropout_is_test with is_test. * improve atol on a100. --- .../operators/fused/fused_attention_op.cc | 18 +++++-------- .../operators/fused/fused_attention_op.cu | 4 +-- ...sed_bias_dropout_residual_layer_norm_op.cc | 11 ++++---- .../operators/fused/fused_dropout_helper.h | 2 +- .../operators/fused/fused_feedforward_op.cc | 13 +++------- .../fused/fused_multi_transformer_op.cc | 2 +- .../unittests/test_fused_attention_op.py | 25 +++++++++++++++---- .../unittests/test_fused_attention_op_api.py | 16 ++++++++++-- .../unittests/test_fused_feedforward_op.py | 7 +++++- .../test_fused_transformer_encoder_layer.py | 14 ++++++++--- .../nn/functional/fused_transformer.py | 18 ++++++------- 11 files changed, 78 insertions(+), 52 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index 1f377810a2287..a1adec9641a6e 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -194,7 +194,7 @@ class FusedAttentionOp : public framework::OperatorWithKernel { // the same as QKOut's shape. 
ctx->SetOutputDim("AttnDropoutOut", {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); - if (ctx->Attrs().Get("attn_dropout_is_test") == false) { + if (ctx->Attrs().Get("is_test") == false) { ctx->SetOutputDim("AttnDropoutMaskOut", {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); } @@ -206,7 +206,7 @@ class FusedAttentionOp : public framework::OperatorWithKernel { ctx->SetOutputDim("FMHAOut", {x_dim[0], x_dim[1], y_dim[1], y_dim[2]}); ctx->SetOutputDim("OutLinearOut", ctx->GetInputDim("X")); - if (ctx->Attrs().Get("dropout_is_test") == false) { + if (ctx->Attrs().Get("is_test") == false) { ctx->SetOutputDim("DropoutMaskOut", ctx->GetInputDim("X")); } @@ -301,7 +301,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { platform::errors::InvalidArgument( "'attn_dropout_rate' must be between 0.0 and 1.0.")); }); - AddAttr("attn_dropout_is_test", + AddAttr("is_test", "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); @@ -345,11 +345,6 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { platform::errors::InvalidArgument( "'dropout_rate' must be between 0.0 and 1.0.")); }); - - AddAttr("dropout_is_test", - "(bool, default false) Set to true for inference only, false " - "for training. Some layers may run faster when this is true.") - .SetDefault(false); AddAttr("dropout_fix_seed", "A flag indicating whether to use a fixed seed to generate " "random mask. NOTE: DO NOT set this flag to true in " @@ -418,10 +413,9 @@ class FusedAttentionGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->Attrs().Get("attn_dropout_is_test"), false, - platform::errors::InvalidArgument( - "GradOp is only callable when attn_dropout_is_test is false")); + PADDLE_ENFORCE_EQ(ctx->Attrs().Get("is_test"), false, + platform::errors::InvalidArgument( + "GradOp is only callable when is_test is false")); if (ctx->Attrs().Get("pre_layer_norm") == false) { OP_INOUT_CHECK(ctx->HasInput("Ln2Mean"), "Input", "Ln2Mean", diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index ec8a4d962e808..f25bd53992894 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -109,7 +109,7 @@ class FusedAttentionOpKernel : public framework::OpKernel { const float ln_epsilon = ctx.Attr("ln_epsilon"); float attn_dropout_rate = ctx.Attr("attn_dropout_rate"); - bool is_test_1 = ctx.Attr("attn_dropout_is_test"); + bool is_test_1 = ctx.Attr("is_test"); auto &dropout_implementation_1 = ctx.Attr("attn_dropout_implementation"); bool is_upscale_in_train_1 = @@ -280,7 +280,7 @@ class FusedAttentionGradKernel : public framework::OpKernel { const float ln2epsilon = ctx.Attr("ln_epsilon"); float attn_dropout_prob = ctx.Attr("attn_dropout_rate"); - bool is_test_1 = ctx.Attr("attn_dropout_is_test"); + bool is_test_1 = ctx.Attr("is_test"); auto &dropout_implementation_1 = ctx.Attr("attn_dropout_implementation"); bool is_upscale_in_train_1 = diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc index 6187544456b37..781f51d70ec66 100644 --- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc +++ 
b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc @@ -44,7 +44,7 @@ class FusedBiasDropoutResidualLnOp : public framework::OperatorWithKernel { left *= x_dim[i]; } ctx->SetOutputDim("BiasDropoutResidualOut", ctx->GetInputDim("X")); - if (ctx->Attrs().Get("dropout_is_test") == false) { + if (ctx->Attrs().Get("is_test") == false) { ctx->SetOutputDim("DropoutMaskOut", ctx->GetInputDim("X")); } ctx->SetOutputDim("LnMean", {left}); @@ -91,7 +91,7 @@ class FusedBiasDropoutResidualLnOpMaker platform::errors::InvalidArgument( "'dropout_rate' must be between 0.0 and 1.0.")); }); - AddAttr("dropout_is_test", + AddAttr("is_test", "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); @@ -140,10 +140,9 @@ class FusedBiasDropoutResidualLnGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->Attrs().Get("dropout_is_test"), false, - platform::errors::InvalidArgument( - "GradOp is only callable when dropout_is_test is false")); + PADDLE_ENFORCE_EQ(ctx->Attrs().Get("is_test"), false, + platform::errors::InvalidArgument( + "GradOp is only callable when is_test is false")); OP_INOUT_CHECK(ctx->HasInput("LnMean"), "Input", "LnMean", "FusedBiasDropoutResidualLnGrad"); OP_INOUT_CHECK(ctx->HasInput("LnVariance"), "Input", "LnVariance", diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h index 0a33a60f8123d..c352f08ec2ba7 100644 --- a/paddle/fluid/operators/fused/fused_dropout_helper.h +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -82,7 +82,7 @@ struct DropoutParam { auto& dropout_implementation = context.Attr(pre_fix + "implementation"); is_upscale_in_train = (dropout_implementation == "upscale_in_train"); - is_test = context.Attr(pre_fix + "is_test"); + is_test = context.Attr("is_test"); fix_seed = context.Attr(pre_fix + "fix_seed"); std::string str_seed = "Dropout"; diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc index f3f8f17427577..8e15232acda90 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cc +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc @@ -61,14 +61,14 @@ class FusedFeedForwardOp : public framework::OperatorWithKernel { tmp_dim_x[dim_x.size() - 1] = dim_Linear1Weight[dim_Linear1Weight.size() - 1]; context->SetOutputDim("Out", dim_x); - if (context->Attrs().Get("dropout1_is_test") == false) { + if (context->Attrs().Get("is_test") == false) { context->SetOutputDim("Dropout1Mask", tmp_dim_x); } context->SetOutputDim("Dropout1Out", tmp_dim_x); context->SetOutputDim("Linear1Out", tmp_dim_x); context->SetOutputDim("Dropout2Out", dim_x); - if (context->Attrs().Get("dropout2_is_test") == false) { + if (context->Attrs().Get("is_test") == false) { context->SetOutputDim("Dropout2Mask", dim_x); } framework::DDim mean_dim = @@ -185,9 +185,7 @@ class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker { "dropout2_implementation can only be downgrade_in_infer or " "upscale_in_train")); }); - AddAttr("dropout1_is_test", "the is_test of first dropout") - .SetDefault(false); - AddAttr("dropout2_is_test", "the is_test of second dropout") + AddAttr("is_test", "the is_test attribute of dropout") .SetDefault(false); AddAttr("dropout1_fix_seed", "the is_test of first 
dropout") .SetDefault(false); @@ -218,10 +216,7 @@ class FusedFeedForwardOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->Attrs().Get("dropout1_is_test"), false, - platform::errors::InvalidArgument( - "GradOp is only callable when is_test is false")); - PADDLE_ENFORCE_EQ(ctx->Attrs().Get("dropout2_is_test"), false, + PADDLE_ENFORCE_EQ(ctx->Attrs().Get("is_test"), false, platform::errors::InvalidArgument( "GradOp is only callable when is_test is false")); bool pre_layer_norm = ctx->Attrs().Get("pre_layer_norm"); diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc index c95ca6fe0c96c..98602e4edd0a2 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc @@ -221,7 +221,7 @@ class FusedMultiTransformerOpOpMaker "'dropout_rate' must be between 0.0 and 1.0.")); }); - AddAttr("dropout_is_test", + AddAttr("is_test", "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py index 67160f59952ef..445620f9e1cb1 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py @@ -36,6 +36,18 @@ class TestFusedAttentionOp(OpTest): def setUp(self): self.config() self.generate_input_data() + + self.rtol = 1e-5 + # FIXME(limin29): Because there is a problem with the test precision + # on A100, atol is temporarily set to 1e-2, and it will be + # changed back after the precision problem is solved. + self.atol = 1e-2 + # make sure local development precision + if "V100" in paddle.device.cuda.get_device_name(): + self.atol = 1e-4 + if self.x_type is np.float16: + self.atol = 1e-1 + paddle.set_default_dtype(self.x_type) self.__class__.op_type = "fused_attention" # use autograd to check grad in this unittest. 
@@ -274,9 +286,9 @@ def test_fused_attention_op(self): final_out_ref, x_grad_ref = self.GetBaselineOut() final_out, x_grad = self.GetFusedAttentionOut() np.testing.assert_allclose( - final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-4) + final_out_ref, final_out.numpy(), rtol=self.rtol, atol=self.atol) np.testing.assert_allclose( - x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=1e-4) + x_grad_ref, x_grad.numpy(), rtol=self.rtol, atol=self.atol) class TestFusedAttentionOpBiasIsNone(TestFusedAttentionOp): @@ -307,9 +319,9 @@ def test_fused_attention_op(self): final_out_ref, x_grad_ref = self.GetBaselineOut() final_out, x_grad = self.GetFusedAttentionOut() np.testing.assert_allclose( - final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-1) + final_out_ref, final_out.numpy(), rtol=self.rtol, atol=self.atol) np.testing.assert_allclose( - x_grad_ref, x_grad.numpy(), rtol=1e-5, atol=1e-1) + x_grad_ref, x_grad.numpy(), rtol=self.rtol, atol=self.atol) class TestFusedAttentionOpCacheKV(TestFusedAttentionOp): @@ -325,7 +337,10 @@ def test_fused_attention_op(self): final_out_ref = self.GetBaselineOut() final_out, cache_kv_out = self.GetFusedAttentionOut() np.testing.assert_allclose( - final_out_ref, final_out.numpy(), rtol=1e-5, atol=1e-4) + final_out_ref, + final_out.numpy(), + rtol=self.rtol, + atol=self.atol) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py index bdaf32ee0726d..74dc9351a25b4 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py +++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py @@ -173,6 +173,17 @@ def setUp(self): self.config() self.generate_input_data() + self.rtol = 1e-5 + # FIXME(limin29): Because there is a problem with the test precision + # on A100, atol is temporarily set to 1e-2, and it will be + # changed back after the precision problem is solved. 
+ self.atol = 1e-2 + # make sure local development precision + if "V100" in paddle.device.cuda.get_device_name(): + self.atol = 1e-4 + if self.x_type is np.float16: + self.atol = 1e-1 + def setAttnMask(self): self.has_attn_mask = True @@ -256,7 +267,8 @@ def run_imperative(self): fused_attn.ln_scale.numpy(), fused_attn_ln_bias, fused_attn.qkv_weight.numpy(), fused_attn_qkv_bias, fused_attn.linear_weight.numpy(), fused_attn_linear_bias) - np.testing.assert_allclose(ref_out, out.numpy(), rtol=1e-5, atol=1e-4) + np.testing.assert_allclose( + ref_out, out.numpy(), rtol=self.rtol, atol=self.atol) def run_static(self): fused_attn = FusedMultiHeadAttention( @@ -341,7 +353,7 @@ def test_static_api(self): self.attn_mask, ln_scale, ln_bias, ln_2_scale, ln_2_bias, qkv_weight, qkv_bias, linear_weight, linear_bias) - np.testing.assert_allclose(ref_out, out, rtol=1e-5, atol=1e-4) + np.testing.assert_allclose(ref_out, out, rtol=self.rtol, atol=self.atol) def test_dynamic_api(self): paddle.disable_static(place=paddle.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py index 8c68eb243aea8..25336efd6a7fb 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py @@ -40,7 +40,12 @@ def getShape(self): def getDiff(self): self.rtol = 1e-3 - self.atol = 1e-4 + # FIXME(limin29): Because there is a problem with the test precision + # on A100, atol is temporarily set to 1e-2, and it will be + # changed back after the precision problem is solved. + self.atol = 1e-2 + if "V100" in paddle.device.cuda.get_device_name(): + self.atol = 1e-4 def getActivation(self): self.act_method = "gelu" diff --git a/python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py b/python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py index 7dc86d0dea382..843b495e85b9a 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py +++ b/python/paddle/fluid/tests/unittests/test_fused_transformer_encoder_layer.py @@ -49,6 +49,14 @@ def setUp(self): self.setPreLayerNorm() self.setAttnMask() + self.rtol = 1e-3 + # FIXME(limin29): Because there is a problem with the test precision + # on A100, atol is temporarily set to 1e-2, and it will be + # changed back after the precision problem is solved. 
+ self.atol = 1e-2 + if "V100" in paddle.device.cuda.get_device_name(): + self.atol = 1e-4 + def fused_weight(self, weight, num_head): a = paddle.transpose(weight, perm=[1, 0]) return paddle.reshape( @@ -151,13 +159,13 @@ def test_out(self): self.assertTrue(fused_encoder.fused_attn.extra_repr(), correct_attn_str) np.testing.assert_allclose( - fused_out.numpy(), base_out.numpy(), rtol=1e-3, atol=1e-4) + fused_out.numpy(), base_out.numpy(), rtol=self.rtol, atol=self.atol) self.assertTrue( np.allclose( fused_out.grad.numpy(), base_out.grad.numpy(), - rtol=1e-3, - atol=1e-4)) + rtol=self.rtol, + atol=self.atol)) class TestFusedTransformerEncoderLayerAct(TestFusedTransformerEncoderLayer): diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index ee85642d41664..232e16415a5f7 100644 --- a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -298,7 +298,7 @@ def fused_bias_dropout_residual_layer_norm(x, seed = default_main_program().random_seed _, _, _, _, final_out = _C_ops.fused_bias_dropout_residual_layer_norm( x, residual, bias, ln_scale, ln_bias, 'dropout_rate', dropout_rate, - 'ln_epsilon', ln_epsilon, 'dropout_is_test', not training, + 'ln_epsilon', ln_epsilon, 'is_test', not training, 'dropout_fix_seed', seed is not None, 'dropout_seed', seed if seed is not None else 0, 'dropout_implementation', mode) return final_out @@ -327,7 +327,7 @@ def fused_bias_dropout_residual_layer_norm(x, attrs = { 'ln_epsilon': ln_epsilon, 'dropout_rate': dropout_rate, - 'dropout_is_test': not training, + 'is_test': not training, 'dropout_fix_seed': seed is not None, 'dropout_seed': seed if seed is not None else 0, 'dropout_implementation': mode, @@ -513,10 +513,9 @@ def fused_multi_head_attention(x, attn_mask, linear_weight, linear_bias, ln_scale, ln_bias, 'pre_layer_norm', pre_layer_norm, 'epsilon', pre_ln_epsilon, 'dropout_rate', dropout_rate, 'attn_dropout_rate', - attn_dropout_rate, 'ln_epsilon', ln_epsilon, 'attn_dropout_is_test', - not training, 'dropout_is_test', not training, - 'attn_dropout_fix_seed', seed is not None, 'dropout_fix_seed', - seed is not None, 'attn_dropout_seed', seed + attn_dropout_rate, 'ln_epsilon', ln_epsilon, 'is_test', + not training, 'attn_dropout_fix_seed', seed is not None, + 'dropout_fix_seed', seed is not None, 'attn_dropout_seed', seed if seed is not None else 0, 'dropout_seed', seed if seed is not None else 0, 'attn_dropout_implementation', mode, 'dropout_implementation', mode, 'ring_id', ring_id) @@ -562,8 +561,7 @@ def fused_multi_head_attention(x, 'ln_epsilon': ln_epsilon, 'dropout_rate': dropout_rate, 'attn_dropout_rate': attn_dropout_rate, - 'attn_dropout_is_test': not training, - 'dropout_is_test': not training, + 'is_test': not training, 'attn_dropout_fix_seed': seed is not None, 'dropout_fix_seed': seed is not None, 'attn_dropout_seed': seed if seed is not None else 0, @@ -801,7 +799,7 @@ def fused_multi_transformer(x, time_step, attn_mask, linear_weights, linear_biases, ffn_ln_scales, ffn_ln_biases, ffn1_weights, ffn1_biases, ffn2_weights, ffn2_biases, cache_kvs, 'pre_layer_norm', pre_layer_norm, 'epsilon', epsilon, - 'dropout_rate', dropout_rate, 'dropout_is_test', not training, + 'dropout_rate', dropout_rate, 'is_test', not training, 'dropout_implementation', mode, 'act_method', activation, 'ring_id', ring_id) if cache_kvs is not None: @@ -848,7 +846,7 @@ def fused_multi_transformer(x, 'pre_layer_norm': 
pre_layer_norm, 'epsilon': epsilon, 'dropout_rate': dropout_rate, - 'dropout_is_test': not training, + 'is_test': not training, 'dropout_implementation': mode, 'act_method': activation, 'ring_id': ring_id From 4700a08e99d232d2597a135ec655252f4a29cdd6 Mon Sep 17 00:00:00 2001 From: Jiabin Yang <360788950@qq.com> Date: Tue, 31 May 2022 13:41:44 +0800 Subject: [PATCH 086/109] Support backward prune for eager intermidiate (#43111) * support is empty * fix error * fix code error * change to fake empty * using fake empty first * using fake empty first * Support backward prune in fluid --- .../auto_code_generator/eager_generator.cc | 67 ++++++++++++------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 521b952a4dfcd..3a9bac833d588 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -2189,7 +2189,6 @@ static std::string GenerateSingleOpBase( } VLOG(6) << "Generated Ins Map"; - // [Generation] Get Outs Map std::string outs_contents_str = ""; for (auto iter : grad_outs) { @@ -2238,9 +2237,12 @@ static std::string GenerateSingleOpBase( size_t grads_position = fwd_outputs_name_pos_map.at(fwd_name); const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::TrySyncToVars(hooked_grads[%d]) },"; + " if((!out_metas[%d].empty()) && " + "(!(out_metas[%d][0].IsStopGradient()))){ \n %s.insert({ \"%s\", " + "egr::EagerUtils::TrySyncToVars(hooked_grads[%d])});} \n "; outs_contents_str += paddle::string::Sprintf( - GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, grads_position); + GRAD_OUTS_CONTENT_TEMPLATE, grads_position, grads_position, + outs_name, grad_output_name, grads_position); } else { if (dispensable_input_name_set.count(fwd_name) && @@ -2251,18 +2253,20 @@ static std::string GenerateSingleOpBase( if (duplicable_input_name_set.count(fwd_name) && !is_op_base_per_duplicable_input) { const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::CreateVars( " - "this->OutputMeta()[%d].size() ) },"; + " if(!out_metas[%d].empty()){ %s.insert({ \"%s\", " + "egr::EagerUtils::CreateVars(out_metas[%d].size())});} \n "; outs_contents_str += paddle::string::Sprintf( - GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, fwd_input_position); + GRAD_OUTS_CONTENT_TEMPLATE, fwd_input_position, outs_name, + grad_output_name, fwd_input_position); } else { const char* GRAD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", " + " if((!out_metas[%d].empty()) && " + "(!(out_metas[%d][0].IsStopGradient()))){ %s.insert({ \"%s\", " "{std::make_shared(egr::Controller::Instance(" - ")." 
- "GenerateUniqueName())}},"; + ").GenerateUniqueName())}});} \n "; outs_contents_str += paddle::string::Sprintf( - GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name); + GRAD_OUTS_CONTENT_TEMPLATE, fwd_input_position, + fwd_input_position, outs_name, grad_output_name); } } } else { @@ -2272,16 +2276,15 @@ static std::string GenerateSingleOpBase( grad_output_name)); } } - if (outs_contents_str.size() > 0) - outs_contents_str.pop_back(); // // Remove trailing "," const char* BWD_OUTS_MAP_TEMPLATE = " std::map>> %s = { " - "%s };\n"; - std::string outs_map_str = paddle::string::Sprintf( - BWD_OUTS_MAP_TEMPLATE, outs_name, outs_contents_str); + "std::vector>> %s;\n"; + std::string outs_map_str = + paddle::string::Sprintf(BWD_OUTS_MAP_TEMPLATE, outs_name); + generated_grad_function_body += outs_map_str; + generated_grad_function_body += outs_contents_str; generated_grad_function_body += "\n"; for (auto iter : grad_outs) { const std::string& grad_output_name = iter.first; @@ -2296,18 +2299,23 @@ static std::string GenerateSingleOpBase( !is_op_base_per_duplicable_input) { size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); const char* DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE = - " if(%s.size() > 0) %s[\"%s\"] = egr::EagerUtils::CreateVars( " - "this->OutputMeta()[%d].size() );\n"; + " if((%s.size() > 0) && (!out_metas[%d].empty()) && " + "(!out_metas[%d][0].IsStopGradient())) %s[\"%s\"] = " + "egr::EagerUtils::CreateVars( " + "out_metas[%d].size() );\n"; generated_grad_function_body += paddle::string::Sprintf( DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE, fwd_name, outs_name, grad_output_name, fwd_input_position); } else { + size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); const char* DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE = - " if(%s.defined()) %s[\"%s\"] = " + " if(%s.defined() && (!out_metas[%d].empty()) && " + "(!out_metas[%d][0].IsStopGradient())) %s[\"%s\"] = " "{std::make_shared(egr::Controller::" "Instance().GenerateUniqueName())};\n"; generated_grad_function_body += paddle::string::Sprintf( - DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE, fwd_name, outs_name, + DISPENSABLE_GRAD_OUTS_FWD_CONTENT_TEMPLATE, fwd_name, + fwd_input_position, fwd_input_position, outs_name, grad_output_name); } } @@ -2387,16 +2395,20 @@ static std::string GenerateSingleOpBase( size_t fwd_input_position = fwd_inputs_name_pos_map.at(fwd_name); if (!is_op_base_per_duplicable_input) { const char* BWD_OUTPUT_TEMPLATE = - " outputs[%d] = egr::EagerUtils::GetOutputs(%s[\"%s\"]);\n"; + " if (%s.find(\"%s\") != %s.end()) { outputs[%d] = " + "egr::EagerUtils::GetOutputs(%s[\"%s\"]); }\n"; outputs_str += paddle::string::Sprintf( - BWD_OUTPUT_TEMPLATE, fwd_input_position, outs_name, grad_out_name); + BWD_OUTPUT_TEMPLATE, outs_name, grad_out_name, outs_name, + fwd_input_position, outs_name, grad_out_name); } else { const char* BWD_OUTPUT_TEMPLATE = " " + "if (%s.find(\"%s\") != %s.end()) { " "outputs[0].emplace_back(egr::EagerUtils::GetOutputs(%s[\"%s\"])[0]" - ");\n"; + "); }\n"; outputs_str += paddle::string::Sprintf(BWD_OUTPUT_TEMPLATE, outs_name, - grad_out_name); + grad_out_name, outs_name, + outs_name, grad_out_name); } num_appended_outputs++; } else { @@ -2415,9 +2427,11 @@ static std::string GenerateSingleOpBase( if (fwd_outputs_name_pos_map.count(fwd_name)) { const char* BWD_OUTPUT_TEMPLATE = - " outputs[%d] = egr::EagerUtils::GetOutputs(%s[\"%s\"]);\n"; + " if (%s.find(\"%s\") != %s.end()) { outputs[%d] = " + "egr::EagerUtils::GetOutputs(%s[\"%s\"]); }\n"; outputs_str += 
paddle::string::Sprintf( - BWD_OUTPUT_TEMPLATE, num_appended_outputs, outs_name, grad_out_name); + BWD_OUTPUT_TEMPLATE, outs_name, grad_out_name, outs_name, + num_appended_outputs, outs_name, grad_out_name); num_appended_outputs++; } } @@ -2550,6 +2564,7 @@ static std::string GenerateGradNodeCCContents( " paddle::small_vector, " "egr::kSlotSmallVectorSize> hooked_grads = " "GradNode%s::ApplyGradientHooks(grads);\n" + " const auto& out_metas = OutputMeta();\n" " paddle::small_vector, " "egr::kSlotSmallVectorSize> outputs(%d);\n" " %s\n" From 0ae8a2d67623f33c13f2dc14141587619cc3ba7e Mon Sep 17 00:00:00 2001 From: Leo Chen <39020268+leo0519@users.noreply.github.com> Date: Tue, 31 May 2022 14:21:14 +0800 Subject: [PATCH 087/109] Fix the underflow of fp16 fake quantize operators (#43088) Co-authored-by: Ryan Jeng --- paddle/fluid/operators/fake_quantize_op.cu.h | 61 +- .../tests/unittests/test_fake_quantize_op.py | 587 +++++++----------- 2 files changed, 263 insertions(+), 385 deletions(-) diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index 6c068d25d07a8..a6130c272d72b 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -217,16 +217,18 @@ __global__ void ClipAndQuantKernel(const T* in, const T* scale, int bid = threadIdx.x + blockIdx.x * blockDim.x; int tid = threadIdx.x; - T s = scale[0]; - T inv_s = inverse(s); - T bin_cnt_t = static_cast(bin_cnt); + using ComputeDataType = typename QuantizeDataType::type; + + ComputeDataType s = static_cast(scale[0]); + ComputeDataType inv_s = inverse(s); + ComputeDataType bin_cnt_t = static_cast(bin_cnt); + for (int i = bid; i < n; i += blockDim.x * gridDim.x) { - T x = in[i]; - T v = x > s ? s : x; + ComputeDataType x = static_cast(in[i]); + ComputeDataType v = x > s ? s : x; v = v < -s ? -s : v; v = bin_cnt_t * inv_s * v; - out[i] = static_cast( - round(static_cast::type>(v))); + out[i] = static_cast(round(v)); } } @@ -237,18 +239,19 @@ __global__ void ClipAndQuantDequantKernel(const T* in, const T* scale, int bid = threadIdx.x + blockIdx.x * blockDim.x; int tid = threadIdx.x; - T s = scale[0]; - T inv_s = inverse(s); - T bin_cnt_t = static_cast(bin_cnt); + using ComputeDataType = typename QuantizeDataType::type; + + ComputeDataType s = static_cast(scale[0]); + ComputeDataType inv_s = inverse(s); + ComputeDataType bin_cnt_t = static_cast(bin_cnt); for (int i = bid; i < n; i += blockDim.x * gridDim.x) { - T x = in[i]; + ComputeDataType x = static_cast(in[i]); x = x > s ? s : x; x = x < -s ? -s : x; x = bin_cnt_t * inv_s * x; - x = static_cast( - round(static_cast::type>(x))); - out[i] = (x * s) / bin_cnt_t; + x = round(x); + out[i] = static_cast((x * s) / bin_cnt_t); } } @@ -302,17 +305,18 @@ __global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale, const T* in_c = in + blockIdx.x * channel_size; T* out_c = out + blockIdx.x * channel_size; - T s = scale[blockIdx.x]; - T inv_s = inverse(s); - T bin_cnt_t = static_cast(bin_cnt); + using ComputeDataType = typename QuantizeDataType::type; + + ComputeDataType s = static_cast(scale[blockIdx.x]); + ComputeDataType inv_s = inverse(s); + ComputeDataType bin_cnt_t = static_cast(bin_cnt); for (int64_t i = tid; i < channel_size; i += blockDim.x) { - T x = in_c[i]; - T v = x > s ? s : x; + ComputeDataType x = static_cast(in_c[i]); + ComputeDataType v = x > s ? s : x; v = v < -s ? 
-s : v; v = bin_cnt_t * inv_s * v; - out_c[i] = static_cast( - round(static_cast::type>(v))); + out_c[i] = static_cast(round(v)); } } @@ -322,16 +326,17 @@ __global__ void ChannelClipAndQuantKernelQuantAxisN( const T* in, const T* scale, const int bin_cnt, const int64_t n, const int nScale, const int quant_stride, T* out) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - T bin_cnt_t = static_cast(bin_cnt); + using ComputeDataType = typename QuantizeDataType::type; + ComputeDataType bin_cnt_t = static_cast(bin_cnt); for (int64_t i = idx; i < n; i += blockDim.x * gridDim.x) { - T s = scale[(i / quant_stride) % nScale]; - T inv_s = inverse(s); - T x = in[i]; - T v = x > s ? s : x; + ComputeDataType s = + static_cast(scale[(i / quant_stride) % nScale]); + ComputeDataType inv_s = inverse(s); + ComputeDataType x = static_cast(in[i]); + ComputeDataType v = x > s ? s : x; v = v < -s ? -s : v; v = bin_cnt_t * inv_s * v; - out[i] = static_cast( - round(static_cast::type>(v))); + out[i] = static_cast(round(v)); } } diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py index 230bc15e0f1ab..0c8e115d7cebf 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,439 +15,312 @@ from __future__ import print_function import unittest -import math +import itertools import numpy as np import math from op_test import OpTest -import paddle.fluid.core as core # numpy.round has different behavior in comparision to c++ round function # so we use round_c instead of numpy.round to align the output data -def round_c_single_element(x): - dtype = type(x) - if x >= 0: - return dtype(np.floor(x + 0.5)) - else: - return dtype(np.ceil(x - 0.5)) +def round_c_single_element(val): + dtype = type(val) + if val >= 0: + return dtype(np.floor(val + 0.5)) + return dtype(np.ceil(val - 0.5)) round_c = np.vectorize(round_c_single_element) -class TestFakeQuantizeOp(OpTest): - def setUp(self): - self.set_dtype() - self.op_type = "fake_quantize_abs_max" - self.attrs = {'bit_length': 8} - self.inputs = {'X': np.random.random((124, 240)).astype(self.dtype), } - scale = np.max(np.abs(self.inputs['X'])).astype(self.dtype) - self.outputs = { - 'Out': round_c(self.inputs['X'] / scale * ( - (1 << (self.attrs['bit_length'] - 1)) - 1)), - 'OutScale': np.array(scale).astype(self.dtype), - } - - def set_dtype(self): - self.dtype = np.float32 +def get_compute_type(dtype): + assert dtype in [np.float16, np.float32, np.float64] + if dtype == np.float16: + return np.float32 + return dtype - def test_check_output(self): - self.check_output() - -class TestFakeQuantizeOpFloat16(TestFakeQuantizeOp): - def set_dtype(self): - self.dtype = np.float16 - - -class TestFakeQuantizeOp1(OpTest): +class TestFakeQuantizeAbsMaxOp(OpTest): def setUp(self): - self.op_type = "fake_quantize_abs_max" + self.op_type = 'fake_quantize_abs_max' self.attrs = {'bit_length': 8} - self.inputs = {'X': np.zeros((10, 10)).astype("float32"), } - scale = np.max(np.abs(self.inputs['X'])).astype("float32") - inv_scale = 1.0 / (scale + 1e-6) if scale < 1e-30 else 1.0 / scale - self.outputs = { - 'Out': np.round(self.inputs['X'] * 
inv_scale * ( - (1 << (self.attrs['bit_length'] - 1)) - 1)), - 'OutScale': np.array(scale).astype("float32"), - } - - def test_check_output(self): - self.check_output() - -class TestFakeQuantizeOp2(OpTest): - def setUp(self): - self.op_type = "fake_quantize_abs_max" - self.attrs = {'bit_length': 8} - self.inputs = {'X': np.full((10, 10), 1e-40).astype("float32"), } - scale = np.max(np.abs(self.inputs['X'])).astype("float32") + def _fake_quantize_abs_max(self, dtype, input_shape, distribution): + input_data = distribution(input_shape).astype(dtype) + compute_type = get_compute_type(dtype) + scale = np.max(np.abs(input_data)) + bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 inv_scale = 1.0 / (scale + 1e-6) if scale < 1e-30 else 1.0 / scale - self.outputs = { - 'Out': np.round(self.inputs['X'] * inv_scale * ( - (1 << (self.attrs['bit_length'] - 1)) - 1)), - 'OutScale': np.array(scale).astype("float32"), - } - - def test_check_output(self): + output_data = round_c(input_data.astype(compute_type) * inv_scale * bnt) + self.inputs = {'X': input_data} + self.outputs = {'Out': output_data, 'OutScale': scale} + self.dtype = dtype self.check_output() + def test_fake_quantize_abs_max(self): + self._fake_quantize_abs_max(np.float32, (124, 240), np.random.random) -class TestFakeChannelWiseQuantizeOp(OpTest): - def setUp(self): - self.set_dtype() - self.set_arg() - assert self.quant_axis in [0, 1], "quant_axis should be 0 or 1." + def test_fake_quantize_abs_max_float16(self): + self._fake_quantize_abs_max(np.float16, (124, 240), np.random.random) - self.op_type = "fake_channel_wise_quantize_abs_max" - self.attrs = {'bit_length': 8, 'quant_axis': self.quant_axis} + def test_fake_quantize_abs_max_underflow(self): + self._fake_quantize_abs_max(np.float32, (10, 10), np.zeros) - scales = [] - outputs = self.inputs['X'].copy() - bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 - if self.quant_axis == 0: - for i in range(self.inputs['X'].shape[0]): - scale_v = np.max(np.abs(self.inputs['X'][i])).astype(self.dtype) - scales.append(scale_v) - outputs[i] = round_c( - self.dtype(bnt) * (self.dtype(1.0) / scale_v) * outputs[i]) - elif self.quant_axis == 1: - for i in range(self.inputs['X'].shape[1]): - scale_v = np.max(np.abs(self.inputs['X'][:, i])).astype( - self.dtype) - scales.append(scale_v) - outputs[:, i] = round_c( - self.dtype(bnt) * (self.dtype(1.0) / scale_v) * - outputs[:, i]) - - self.outputs = { - 'Out': outputs, - 'OutScale': np.array(scales).astype(self.dtype), - } + def test_fake_quantize_abs_max_underflow2(self): + self._fake_quantize_abs_max(np.float32, (10, 10), + lambda shape: np.full(shape, 1e-40)) - def set_arg(self): - self.quant_axis = 0 - self.inputs = { - 'X': np.random.random((20, 15, 6, 6)).astype(self.dtype), - } - def set_dtype(self): - self.dtype = np.float32 +class TestFakeChannelWiseQuantizeAbsMaxOp(OpTest): + def setUp(self): + self.op_type = 'fake_channel_wise_quantize_abs_max' + self.attrs = {'bit_length': 8} - def test_check_output(self): + def _fake_channel_wise_quantize_abs_max(self, dtype, input_shape, + quant_axis, distribution): + assert quant_axis in [0, 1], 'quant_axis should be 0 or 1.' 
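+        # Reference computation for the channel-wise kernel: the scale of each
+        # channel is the maximum over every axis except quant_axis (the
+        # np.random.random inputs are non-negative, so this equals the abs-max
+        # the operator uses), and the arithmetic is carried out in
+        # get_compute_type(dtype), which is float32 when dtype is float16, so
+        # the x / scale * bnt step cannot underflow in half precision,
+        # matching the ComputeDataType cast added to the CUDA kernels above.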
+ input_data = distribution(input_shape).astype(dtype) + compute_type = get_compute_type(dtype) + bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 + compute_axis = tuple( + i for i in range(len(input_shape)) if i != quant_axis) + scale_broadcast = np.amax(input_data, axis=compute_axis, keepdims=True) + output_data = round_c(bnt * input_data.astype(compute_type) / + scale_broadcast) + if quant_axis == 1: + scale_broadcast = np.transpose(scale_broadcast, + (1, ) + compute_axis) + scale = scale_broadcast.reshape(input_shape[quant_axis], -1)[:, 0] + self.inputs = {'X': input_data} + self.outputs = {'Out': output_data, 'OutScale': scale} + self.dtype = dtype + self.attrs['quant_axis'] = quant_axis self.check_output() - -class TestFakeChannelWiseQuantizeOpFloat16(TestFakeChannelWiseQuantizeOp): - def set_dtype(self): - self.dtype = np.float16 - - -class TestFakeChannelWiseQuantizeOp1(TestFakeChannelWiseQuantizeOp): - def set_quant_axis(self): - self.quant_axis = 1 - self.inputs = { - 'X': np.random.random((15, 20, 5, 5)).astype(self.dtype), - } - - -class TestFakeChannelWiseQuantizeOp1Float16(TestFakeChannelWiseQuantizeOp1): - def set_dtype(self): - self.dtype = np.float16 - - -class TestFakeChannelWiseQuantizeOp2(TestFakeChannelWiseQuantizeOp): - def set_quant_axis(self): - self.quant_axis = 0 - self.inputs = {'X': np.random.random((30, 15)).astype(self.dtype), } - - -class TestFakeChannelWiseQuantizeOp3(TestFakeChannelWiseQuantizeOp): - def set_quant_axis(self): - self.quant_axis = 1 - self.inputs = {'X': np.random.random((30, 15)).astype(self.dtype), } + def test_fake_channel_wise_quantize_abs_max(self): + dtype_options = [np.float32, np.float16] + input_shape_quant_axis_options = [[(20, 15, 6, 6), 0], + [(15, 20, 5, 5), 1], [(30, 15), 0], + [(30, 15), 1]] + for dtype, input_shape_quant_axis in itertools.product( + dtype_options, input_shape_quant_axis_options): + input_shape, quant_axis = input_shape_quant_axis + with self.subTest( + dtype=dtype, input_shape=input_shape, + quant_axis=quant_axis): + self._fake_channel_wise_quantize_abs_max( + dtype, input_shape, quant_axis, np.random.random) class TestFakeQuantizeRangeAbsMaxOp(OpTest): def setUp(self): - self.set_dtype() - self.op_type = "fake_quantize_range_abs_max" - self.attrs = { - 'bit_length': int(5), - 'window_size': int(1), - 'is_test': False - } - x = (np.random.random((8, 16, 7, 7)) - 0.5) * 10 - x = x.astype(self.dtype) + self.op_type = 'fake_quantize_range_abs_max' + self.attrs = {'bit_length': 5, 'window_size': 1} + + def _fake_quantize_range_abs_max(self, + dtype, + input_shape, + distribution, + is_test=False): + input_data = distribution(input_shape).astype(dtype) + compute_type = get_compute_type(dtype) + bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 + in_scale = np.zeros(1).astype(dtype) + out_scale = np.zeros(self.attrs['window_size']).astype(dtype) + out_scale[0] = np.max(np.abs(input_data)) + if is_test: + out_scale[0] = in_scale[0] = out_scale[0] - 1.0 + clip_data = np.clip(input_data, -in_scale, in_scale) + else: + clip_data = input_data + output_data = round_c( + clip_data.astype(compute_type) / out_scale[0] * bnt) self.inputs = { - 'X': x, - 'Iter': np.zeros(1).astype("int64"), - 'InScale': np.zeros(1).astype(self.dtype) + 'X': input_data, + 'Iter': np.zeros(1).astype(np.int64), + 'InScale': in_scale } - scale = np.max(np.abs(self.inputs['X'])).astype(self.dtype) - - out_scales = np.zeros(self.attrs['window_size']).astype(self.dtype) - out_scales[0] = scale self.outputs = { - 'Out': round_c( - self.dtype((1 << 
(self.attrs['bit_length'] - 1)) - 1) * - (self.dtype(1.0) / scale) * self.inputs['X']), - 'OutScale': scale, - 'OutScales': out_scales, + 'Out': output_data, + 'OutScale': out_scale[0], + 'OutScales': out_scale } - - def set_dtype(self): - self.dtype = np.float32 - - def test_check_output(self): + self.dtype = dtype + self.attrs['is_test'] = is_test self.check_output() - -class TestFakeQuantizeRangeAbsMaxOpFloat16(TestFakeQuantizeRangeAbsMaxOp): - def set_dtype(self): - self.dtype = np.float16 + def test_fake_quantize_range_abs_max(self): + dtype_options = [np.float32, np.float16] + is_test_options = [False, True] + for dtype, is_test in itertools.product(dtype_options, is_test_options): + self.attrs['bit_length'] = 8 if is_test else 5 + with self.subTest(dtype=dtype, is_test=is_test): + self._fake_quantize_range_abs_max( + dtype, (8, 16, 7, 7), + lambda shape: (np.random.random(shape) - 0.5) * 10, + is_test=is_test) class TestMovingAverageAbsMaxScaleOp(OpTest): def setUp(self): - self.op_type = "moving_average_abs_max_scale" + self.op_type = 'moving_average_abs_max_scale' self.attrs = {'moving_rate': float(0.9), 'is_test': False} - accum = np.zeros(1).astype("float32") - accum[0] = 1 - state = np.zeros(1).astype("float32") - state[0] = 1 - x = np.random.random((8, 16, 7, 7)).astype("float32") - self.inputs = { - 'X': x, - 'InAccum': accum, - 'InState': state, - } - out = x - out_accum = np.zeros(1).astype("float32") - out_state = np.zeros(1).astype("float32") - out_scale = np.zeros(1).astype("float32") - out_accum[0] = self.attrs['moving_rate'] * accum[0] + np.max( - np.abs(self.inputs['X'])).astype("float32") - out_state[0] = self.attrs['moving_rate'] * state[0] + 1 + def _moving_average_abs_max_scale(self, dtype, input_shape, distribution): + input_data = distribution(input_shape).astype(dtype) + in_accum = np.ones(1).astype(dtype) + in_state = np.ones(1).astype(dtype) + out_accum = self.attrs['moving_rate'] * in_accum[0] + np.max( + np.abs(input_data)) + out_state = self.attrs['moving_rate'] * in_state[0] + 1.0 out_scale = out_accum / out_state + self.inputs = { + 'X': input_data, + 'InAccum': in_accum, + 'InState': in_state + } self.outputs = { - 'Out': out, + 'Out': input_data, 'OutAccum': out_accum, 'OutState': out_state, - 'OutScale': out_scale, + 'OutScale': out_scale } - - def test_check_output(self): + self.dtype = dtype self.check_output() + def test_moving_average_abs_max(self): + self._moving_average_abs_max_scale(np.float32, (8, 16, 7, 7), + np.random.random) -class TestFakeQuantizeRangeAbsMaxOp2(OpTest): - def setUp(self): - self.set_dtype() - self.op_type = "fake_quantize_range_abs_max" - self.attrs = { - 'bit_length': int(8), - 'window_size': int(1), - 'is_test': True - } - x = (np.random.random((8, 16, 7, 7)) - 0.5) * 10 - x = x.astype(self.dtype) - scale = np.array([np.max(np.abs(x)).astype(self.dtype) - 1.0]) - out_scales = np.zeros(self.attrs['window_size']).astype(self.dtype) - out_scales[0] = scale.astype(self.dtype) - self.inputs = { - 'X': x, - 'Iter': np.zeros(1).astype("int64"), - 'InScale': scale.astype(self.dtype) - } - xs = np.clip(x, -scale, scale).astype(self.dtype) - qs = round_c( - self.dtype( - self.dtype((1 << (self.attrs['bit_length'] - 1)) - 1) * ( - self.dtype(1.0) / scale) * xs)) - self.outputs = { - 'Out': qs, - 'OutScale': scale.astype(self.dtype), - 'OutScales': out_scales, - } - - def set_dtype(self): - self.dtype = np.float32 - - def test_check_output(self): - self.check_output(no_check_set=set(['OutScale', 'OutScales'])) - - -class 
TestFakeQuantizeRangeAbsMaxOp2Float16(TestFakeQuantizeRangeAbsMaxOp2): - def set_dtype(self): - self.dtype = np.float16 - -class TestMovingOpBase(OpTest): +class TestFakeQuantizeMovingAverageAbsMaxOp(OpTest): def setUp(self): - self.set_dtype() - self.init_type() - self.attrs = { - 'bit_length': int(5), - 'moving_rate': float(0.9), - 'is_test': False - } - accum = np.zeros(1).astype(self.dtype) - accum[0] = 1 - state = np.zeros(1).astype(self.dtype) - state[0] = self.dtype(1.0) - scale = np.zeros(1).astype(self.dtype) - scale[0] = 0.001 + self.op_type = 'fake_quantize_moving_average_abs_max' + self.attrs = {'bit_length': 5, 'moving_rate': 0.9, 'is_test': False} + + def _fake_quantize_moving_average_abs_max(self, + dtype, + input_shape, + distribution, + dequantize=False, + with_gradient=False): + input_data = distribution(input_shape).astype(dtype) + compute_type = get_compute_type(dtype) + bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 + in_accum = np.ones(1).astype(dtype) + in_state = np.ones(1).astype(dtype) + in_scale = np.array([0.001]).astype(dtype) + out_accum = np.zeros(1).astype(dtype) + out_state = np.zeros(1).astype(dtype) + out_scale = np.zeros(1).astype(dtype) + out_accum[0] = self.attrs['moving_rate'] * in_accum[0] + np.max( + np.abs(input_data)) + out_state[0] = self.attrs['moving_rate'] * in_state[0] + 1.0 + out_scale = out_accum / out_state + round_data = round_c(input_data.astype(compute_type) / out_scale * bnt) + if dequantize: + output_data = (round_data * out_scale / bnt).astype(dtype) + self.op_type = 'fake_quantize_dequantize_moving_average_abs_max' + else: + output_data = round_data.astype(dtype) self.inputs = { - 'X': np.random.random((8, 16, 7, 7)).astype(self.dtype), - 'InScale': scale, - 'InAccum': accum, - 'InState': state, + 'X': input_data, + 'InScale': in_scale, + 'InAccum': in_accum, + 'InState': in_state } - - out_accum = np.zeros(1).astype(self.dtype) - out_state = np.zeros(1).astype(self.dtype) - out_scale = np.zeros(1).astype(self.dtype) - out_accum[0] = self.dtype(self.attrs['moving_rate']) * self.dtype(accum[ - 0]) + np.max(np.abs(self.inputs['X'])).astype(self.dtype) - out_state[0] = self.dtype(self.attrs['moving_rate']) * self.dtype(state[ - 0]) + self.dtype(1.0) - out_scale = self.dtype(self.dtype(out_accum) / self.dtype(out_state)) - out_data = self.calc_output(out_scale) self.outputs = { - 'Out': out_data, + 'Out': output_data, 'OutAccum': out_accum, 'OutState': out_state, - 'OutScale': out_scale, + 'OutScale': out_scale } - - def set_dtype(self): - self.dtype = np.float32 - - def init_type(self): - self.op_type = "fake_quantize_moving_average_abs_max" - - def calc_output(self, out_scale): - return round_c(self.inputs['X'] / out_scale * ( - (1 << (self.attrs['bit_length'] - 1)) - 1)) - - def test_check_output(self): + self.dtype = dtype self.check_output() + if with_gradient: + gradient = [ + np.ones(input_data.shape) / np.product(input_data.shape) + ] + self.check_grad(['X'], 'Out', user_defined_grads=gradient) + def test_fake_quantize_moving_average_abs_max(self): + self._fake_quantize_moving_average_abs_max(np.float32, (8, 16, 7, 7), + np.random.random) -class TestMovingOpBaseFloat16(TestMovingOpBase): - def set_dtype(self): - self.dtype = np.float16 - - def test_check_output(self): - self.check_output(atol=1e-2) + def test_fake_quantize_moving_average_abs_max_float16(self): + self._fake_quantize_moving_average_abs_max(np.float16, (8, 16, 7, 7), + np.random.random) + def test_fake_quantize_dequantize_moving_average_abs_max(self): + 
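+        # dequantize=True makes the helper target the
+        # fake_quantize_dequantize_moving_average_abs_max op, whose output is
+        # mapped back to the input range as round(x / scale * bnt) * scale / bnt,
+        # and with_gradient=True additionally compares the gradient of X
+        # against np.ones(shape) / numel, i.e. the op is treated as an
+        # identity (straight-through) mapping for backpropagation.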
self._fake_quantize_moving_average_abs_max( + np.float32, (8, 16, 7, 7), + np.random.random, + dequantize=True, + with_gradient=True) -class TestFakeQuantDequantMovingOp(TestMovingOpBase): - def init_type(self): - self.op_type = "fake_quantize_dequantize_moving_average_abs_max" - def calc_output(self, out_scale): - range_v = (1 << (self.attrs['bit_length'] - 1)) - 1 - return np.round(self.inputs['X'] / out_scale * - range_v) * out_scale / range_v - - def test_check_grad(self): - x = self.inputs["X"] - gradient = [np.ones(x.shape) / np.product(x.shape)] - self.check_grad(["X"], "Out", user_defined_grads=gradient) - - -class TestFakeQuantDequantAbsOp(OpTest): +class TestFakeQuantizeDequantizeAbsMaxOp(OpTest): def setUp(self): - self.op_type = "fake_quantize_dequantize_abs_max" + self.op_type = 'fake_quantize_dequantize_abs_max' self.attrs = {'bit_length': 8} - self.inputs = {'X': np.random.random((124, 240)).astype("float32"), } - scale = np.max(np.abs(self.inputs['X'])).astype("float32") - out_data = self.calc_output(scale) + + def _fake_quantize_dequantize_abs_max(self, dtype, input_shape, + distribution): + input_data = distribution(input_shape).astype(dtype) + scale = np.max(np.abs(input_data)).astype(dtype) + bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 + output_data = round_c(input_data / scale * bnt) * scale / bnt + self.inputs = {'X': input_data} self.outputs = { - 'Out': out_data, - 'OutScale': np.array(scale).astype("float32"), + 'Out': output_data, + 'OutScale': np.array(scale).astype(dtype) } - - def calc_output(self, scale): - range_v = (1 << (self.attrs['bit_length'] - 1)) - 1 - return np.round(self.inputs['X'] / scale * range_v) * scale / range_v - - def test_check_output(self): + self.dtype = dtype self.check_output() + gradient = [np.ones(input_data.shape) / np.product(input_data.shape)] + self.check_grad(['X'], 'Out', user_defined_grads=gradient) - def test_check_grad(self): - x = self.inputs["X"] - gradient = [np.ones(x.shape) / np.product(x.shape)] - self.check_grad(["X"], "Out", user_defined_grads=gradient) + def test_fake_quantize_dequantize_abs_max(self): + self._fake_quantize_dequantize_abs_max(np.float32, (124, 240), + np.random.random) -class TestChannelWiseFakeQuantDequantOp(OpTest): +class TestChannelWiseFakeQuantizeDequantizeAbsMaxOp(OpTest): def setUp(self): - self.set_arg() - assert self.quant_axis in [0, 1], "quant_axis should be 0 or 1." 
- - self.op_type = "fake_channel_wise_quantize_dequantize_abs_max" - self.attrs = {'bit_length': 8, 'quant_axis': self.quant_axis} - - scales = [] - outputs = self.inputs['X'].copy() - range_v = (1 << (self.attrs['bit_length'] - 1)) - 1 - if self.quant_axis == 0: - for i in range(self.inputs['X'].shape[0]): - scale_v = np.max(np.abs(self.inputs['X'][i])).astype("float32") - scales.append(scale_v) - outputs[i] = np.round(outputs[i] * range_v / - scale_v) * scale_v / range_v - elif self.quant_axis == 1: - for i in range(self.inputs['X'].shape[1]): - scale_v = np.max(np.abs(self.inputs['X'][:, i])).astype( - "float32") - scales.append(scale_v) - outputs[:, i] = np.round(outputs[:, i] * range_v / - scale_v) * scale_v / range_v - - self.outputs = { - 'Out': outputs, - 'OutScale': np.array(scales).astype("float32"), - } - - def set_arg(self): - self.quant_axis = 0 - self.inputs = { - 'X': np.random.random((3, 4, 64, 64)).astype("float32"), - } + self.op_type = 'fake_channel_wise_quantize_dequantize_abs_max' + self.attrs = {'bit_length': 8} - def test_check_output(self): + def _fake_channel_wise_quantize_dequantize_abs_max( + self, dtype, input_shape, quant_axis, distribution): + assert quant_axis in [0, 1], 'quant_axis should be 0 or 1.' + input_data = distribution(input_shape).astype(dtype) + compute_type = get_compute_type(dtype) + bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 + output_data = input_data.copy().astype(compute_type) + compute_axis = tuple( + i for i in range(len(input_shape)) if i != quant_axis) + scale_broadcast = np.amax(input_data, axis=compute_axis, keepdims=True) + output_data = round_c(bnt * output_data / + scale_broadcast) * scale_broadcast / bnt + if quant_axis == 1: + scale_broadcast = np.transpose(scale_broadcast, + (1, ) + compute_axis) + scale = scale_broadcast.reshape(input_shape[quant_axis], -1)[:, 0] + self.inputs = {'X': input_data} + self.outputs = {'Out': output_data, 'OutScale': scale} + self.dtype = dtype + self.attrs['quant_axis'] = quant_axis self.check_output() + gradient = [np.ones(input_data.shape) / np.product(input_data.shape)] + self.check_grad(['X'], 'Out', user_defined_grads=gradient) - def test_check_grad(self): - x = self.inputs["X"] - gradient = [np.ones(x.shape) / np.product(x.shape)] - self.check_grad(["X"], "Out", user_defined_grads=gradient) - - -class TestChannelWiseFakeQuantDequantOp1(TestChannelWiseFakeQuantDequantOp): - def set_arg(self): - self.quant_axis = 1 - self.inputs = { - 'X': np.random.random((15, 20, 5, 5)).astype("float32"), - } - - -class TestChannelWiseFakeQuantDequantOp2(TestChannelWiseFakeQuantDequantOp): - def set_arg(self): - self.quant_axis = 0 - self.inputs = {'X': np.random.random((30, 15)).astype("float32"), } - - -class TestChannelWiseFakeQuantDequantOp3(TestChannelWiseFakeQuantDequantOp): - def set_arg(self): - self.quant_axis = 1 - self.inputs = {'X': np.random.random((30, 15)).astype("float32"), } + def test_channel_wise_fake_quant_dequant_abs_max(self): + input_shape_quant_axis_options = [[(3, 4, 64, 64), 0], [( + 15, 20, 5, 5), 1], [(30, 15), 0], [(30, 15), 1]] + for input_shape, quant_axis in input_shape_quant_axis_options: + with self.subTest(input_shape=input_shape, quant_axis=quant_axis): + self._fake_channel_wise_quantize_dequantize_abs_max( + np.float32, input_shape, quant_axis, np.random.random) def quantize_max_abs(x, max_range): @@ -589,5 +462,5 @@ def test_check_output(self): self.check_output() -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() From 
a4bb38cbb8b64bb36a40fd68b035c41adf20076f Mon Sep 17 00:00:00 2001 From: xiongkun Date: Tue, 31 May 2022 14:35:30 +0800 Subject: [PATCH 088/109] [EinsumOp] Make EinsumOp support bfloat16. (#43085) * change einsum_v2 as default and add new flags: FLAG_einsum_opt=1|0 * make EInsumOP support bf16 * add unittest for BF16 * add condition for test_BF16 * fix bugs * fix --- paddle/phi/kernels/funcs/eigen/broadcast.cc | 2 ++ paddle/phi/kernels/funcs/eigen/broadcast.cu | 2 ++ paddle/phi/kernels/gpu/einsum_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/tile_kernel.cu | 3 +- paddle/phi/kernels/impl/einsum_grad_impl.h | 32 +++++++++++-------- .../fluid/tests/unittests/test_einsum_v2.py | 18 +++++++++++ 6 files changed, 44 insertions(+), 16 deletions(-) diff --git a/paddle/phi/kernels/funcs/eigen/broadcast.cc b/paddle/phi/kernels/funcs/eigen/broadcast.cc index 3459d7acd6baf..008c51249f249 100644 --- a/paddle/phi/kernels/funcs/eigen/broadcast.cc +++ b/paddle/phi/kernels/funcs/eigen/broadcast.cc @@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" @@ -73,6 +74,7 @@ struct EigenBroadcastGrad { template struct FUNCTOR INSTANTIATION(EigenBroadcast, bool); INSTANTIATION(EigenBroadcast, dtype::float16); +INSTANTIATION(EigenBroadcast, dtype::bfloat16); INSTANTIATION(EigenBroadcast, float); INSTANTIATION(EigenBroadcast, double); INSTANTIATION(EigenBroadcast, int); diff --git a/paddle/phi/kernels/funcs/eigen/broadcast.cu b/paddle/phi/kernels/funcs/eigen/broadcast.cu index d9de69ec55e8b..742081a30c1a0 100644 --- a/paddle/phi/kernels/funcs/eigen/broadcast.cu +++ b/paddle/phi/kernels/funcs/eigen/broadcast.cu @@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" @@ -73,6 +74,7 @@ struct EigenBroadcastGrad { template struct FUNCTOR INSTANTIATION(EigenBroadcast, bool); INSTANTIATION(EigenBroadcast, dtype::float16); +INSTANTIATION(EigenBroadcast, dtype::bfloat16); INSTANTIATION(EigenBroadcast, float); INSTANTIATION(EigenBroadcast, double); INSTANTIATION(EigenBroadcast, int); diff --git a/paddle/phi/kernels/gpu/einsum_grad_kernel.cu b/paddle/phi/kernels/gpu/einsum_grad_kernel.cu index 6ca8dbd9205d8..950f811475c99 100644 --- a/paddle/phi/kernels/gpu/einsum_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/einsum_grad_kernel.cu @@ -24,4 +24,5 @@ PD_REGISTER_KERNEL(einsum_grad, phi::EinsumGradKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/tile_kernel.cu b/paddle/phi/kernels/gpu/tile_kernel.cu index 0c3c29e82c42a..990877a8445cb 100644 --- a/paddle/phi/kernels/gpu/tile_kernel.cu +++ b/paddle/phi/kernels/gpu/tile_kernel.cu @@ -27,4 +27,5 @@ PD_REGISTER_KERNEL(tile, double, int, int64_t, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/impl/einsum_grad_impl.h b/paddle/phi/kernels/impl/einsum_grad_impl.h index aceb97a49b1c2..a72db326807f8 100644 --- a/paddle/phi/kernels/impl/einsum_grad_impl.h +++ b/paddle/phi/kernels/impl/einsum_grad_impl.h @@ -197,20 +197,24 @@ void EinsumGradKernel(const Context& dev_ctx, // release the cache tensor dTC to save memory right now. they are useless // now. cache.clear(); - *(x_grad[0]) = PerformTileAndReduction(dev_ctx, - labeltype, - labelshape, - broadcast_dims, - ellipsis_dims[0], - ops[0], - dA); - *(x_grad[1]) = PerformTileAndReduction(dev_ctx, - labeltype, - labelshape, - broadcast_dims, - ellipsis_dims[1], - ops[1], - dB); + if (x_grad[0]) { + *(x_grad[0]) = PerformTileAndReduction(dev_ctx, + labeltype, + labelshape, + broadcast_dims, + ellipsis_dims[0], + ops[0], + dA); + } + if (x_grad[1]) { + *(x_grad[1]) = PerformTileAndReduction(dev_ctx, + labeltype, + labelshape, + broadcast_dims, + ellipsis_dims[1], + ops[1], + dB); + } } } } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_einsum_v2.py b/python/paddle/fluid/tests/unittests/test_einsum_v2.py index c58d46edde753..b33a943c9f27e 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum_v2.py +++ b/python/paddle/fluid/tests/unittests/test_einsum_v2.py @@ -478,5 +478,23 @@ def test_shape(self): self.assertEqual(C.shape, (-1, 384)) +class TestBF16(unittest.TestCase): + """ + EinsumOp support bfloat16 type, add unittest here for the correctness. + """ + + def test_shape(self): + cuda_major = paddle.version.cuda().split('.')[0].strip() + if paddle.is_compiled_with_cuda() and int(cuda_major) >= 11: + """ MatmulKernel support bfloat16 only if cuda_major > 11.0. 
+ """ + A = paddle.to_tensor(np.array([1.0, 2.0])).astype(paddle.bfloat16) + A = A.cuda() + B = paddle.to_tensor(np.array([2.0, 3.0])).astype(paddle.bfloat16) + B = B.cuda() + C = paddle.einsum('i,i->', A, B) + self.assertEqual(C.item(), 8.0) + + if __name__ == "__main__": unittest.main() From 6319dd830f5bfb1ab57a0584176ac83132f6b20a Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Tue, 31 May 2022 14:37:05 +0800 Subject: [PATCH 089/109] fix bugs (#43115) --- .../fleet/utils/hybrid_parallel_util.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py index d0b5c915e11cd..5e2ad43c16431 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py @@ -140,17 +140,12 @@ def broadcast_dp_parameters(model, hcg): def fused_allreduce_gradients(parameter_list, hcg): - if _in_legacy_dygraph(): - data_parallel_group = None if hcg is None else hcg.get_data_parallel_group( - ) - logger.debug("dp start fuse allreduce gradients") - with framework.no_grad(): - _apply_collective_grads(parameter_list, data_parallel_group) - elif in_dygraph_mode(): - assert hcg is None, "It's not support to use hcg in EagerDygraph now." - data_parallel_group = paddle.distributed.collective._get_default_group() - with framework.no_grad(): - _apply_collective_grads_eager(parameter_list, data_parallel_group) + data_parallel_group = None if hcg is None else hcg.get_data_parallel_group() + logger.debug("dp start fuse allreduce gradients") + apply_func = _apply_collective_grads_eager if in_dygraph_mode( + ) else _apply_collective_grads + with framework.no_grad(): + apply_func(parameter_list, data_parallel_group) def sharding_reduce_gradients(parameter_list, hcg): From 21e1d10f26b5e58139a75c2da067446fb4425e68 Mon Sep 17 00:00:00 2001 From: thunder95 <290844930@qq.com> Date: Tue, 31 May 2022 14:44:32 +0800 Subject: [PATCH 090/109] =?UTF-8?q?=E3=80=90PaddlePaddle=20Hackathon=202?= =?UTF-8?q?=E3=80=9116=20=E6=96=B0=E5=A2=9E=20API=20RRelu=20(#41823)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * rrelu逻辑部分 * unregistered op kernel (unresolved) * commit before merge * 丰富测试用例 * 修复rrelu-sig的bug * 修复cpu环境测试 * 修改拼写错误 * 修改code format * 尝试优化测试用例timeout的问题 * 优化测试用例 * 移除seed, 优化随机函数 * update en doc for rrelu * fix rrelu en docs, test=document_fix * add paper link for en docs, test=document_fix * udpate en doc * add r,test=document_fix --- paddle/fluid/operators/rrelu_op.cc | 126 +++++++ paddle/phi/infermeta/unary.cc | 49 +++ paddle/phi/infermeta/unary.h | 11 + paddle/phi/kernels/cpu/rrelu_grad_kernel.cc | 44 +++ paddle/phi/kernels/cpu/rrelu_kernel.cc | 77 +++++ paddle/phi/kernels/gpu/rrelu_grad_kernel.cu | 86 +++++ paddle/phi/kernels/gpu/rrelu_kernel.cu | 112 ++++++ paddle/phi/kernels/rrelu_grad_kernel.h | 28 ++ paddle/phi/kernels/rrelu_kernel.h | 29 ++ paddle/phi/ops/compat/rrelu_sig.cc | 32 ++ .../fluid/tests/unittests/test_rrelu_op.py | 326 ++++++++++++++++++ python/paddle/nn/__init__.py | 2 + python/paddle/nn/functional/__init__.py | 2 + python/paddle/nn/functional/activation.py | 116 +++++++ python/paddle/nn/layer/__init__.py | 1 + python/paddle/nn/layer/activation.py | 87 +++++ tools/static_mode_white_list.py | 1 + 17 files changed, 1129 insertions(+) create mode 100644 paddle/fluid/operators/rrelu_op.cc 
create mode 100644 paddle/phi/kernels/cpu/rrelu_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/rrelu_kernel.cc create mode 100644 paddle/phi/kernels/gpu/rrelu_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/rrelu_kernel.cu create mode 100644 paddle/phi/kernels/rrelu_grad_kernel.h create mode 100644 paddle/phi/kernels/rrelu_kernel.h create mode 100644 paddle/phi/ops/compat/rrelu_sig.cc create mode 100644 python/paddle/fluid/tests/unittests/test_rrelu_op.py diff --git a/paddle/fluid/operators/rrelu_op.cc b/paddle/fluid/operators/rrelu_op.cc new file mode 100644 index 0000000000000..c543a088e9d7f --- /dev/null +++ b/paddle/fluid/operators/rrelu_op.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class RReluOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class RReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input of RReLU op."); + AddOutput("Out", "The output of RReLU op."); + AddOutput("Noise", "The random sampled RReLU noise.") + .AsIntermediate() + .AsExtra(); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + float default_lower = 1. / 8.; + AddAttr("lower", "Lower bound of the uniform distribution.") + .SetDefault(default_lower) + .AddCustomChecker([](const float& lower) { + PADDLE_ENFORCE_EQ(lower >= 0.0f && lower < 1.0f, true, + platform::errors::InvalidArgument( + "'RRelu_lower' must be between 0.0 and 1.0.")); + }); + float defalut_upper = 1. / 3.; + AddAttr("upper", "Upper bound of the uniform distribution.") + .SetDefault(defalut_upper) + .AddCustomChecker([](const float& upper) { + PADDLE_ENFORCE_EQ(upper > 0.0f && upper <= 1.0f, true, + platform::errors::InvalidArgument( + "'RRelu_upper' must be between 0.0 and 1.0.")); + }); + AddComment(R"DOC( +RReLU Operator. + +Applies the randomized leaky rectified liner unit function, element-wise, +as described in the paper: + +`Empirical Evaluation of Rectified Activations in Convolutional Network`_. + +The function is defined as: + +.. math:: + \text{RReLU}(x) = + \begin{cases} + x & \text{if } x \geq 0 \\ + ax & \text{ otherwise } + \end{cases} + +where :math:`a` is randomly sampled from uniform distribution +:math:`\mathcal{U}(\text{lower}, \text{upper})`. 
+ + See: https://arxiv.org/pdf/1505.00853.pdf + +)DOC"); + } +}; + +class RReluGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; +}; + +template +class RReluGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("rrelu_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Noise", this->Output("Noise")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(rrelu, RReluInferShapeFunctor, + PD_INFER_META(phi::RReluInferMeta)); + +REGISTER_OPERATOR(rrelu, ops::RReluOp, ops::RReluOpMaker, + ops::RReluGradOpMaker, + ops::RReluGradOpMaker, + RReluInferShapeFunctor); + +DECLARE_INFER_SHAPE_FUNCTOR(rrelu_grad, RReluGradInferShapeFunctor, + PD_INFER_META(phi::RReluGradInferMeta)); +REGISTER_OPERATOR(rrelu_grad, ops::RReluGradOp, RReluGradInferShapeFunctor); diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index f736bf50162d8..0beb7223f212a 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1977,6 +1977,55 @@ void RollInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void RReluInferMeta(const MetaTensor& x, + float lower, + float upper, + bool is_test, + MetaTensor* out, + MetaTensor* noise) { + auto x_dims = x.dims(); + PADDLE_ENFORCE_GE(lower, + 0, + phi::errors::InvalidArgument( + "The lower value should be greater than or equal to 0. " + "But received lower value = %f.", + lower)); + PADDLE_ENFORCE_LE(upper, + 1, + phi::errors::InvalidArgument( + "The upper value should be less than or equal to 1. 
" + "But received upper value = %f.", + upper)); + PADDLE_ENFORCE_GE( + upper, + lower, + phi::errors::InvalidArgument( + "The upper value should be greater than or equal to lower value " + "But received upper value = %f, lower value = %f.", + upper, + lower)); + + out->set_dims(x_dims); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); + + if (noise != nullptr) { + noise->set_dims(x_dims); + noise->set_dtype(x.dtype()); + noise->set_layout(x.layout()); + } +} + +void RReluGradInferMeta(const MetaTensor& out_grad, + const MetaTensor& noise, + MetaTensor* x_grad) { + auto do_dims = out_grad.dims(); + x_grad->set_dims(do_dims); + x_grad->set_dtype(out_grad.dtype()); + x_grad->share_lod(out_grad); +} + void SetValueInferMeta(const MetaTensor& x, MetaTensor* out) { auto in_dims = x.dims(); PADDLE_ENFORCE_LT( diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index c21ef0e2d1103..a288b9371016f 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -281,6 +281,17 @@ void RollInferMeta(const MetaTensor& x, const std::vector& axis, MetaTensor* out); +void RReluInferMeta(const MetaTensor& x, + float lower, + float upper, + bool is_test, + MetaTensor* out, + MetaTensor* noise); + +void RReluGradInferMeta(const MetaTensor& out_grad, + const MetaTensor& noise, + MetaTensor* x_grad); + void SetValueInferMeta(const MetaTensor& x, MetaTensor* out); void ShapeInferMeta(const MetaTensor& input, MetaTensor* out); diff --git a/paddle/phi/kernels/cpu/rrelu_grad_kernel.cc b/paddle/phi/kernels/cpu/rrelu_grad_kernel.cc new file mode 100644 index 0000000000000..10b6c6b1a3ea8 --- /dev/null +++ b/paddle/phi/kernels/cpu/rrelu_grad_kernel.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/rrelu_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void RReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& noise, + const DenseTensor& out_grad, + DenseTensor* x_grad) { + const T* n_ptr = noise.data(); + const T* x_ptr = x.data(); + const T* out_grad_ptr = out_grad.data(); + int numel = x.numel(); + if (!x_grad) return; + + int i = 0; + T* x_grad_ptr = dev_ctx.template Alloc(x_grad); + for (i = 0; i < numel; i++) { + x_grad_ptr[i] = x_ptr[i] > 0 ? out_grad_ptr[i] : n_ptr[i] * out_grad_ptr[i]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + rrelu_grad, CPU, ALL_LAYOUT, phi::RReluGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/rrelu_kernel.cc b/paddle/phi/kernels/cpu/rrelu_kernel.cc new file mode 100644 index 0000000000000..4c6e30beddfa3 --- /dev/null +++ b/paddle/phi/kernels/cpu/rrelu_kernel.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/rrelu_kernel.h" + +#include "paddle/fluid/framework/generator.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void RReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const float lower, + const float upper, + bool is_test, + DenseTensor* out, + DenseTensor* noise) { + const T* x_ptr = x.data(); + T* o_ptr = dev_ctx.template Alloc(out); + T* n_ptr = dev_ctx.template Alloc(noise); + T zero = static_cast(0); + int numel = x.numel(); + int i = 0; + + if (is_test) { + T mid_val = static_cast((lower + upper) / 2.0); + for (i = 0; i < numel; i++) { + if (x_ptr[i] < zero) { + o_ptr[i] = mid_val * x_ptr[i]; + n_ptr[i] = mid_val; + } else { + o_ptr[i] = x_ptr[i]; + n_ptr[i] = 1.0; + } + } + + return; + } + + auto engine = paddle::framework::GetCPURandomEngine(0); + + std::uniform_real_distribution dist(lower, upper); + + for (i = 0; i < numel; i++) { + if (x_ptr[i] < zero) { + T scale = static_cast(dist(*engine)); + o_ptr[i] = scale * x_ptr[i]; + n_ptr[i] = scale; + } else { + o_ptr[i] = x_ptr[i]; + n_ptr[i] = 1.0; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(rrelu, + CPU, + ALL_LAYOUT, + phi::RReluKernel, + float, + phi::dtype::float16, + double) {} diff --git a/paddle/phi/kernels/gpu/rrelu_grad_kernel.cu b/paddle/phi/kernels/gpu/rrelu_grad_kernel.cu new file mode 100644 index 0000000000000..44dc31ed5d926 --- /dev/null +++ b/paddle/phi/kernels/gpu/rrelu_grad_kernel.cu @@ -0,0 +1,86 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/rrelu_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/gpu/prelu_funcs.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" + +namespace phi { + +template +__global__ void RReluOpGradKernel(const T* x_ptr, + const T* noise_ptr, + const T* out_grad_ptr, + T* x_grad_ptr, + int numel) { + CUDA_KERNEL_LOOP(index, numel) { + T scale = noise_ptr[index]; + T x = x_ptr[index]; + T out_grad = out_grad_ptr[index]; + T zero = static_cast(0); + x_grad_ptr[index] = (x < zero) ? 
scale * out_grad : out_grad; + } +} + +template +class RReluOpGradFunctor { + public: + void operator()(gpuStream_t stream, + const T* x, + const T* noise, + const T* out_grad, + T* x_grad, + int numel) { + RReluOpGradKernel< + T><<>>( + x, noise, out_grad, x_grad, numel); + } +}; + +template +void RReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& noise, + const DenseTensor& out_grad, + DenseTensor* x_grad) { + if (!x_grad) return; + dev_ctx.template Alloc(x_grad); + + const T* x_ptr = x.data(); + const T* n_ptr = noise.data(); + const T* out_grad_ptr = out_grad.data(); + T* x_grad_ptr = dev_ctx.template Alloc(x_grad); + + int numel = x.numel(); + auto stream = dev_ctx.stream(); + + RReluOpGradFunctor rrelu_grad; + rrelu_grad(stream, x_ptr, n_ptr, out_grad_ptr, x_grad_ptr, numel); +} + +} // namespace phi + +PD_REGISTER_KERNEL(rrelu_grad, + GPU, + ALL_LAYOUT, + phi::RReluGradKernel, + float, + phi::dtype::float16, + double) {} diff --git a/paddle/phi/kernels/gpu/rrelu_kernel.cu b/paddle/phi/kernels/gpu/rrelu_kernel.cu new file mode 100644 index 0000000000000..39582d5872a70 --- /dev/null +++ b/paddle/phi/kernels/gpu/rrelu_kernel.cu @@ -0,0 +1,112 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/rrelu_kernel.h" + +namespace phi { + +template +struct RReluTrainCudaFunctor { + public: + RReluTrainCudaFunctor(const T* in, T* out, T* noise) + : in_(in), out_(out), noise_(noise) { + zero_ = static_cast(0); + } + + __device__ void operator()(int64_t idx) { + T x = in_[idx]; + if (x < zero_) { + out_[idx] = noise_[idx] * x; + } else { + out_[idx] = x; + noise_[idx] = 1.0; + } + } + + private: + const T* in_; + T* out_; + T* noise_; + T zero_; +}; + +template +struct RReluTestCudaFunctor { + public: + RReluTestCudaFunctor(const T* in, T* out, T* noise, T mid_val) + : in_(in), out_(out), noise_(noise), mid_val_(mid_val) { + zero_ = static_cast(0); + } + + __device__ void operator()(int64_t idx) { + T x = in_[idx]; + if (x < zero_) { + out_[idx] = mid_val_ * x; + noise_[idx] = mid_val_; + } else { + out_[idx] = x; + noise_[idx] = 1.0; + } + } + + private: + const T* in_; + T* out_; + T* noise_; + T zero_; + T mid_val_; +}; + +template +void RReluKernel(const Context& ctx, + const DenseTensor& x, + const float lower, + const float upper, + bool is_test, + DenseTensor* out, + DenseTensor* noise) { + const T* x_data = x.data(); + T* out_data = ctx.template Alloc(out); + T* noise_data = ctx.template Alloc(noise); + auto size = x.numel(); + if (size <= 0) return; + + phi::funcs::ForRange for_range(ctx, size); + if (is_test) { + T mid_val = static_cast((lower + upper) / 2.0); + RReluTestCudaFunctor functor(x_data, out_data, noise_data, mid_val); + for_range(functor); + } else { + using MT = typename kps::details::MPTypeTrait::Type; + funcs::uniform_distribution dist; + funcs::uniform_real_transform trans(lower, upper); + funcs::distribution_and_transform(ctx, noise, dist, trans); + RReluTrainCudaFunctor functor(x_data, out_data, noise_data); + for_range(functor); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(rrelu, + GPU, + ALL_LAYOUT, + phi::RReluKernel, + float, + phi::dtype::float16, + double) {} diff --git a/paddle/phi/kernels/rrelu_grad_kernel.h b/paddle/phi/kernels/rrelu_grad_kernel.h new file mode 100644 index 0000000000000..b6172fca10e53 --- /dev/null +++ b/paddle/phi/kernels/rrelu_grad_kernel.h @@ -0,0 +1,28 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void RReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& noise, + const DenseTensor& out_grad, + DenseTensor* x_grad); +} // namespace phi diff --git a/paddle/phi/kernels/rrelu_kernel.h b/paddle/phi/kernels/rrelu_kernel.h new file mode 100644 index 0000000000000..8deb52daaae13 --- /dev/null +++ b/paddle/phi/kernels/rrelu_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void RReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const float lower, + const float upper, + bool is_test, + DenseTensor* out, + DenseTensor* noise); +} // namespace phi diff --git a/paddle/phi/ops/compat/rrelu_sig.cc b/paddle/phi/ops/compat/rrelu_sig.cc new file mode 100644 index 0000000000000..00cd705a24076 --- /dev/null +++ b/paddle/phi/ops/compat/rrelu_sig.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature RReluOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "rrelu", {"X"}, {"lower", "upper", "is_test"}, {"Out", "Noise"}); +} + +KernelSignature RReluGradGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "rrelu_grad", {"X", "Noise", "Out@GRAD"}, {}, {"X@GRAD"}); +} +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(rrelu, phi::RReluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(rrelu_grad, phi::RReluGradGradOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/test_rrelu_op.py b/python/paddle/fluid/tests/unittests/test_rrelu_op.py new file mode 100644 index 0000000000000..9d33ce085b7f7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_rrelu_op.py @@ -0,0 +1,326 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
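+
+# The cases below cover both phases of RReLU:
+#   * inference (training=False / is_test=True): every negative input is
+#     scaled by the fixed midpoint (lower + upper) / 2, so outputs are
+#     compared exactly against ref_rrelu;
+#   * training: the negative slope is drawn per element from
+#     U(lower, upper), so check_output only verifies that each output lies
+#     between lower * x and upper * x for x < 0.
+#
+# A minimal usage sketch of the functional API under test (the numbers
+# assume lower=0.1 and upper=0.3, inference mode):
+#
+#   x = paddle.to_tensor([[-1.0, 2.0], [-3.0, 4.0]])
+#   y = paddle.nn.functional.rrelu(x, lower=0.1, upper=0.3, training=False)
+#   # y -> [[-0.2, 2.0], [-0.6, 4.0]], since (0.1 + 0.3) / 2 = 0.2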
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.core as core +from op_test import OpTest +import paddle +import paddle.nn.functional as F +from paddle.fluid import dygraph + +paddle.seed(102) +np.random.seed(102) + + +def ref_rrelu(x, lower, upper): + x_t = x.copy() + alpha = (lower + upper) / 2.0 + return np.where(x_t <= 0, alpha * x_t, x_t) + + +def ref_rrelu_nn(x, lower, upper): + return ref_rrelu(x, lower, upper) + + +def check_output(input, output, lower, upper): + lower_res = np.where(input <= 0, lower * input, input) + upper_res = np.where(input <= 0, upper * input, input) + return (output <= lower_res).all() and (output >= upper_res).all() + + +class TestFunctionalRReluAPI(unittest.TestCase): + def setUp(self): + self.x_np = np.random.uniform(-1., 1., [1, 2, 3, 4]).astype('float64') + self.lower_0 = 0.05 + self.lower_1 = 0.1 + self.upper_0 = 0.25 + self.upper_1 = 0.33 + + self.places = [ + fluid.CUDAPlace(0) + if core.is_compiled_with_cuda() else fluid.CPUPlace() + ] + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data( + name="input", shape=[2, 3, 4, 5], dtype="float32") + res1 = F.rrelu( + x=input, lower=self.lower_0, upper=self.upper_0, training=False) + res2 = F.rrelu( + x=input, lower=self.lower_1, upper=self.upper_1, training=False) + in_np = np.random.uniform(-1., 1., [2, 3, 4, 5]).astype("float32") + + res_np1 = ref_rrelu(in_np, self.lower_0, self.upper_0) + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": in_np}, + fetch_list=[res1]) + + self.assertTrue(np.allclose(fetches[0], res_np1)) + + res_np2 = ref_rrelu(in_np, self.lower_1, self.upper_1) + fetches = exe.run(fluid.default_main_program(), + feed={"input": in_np}, + fetch_list=[res2]) + self.assertTrue(np.allclose(fetches[0], res_np2)) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_static_graph_functional(self): + '''test_static_graph_functional''' + + for place in self.places: + paddle.enable_static() + x_1 = paddle.fluid.data( + name="x", shape=self.x_np.shape, dtype="float64") + x_2 = paddle.fluid.data( + name="x2", shape=self.x_np.shape, dtype="float64") + out_1 = F.rrelu(x_1, self.lower_0, self.upper_0, training=False) + out_2 = F.rrelu(x_2, self.lower_1, self.upper_1, training=False) + out_3 = F.rrelu(x_2, self.lower_1, self.upper_1, training=True) + + exe = paddle.static.Executor(place=place) + res_1 = exe.run(fluid.default_main_program(), + feed={"x": self.x_np}, + fetch_list=out_1, + use_prune=True) + res_2 = exe.run(fluid.default_main_program(), + feed={"x2": self.x_np}, + fetch_list=out_2, + use_prune=True) + res_3 = exe.run(fluid.default_main_program(), + feed={"x2": self.x_np}, + fetch_list=out_3, + use_prune=True) + + out_ref_1 = ref_rrelu(self.x_np, self.lower_0, self.upper_0) + out_ref_2 = ref_rrelu(self.x_np, self.lower_1, self.upper_1) + self.assertEqual(np.allclose(out_ref_1, res_1), True) + self.assertEqual(np.allclose(out_ref_2, res_2), True) + self.assertTrue( + check_output(self.x_np, res_3[0], self.lower_1, self.upper_1)) + + def test_static_graph_layer(self): + '''test_static_graph_layer''' + + for place in self.places: + paddle.enable_static() + x_1 = paddle.fluid.data( + name="x", shape=self.x_np.shape, dtype="float64") + x_2 = paddle.fluid.data( + name="x2", shape=self.x_np.shape, dtype="float64") + # init instance + rrelu_1 = 
paddle.nn.RReLU(self.lower_0, self.upper_0) + rrelu_2 = paddle.nn.RReLU(self.lower_1, self.upper_1) + out_1 = rrelu_1(x_1) + out_2 = rrelu_2(x_2) + + exe = paddle.static.Executor(place=place) + res_1 = exe.run(fluid.default_main_program(), + feed={"x": self.x_np}, + fetch_list=out_1, + use_prune=True) + res_2 = exe.run(fluid.default_main_program(), + feed={"x2": self.x_np}, + fetch_list=out_2, + use_prune=True) + + self.assertTrue( + check_output(self.x_np, res_1[0], self.lower_0, self.upper_0)) + self.assertTrue( + check_output(self.x_np, res_2[0], self.lower_1, self.upper_1)) + + def dygraph_check(self, lower, upper): + for place in self.places: + paddle.disable_static(place) + x = paddle.to_tensor(self.x_np) + out = F.rrelu(x, lower, upper, training=False) + out_ref = ref_rrelu(self.x_np, lower, upper) + self.assertEqual(np.allclose(out_ref, out), True) + paddle.enable_static() + + def test_dygraph_functional(self): + '''test_dygraph_functional''' + + self.dygraph_check(self.lower_0, self.upper_0) + self.dygraph_check(self.lower_1, self.upper_1) + + def test_dygraph_layer(self): + '''test_dygraph_layer''' + + for place in self.places: + paddle.disable_static(place=place) + rrelu = paddle.nn.RReLU(self.lower_0, self.upper_0) + result = rrelu(paddle.to_tensor(self.x_np)) + self.assertTrue( + check_output(self.x_np, + result.numpy(), self.lower_0, self.upper_0)) + paddle.enable_static() + + def test_dygraph(self): + for place in self.places: + paddle.disable_static(place=place) + with dygraph.guard(): + rrelu = paddle.nn.RReLU(self.lower_0, self.upper_0) + out_np = rrelu(paddle.to_tensor(self.x_np)) + self.assertTrue( + check_output(self.x_np, + out_np.numpy(), self.lower_0, self.upper_0)) + paddle.enable_static() + + def test_error_functional(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + # The input type must be Variable. + self.assertRaises( + TypeError, F.rrelu, x=1, lower=self.lower_0, upper=self.upper_0) + # The input dtype must be float16, float32, float64. + x_int32 = paddle.fluid.data( + name='x_int32', shape=[2, 3], dtype='int32') + self.assertRaises( + TypeError, + F.rrelu, + x=x_int32, + lower=self.lower_0, + upper=self.upper_0) + x_bool = paddle.fluid.data( + name='x_bool', shape=[2, 3], dtype='int32') + self.assertRaises( + TypeError, + F.rrelu, + x=x_bool, + lower=self.lower_0, + upper=self.upper_0) + # lower and upper must be float + x_fp32 = paddle.fluid.data( + name='x_fp32', shape=[2, 3], dtype='float32') + self.assertRaises(TypeError, F.rrelu, x=x_fp32, lower=0, upper=0.5) + self.assertRaises(TypeError, F.rrelu, x=x_fp32, lower=0.5, upper=1) + # lower and upper must be in (0, 1) + self.assertRaises( + ValueError, F.rrelu, x=x_fp32, lower=-1., upper=0.5) + self.assertRaises( + ValueError, F.rrelu, x=x_fp32, lower=0.5, upper=2.) 
+ # upper should not be less than lower + self.assertRaises( + ValueError, F.rrelu, x=x_fp32, lower=0.5, upper=0.2) + # support the input dtype is float16 + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[2, 3], dtype='float16') + F.rrelu(x=x_fp16, lower=self.lower_0, upper=self.upper_0) + + def test_error_layer(self): + def error_int_dtype(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 3]).astype("float64") + rrelu = paddle.nn.RReLU(2, 3) + rrelu(paddle.to_tensor(x)) + + def error_lower_dtype(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 3]).astype("float32") + rrelu = paddle.nn.RReLU(0, 0.5) + rrelu(paddle.to_tensor(x)) + + def error_upper_dtype(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 3]).astype("float32") + rrelu = paddle.nn.RReLU(0.5, 1) + rrelu(paddle.to_tensor(x)) + + def error_lower_range(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 3]).astype("float32") + rrelu = paddle.nn.RReLU(-1.0, 0.5) + rrelu(paddle.to_tensor(x)) + + def error_upper_range(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 3]).astype("float32") + rrelu = paddle.nn.RReLU(0.5, 2.0) + rrelu(paddle.to_tensor(x)) + + def error_lower_upper(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 3]).astype("float32") + rrelu = paddle.nn.RReLU(0.5, 0.2) + rrelu(paddle.to_tensor(x)) + + self.assertRaises(TypeError, error_int_dtype) + self.assertRaises(TypeError, error_lower_dtype) + self.assertRaises(TypeError, error_upper_dtype) + self.assertRaises(ValueError, error_lower_range) + self.assertRaises(ValueError, error_upper_range) + self.assertRaises(ValueError, error_lower_upper) + + +class RReluTest(OpTest): + def setUp(self): + self.op_type = "rrelu" + self.lower = 0.1 + self.upper = 0.3 + self.is_test = True + self.init_prams() + + def init_prams(self): + self.dtype = "float64" + self.x_shape = [2, 3, 4, 5] + + x_np = np.random.uniform(-1, 1, self.x_shape).astype(self.dtype) + out_np = ref_rrelu(x_np, self.lower, self.upper) + noise_np = np.ones(self.x_shape).astype(self.dtype) + noise_np[x_np < 0] = (self.lower + self.upper) / 2.0 + + self.inputs = {'X': x_np} + self.outputs = {'Out': out_np, 'Noise': noise_np} + self.attrs = { + 'lower': self.lower, + "upper": self.upper, + "is_test": self.is_test + } + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class RReluTrainingTest(OpTest): + def setUp(self): + self.op_type = "rrelu" + self.lower = 0.3 + self.upper = 0.3000009 + self.is_test = False + self.init_prams() + + +class RReluTrainingTest(OpTest): + def setUp(self): + self.op_type = "rrelu" + self.lower = 0.3 + self.upper = 0.3000009 + self.is_test = False + self.init_prams() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index bceee4b964a33..b4be291b0697f 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -51,6 +51,7 @@ from .layer.activation import ThresholdedReLU # noqa: F401 from .layer.activation import LogSoftmax # noqa: F401 from .layer.activation import Maxout # noqa: F401 +from .layer.activation import RReLU # noqa: F401 from .layer.common import Pad1D # noqa: F401 from .layer.common import Pad2D # noqa: F401 from .layer.common import ZeroPad2D # noqa: F401 @@ -313,4 +314,5 @@ def weight_norm(*args): 'MaxUnPool3D', 'HingeEmbeddingLoss', 'Identity', + 'RReLU', ] diff --git 
a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 68213d831c550..44acf32894588 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -47,6 +47,7 @@ from .activation import log_softmax # noqa: F401 from .activation import glu # noqa: F401 from .activation import gumbel_softmax # noqa: F401 +from .activation import rrelu # noqa: F401 from .common import dropout # noqa: F401 from .common import dropout2d # noqa: F401 from .common import dropout3d # noqa: F401 @@ -228,4 +229,5 @@ 'class_center_sample', 'sparse_attention', 'fold', + 'rrelu', ] diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 6970cf4962909..0dcc43565f25a 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -550,6 +550,122 @@ def prelu(x, weight, data_format="NCHW", name=None): return out +def rrelu(x, lower=1. / 8., upper=1. / 3., training=True, name=None): + r""" + rrelu activation. + + Applies the randomized leaky rectified liner unit function to improve generalization performance, + as described in the paper: + `Empirical Evaluation of Rectified Activations in Convolutional Network `_ + + During training, randomly samples the negative slope for activation values as described below: + + .. math:: + + rrelu(x)= + \left\{ + \begin{array}{rcl} + x, & & if \ x >= 0 \\ + a * x, & & otherwise \\ + \end{array} + \right. + + where :math:`x` is the input tensor, + :math:`a` is randomly sampled from uniform distribution in range (:math:`lower`, :math:`upper`), + + In the test phase, the negative slope will take the average value of :math:`lower` and :math:`upper`: + + .. math:: + + rrelu(x)= + \left\{ + \begin{array}{rcl} + x, & & if \ x >= 0 \\ + (lower + upper) * 0.5 * x, & & otherwise \\ + \end{array} + \right. + + where :math:`x` is the input tensor, + :math:`lower` and :math:`upper` are the bounds of uniform distribution. + + Parameters: + x (Tensor): The input Tensor with data type float16, float32, float64. + lower (float, optional): The lower bound of uniform distribution. Default: 0.125. + upper (float, optional): The upper bound of uniform distribution. Default: 0.333. + training (bool, optional): Current mode is in training or others. Default is True. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + :name: rrelu-example + + import paddle + import paddle.nn.functional as F + + input_tensor = paddle.to_tensor([[[[-2.0, 3.0, -4.0, 5.0], + [ 3.0, -4.0, 5.0, -6.0], + [-7.0, -8.0, 8.0, 9.0]], + [[ 1.0, -2.0, -3.0, 4.0], + [-5.0, 6.0, 7.0, -8.0], + [ 6.0, 7.0, 8.0, 9.0]]]], dtype='float32') + + out = F.rrelu(input_tensor, 0.1, 0.3) + #[[[[-0.20000899 3. -0.8810822 5. ] + # [ 3. -0.55175185 5. -1.0776101 ] + # [-1.0680687 -1.9896201 8. 9. ]] + # [[ 1. -0.5238267 -0.65515125 4. ] + # [-1.3766339 6. 7. -2.3465784 ] + # [ 6. 7. 8. 9. ]]]] + """ + + if not in_dynamic_mode(): + check_variable_and_dtype(x, 'X', ['float16', 'float32', 'float64'], + 'rrelu') + + if not isinstance(lower, float) or not isinstance(upper, float): + raise TypeError( + "The lower and upper values must be float type. Received: lower {}, upper {}.". 
+ format(lower, upper)) + + if lower < 0 or lower > 1: + raise ValueError( + "The lower value must be no less than zero or greater than one. Received: {}.". + format(lower)) + + if upper < lower: + raise ValueError( + "The upper value must be greater than lower value. Received: lower {}, upper {}.". + format(lower, upper)) + + if upper > 1: + raise ValueError( + "The upper value must be no greater than one. Received: {}.".format( + upper)) + + is_test = not training + + if _in_legacy_dygraph(): + out, noise = _C_ops.rrelu(x, 'lower', lower, 'upper', upper, 'is_test', + is_test) + return out + + helper = LayerHelper('rrelu', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + noise = helper.create_variable_for_type_inference(dtype=x.dtype) + attrs = {'lower': lower, 'upper': upper, 'is_test': is_test} + helper.append_op( + type='rrelu', + inputs={"X": x}, + outputs={"Out": out, + "Noise": noise}, + attrs=attrs) + return out + + def relu(x, name=None): """ relu activation. diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 31364f0281c8a..cca8c37645df6 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -26,6 +26,7 @@ from .activation import Sigmoid # noqa: F401 from .activation import Softmax # noqa: F401 from .activation import LogSoftmax # noqa: F401 +from .activation import RReLU # noqa: F401 from .activation import Softmax2D # noqa: F401 from .common import Bilinear # noqa: F401 from .common import Pad1D # noqa: F401 diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 7fd109843bede..1a3768e919042 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -436,6 +436,93 @@ def extra_repr(self): name_str) +class RReLU(Layer): + r""" + RReLU activation layer. + + Applies the randomized leaky rectified liner unit function to improve generalization performance, + as described in the paper: + `Empirical Evaluation of Rectified Activations in Convolutional Network `_ + + During training, randomly samples the negative slope for activation values as described below: + + .. math:: + + RReLU(x)= + \left\{ + \begin{array}{rcl} + x, & & if \ x >= 0 \\ + a * x, & & otherwise \\ + \end{array} + \right. + + where :math:`x` is the input tensor, + :math:`a` is randomly sampled from uniform distribution in range (:math:`lower`, :math:`upper`), + + In the test phase, the negative slope will take the average value of :math:`lower` and :math:`upper`: + + .. math:: + + RReLU(x)= + \left\{ + \begin{array}{rcl} + x, & & if \ x >= 0 \\ + (lower + upper) * 0.5 * x, & & otherwise \\ + \end{array} + \right. + + where :math:`x` is the input tensor, + :math:`lower` and :math:`upper` are the bounds of uniform distribution. + + Parameters: + lower (float, optional): The lower bound of uniform distribution. Default: 0.125. + upper (float, optional): The upper bound of uniform distribution. Default: 0.333. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. Default dtype is float32. + - output: Tensor with the same shape as input. + + Examples: + .. 
code-block:: python + :name: RReLU-example + + import paddle + + input_tensor = paddle.to_tensor([[[[-2.0, 3.0, -4.0, 5.0], + [ 3.0, -4.0, 5.0, -6.0], + [-7.0, -8.0, 8.0, 9.0]], + [[ 1.0, -2.0, -3.0, 4.0], + [-5.0, 6.0, 7.0, -8.0], + [ 6.0, 7.0, 8.0, 9.0]]]], dtype='float32') + + rrelu_layer = paddle.nn.RReLU(0.1, 0.3) + output = rrelu_layer(input_tensor) + #[[[[-0.20000899 3. -0.88108218 5. ] + # [ 3. -0.55175185 5. -1.07761011] + # [-1.06806871 -1.98962009 8. 9. ]] + # [[ 1. -0.52382672 -0.65515128 4. ] + # [-1.37663394 6. 7. -2.34657836] + # [ 6. 7. 8. 9. ]]]] + """ + + def __init__(self, lower=1. / 8., upper=1. / 3., name=None): + super(RReLU, self).__init__() + self._lower = lower + self._upper = upper + self._name = name + + def forward(self, x): + return F.rrelu( + x, lower=self._lower, upper=self._upper, training=self.training) + + def extra_repr(self): + name_str = ', name={}'.format(self._name) if self._name else '' + return 'lower={}, upper={}, training={}, dtype={}{}'.format( + self._lower, self._upper, self.training, self._dtype, name_str) + + class ReLU(Layer): """ ReLU Activation. diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 6067b40f0a7c1..95c5ecf713112 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -399,6 +399,7 @@ 'test_positive_negative_pair_op', 'test_precision_recall_op', 'test_prelu_op', + 'test_rrelu_op', 'test_prelu_mkldnn_op', 'test_print_op', 'test_prior_box_op', From f9e55dee9cc1c7cac70bd87200d228aec931deea Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Tue, 31 May 2022 14:52:36 +0800 Subject: [PATCH 091/109] [NPU] fix arg_max and reduce_max (#42887) * fix arg_max and reduce_max * add arg_max ut --- paddle/fluid/operators/arg_max_op_npu.cc | 9 ++++++- .../operators/reduce_ops/reduce_max_op_npu.cc | 24 +++++++++++++++-- .../unittests/npu/test_arg_max_op_npu.py | 27 +++++++++++++++++++ 3 files changed, 57 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/arg_max_op_npu.cc b/paddle/fluid/operators/arg_max_op_npu.cc index 680183b6adf40..5c6b276c0172a 100644 --- a/paddle/fluid/operators/arg_max_op_npu.cc +++ b/paddle/fluid/operators/arg_max_op_npu.cc @@ -34,11 +34,18 @@ struct VisitDataArgNPUMaxFunctor { out.template mutable_data(ctx.GetPlace()); auto axis = ctx.Attr("axis"); auto dtype = ctx.Attr("dtype"); + const bool& flatten = ctx.Attr("flatten"); + + Tensor transformed_x(x.type()); + transformed_x.ShareDataWith(x); + if (flatten) { + transformed_x.Resize(phi::make_ddim({x.numel()})); + } auto stream = ctx.template device_context().stream(); NpuOpRunner runner; runner.SetType("ArgMaxV2") - .AddInput(x) + .AddInput(transformed_x) .AddInput(std::vector{axis}) .AddOutput(out) .AddAttrDataType("dtype", dtype) diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc index 04660fb501142..e3d8d15a305a9 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc @@ -112,6 +112,8 @@ class ReduceMaxGradNPUKernel : public framework::OpKernel { auto* x = context.Input("X"); auto* out = context.Input("Out"); auto* out_grad = context.Input(framework::GradVarName("Out")); + auto reduce_dims = context.Attr>("dim"); + bool reduce_all = context.Attr("reduce_all"); int in_dtype = context.Attr("in_dtype"); PADDLE_ENFORCE_EQ( @@ -129,12 +131,30 @@ class ReduceMaxGradNPUKernel : public framework::OpKernel { // broadcast auto x_dims_vec = 
phi::vectorize(x->dims()); + if (reduce_all) { + reduce_dims.clear(); + for (size_t d = 0; d < x_dims_vec.size(); ++d) { + reduce_dims.push_back(static_cast(d)); + } + } + + Tensor tmp_out, tmp_out_grad; + auto tmp_out_dims_vec = x_dims_vec; + for (auto d : reduce_dims) { + tmp_out_dims_vec[d] = 1; + } + + tmp_out.ShareDataWith(*out); + tmp_out.Resize(phi::make_ddim(tmp_out_dims_vec)); + tmp_out_grad.ShareDataWith(*out_grad); + tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec)); + Tensor transformed_out(x->type()); transformed_out.Resize(phi::make_ddim(x_dims_vec)); transformed_out.mutable_data(place); NpuOpRunner r_brd_out; r_brd_out.SetType("BroadcastTo") - .AddInput(*out) + .AddInput(tmp_out) .AddInput(std::move(x_dims_vec)) .AddOutput(transformed_out) .Run(stream); @@ -143,7 +163,7 @@ class ReduceMaxGradNPUKernel : public framework::OpKernel { transformed_out_grad.mutable_data(place); NpuOpRunner r_brd_out_grad; r_brd_out_grad.SetType("BroadcastTo") - .AddInput(*out_grad) + .AddInput(tmp_out_grad) .AddInput(std::move(x_dims_vec)) .AddOutput(transformed_out_grad) .Run(stream); diff --git a/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py index 85ade1179b7d6..c6135383721e1 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_arg_max_op_npu.py @@ -328,5 +328,32 @@ def run(place): run(place) +class TestArgMaxAPI_3(unittest.TestCase): + def initTestCase(self): + self.dims = (1, 9) + self.dtype = 'float32' + + def setUp(self): + self.initTestCase() + self.__class__.use_npu = True + self.place = [paddle.NPUPlace(0)] + + def test_dygraph_api(self): + def run(place): + paddle.disable_static(place) + np.random.seed(2021) + numpy_input = (np.random.random(self.dims)).astype(self.dtype) + tensor_input = paddle.to_tensor(numpy_input) + numpy_output = np.argmax(numpy_input).reshape([1]) + paddle_output = paddle.argmax(tensor_input) + self.assertEqual( + np.allclose(numpy_output, paddle_output.numpy()), True) + self.assertEqual(numpy_output.shape, paddle_output.numpy().shape) + paddle.enable_static() + + for place in self.place: + run(place) + + if __name__ == '__main__': unittest.main() From e9589e354fd90965272bc5fed18303037179f3bc Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 31 May 2022 15:26:28 +0800 Subject: [PATCH 092/109] [Eager] Polish append op using for model perf (#43102) * polish append op using * fix var error * fix group norm impl --- paddle/fluid/pybind/op_function_generator.h | 6 ++++ python/paddle/fluid/dygraph/nn.py | 14 +++++---- python/paddle/fluid/framework.py | 4 +++ python/paddle/fluid/layers/nn.py | 34 +++++++++++++++++++-- python/paddle/fluid/layers/tensor.py | 21 ++++++++----- python/paddle/nn/layer/norm.py | 25 ++++++++++++--- python/paddle/tensor/creation.py | 21 ++++++++----- 7 files changed, 96 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index bc84863d7d607..972e8aafab758 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -125,6 +125,11 @@ std::map> op_ins_map = { {"X", "Scale", "Bias", "Mean", "Variance", "MomentumTensor"}}, {"inplace_abn", {"X", "Scale", "Bias", "Mean", "Variance", "MomentumTensor"}}, + {"linear_interp", {"X", "OutSize"}}, + {"bilinear_interp", {"X", "OutSize"}}, + {"trilinear_interp", {"X", "OutSize"}}, + {"nearest_interp", {"X", "OutSize"}}, 
+ {"bicubic_interp", {"X", "OutSize"}}, }; // NOTE(zhiqiu): Like op_ins_map. @@ -270,6 +275,7 @@ std::map> op_passing_outs_map = { {"split", {"Out"}}, {"concat", {"Out"}}, {"fused_multi_transformer", {"CacheKVOut"}}, + {"group_norm", {"Mean", "Variance"}}, }; // NOTE(pangyoki): Tensor View Strategy. diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index a3310f1a46ce4..4d985097088f8 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -3016,9 +3016,15 @@ def __init__(self, is_bias=True) def forward(self, input): - if in_dygraph_mode(): + mean_out = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + variance_out = self._helper.create_variable_for_type_inference( + dtype=self._dtype, stop_gradient=True) + + if _non_static_mode(): attrs = ('epsilon', self._epsilon, 'groups', self._groups) - out, _, _ = _C_ops.group_norm(input, self.weight, self.bias, *attrs) + out, _, _ = _C_ops.group_norm(input, self.weight, self.bias, + mean_out, variance_out, *attrs) return dygraph_utils._append_activation_in_dygraph(out, self._act) else: @@ -3029,10 +3035,6 @@ def forward(self, input): inputs['Scale'] = self.weight # create output - mean_out = self._helper.create_variable_for_type_inference( - dtype=self._dtype, stop_gradient=True) - variance_out = self._helper.create_variable_for_type_inference( - dtype=self._dtype, stop_gradient=True) group_norm_out = self._helper.create_variable_for_type_inference( dtype=self._dtype) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 757b1a2da95b9..bd453b3ddaa00 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -3600,6 +3600,10 @@ def append_op(self, *args, **kwargs): attrs = kwargs.get("attrs", {}) inplace_map = kwargs.get("inplace_map", None) type = kwargs.get("type", None) + warnings.warn( + "Op `%s` is executed through `append_op` under the dynamic mode, " + "the corresponding API implementation needs to be upgraded to " + "using `_C_ops` method." 
% type, DeprecationWarning) op = Operator( block=self, desc=None, diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 97506ead5fad4..3391654f93117 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -7793,10 +7793,18 @@ def _is_list_or_turple_(data): } if out_shape is not None: - if isinstance(out_shape, Variable): + if isinstance(out_shape, Variable) and not _non_static_mode(): out_shape.stop_gradient = True inputs['OutSize'] = out_shape else: + if _non_static_mode(): + if isinstance(out_shape, Variable): + out_shape = list(out_shape.numpy()) + else: + out_shape = list(out_shape) + for i, dim in enumerate(out_shape): + if isinstance(dim, Variable): + out_shape[i] = dim.numpy()[0] if not (_is_list_or_turple_(out_shape)): raise TypeError( "out_shape should be a list or tuple or Variable.") @@ -7863,7 +7871,9 @@ def _is_list_or_turple_(data): attrs['out_w'] = out_shape[2] else: - if isinstance(scale, Variable): + if _non_static_mode() and isinstance(scale, Variable): + scale = scale.numpy() + elif isinstance(scale, Variable): scale.stop_gradient = True inputs["Scale"] = scale elif isinstance(scale, float) or isinstance(scale, int): @@ -7883,6 +7893,26 @@ def _is_list_or_turple_(data): inputs["OutSize"] = actual_shape elif actual_shape is not None: raise TypeError("actual_shape should either be Variable or None.") + + if _non_static_mode(): + attr_list = [] + for k, v in attrs.items(): + attr_list.append(k) + attr_list.append(v) + dy_attr = tuple(attr_list) + + if resample_type == "linear": + out = _C_ops.linear_interp(input, actual_shape, *dy_attr) + elif resample_type == "bilinear": + out = _C_ops.bilinear_interp(input, actual_shape, *dy_attr) + elif resample_type == "trilinear": + out = _C_ops.trilinear_interp(input, actual_shape, *dy_attr) + elif resample_type == "nearest": + out = _C_ops.nearest_interp(input, actual_shape, *dy_attr) + elif resample_type == "bicubic": + out = _C_ops.bicubic_interp(input, actual_shape, *dy_attr) + return out + out = helper.create_variable_for_type_inference(dtype) helper.append_op( type='{}_interp'.format(resample_type), diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index b02c154584e9c..3b1fcc15ab95f 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -681,14 +681,19 @@ def assign(input, output=None): "saving it to file and 'load_op' to load it") if output is None: output = helper.create_variable_for_type_inference(dtype=dtype) - helper.append_op( - type='assign_value', - outputs={'Out': [output]}, - attrs={ - 'dtype': dtype, - 'shape': list(input.shape), - value_name: values - }) + if _non_static_mode(): + _C_ops.assign_value(output, 'shape', + list(input.shape), 'dtype', dtype, value_name, + values) + else: + helper.append_op( + type='assign_value', + outputs={'Out': [output]}, + attrs={ + 'dtype': dtype, + 'shape': list(input.shape), + value_name: values + }) if is_inplace and _non_static_mode(): output._bump_inplace_version() diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 7c3e3ad8dee9f..6cdfc36d5d61f 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -32,7 +32,7 @@ from ...fluid.dygraph import BatchNorm # noqa: F401 from ...fluid.dygraph import SpectralNorm # noqa: F401 -from ...framework import get_default_dtype, set_default_dtype +from ...framework import get_default_dtype, set_default_dtype, _non_static_mode from ..initializer 
import Constant from ...framework import ParamAttr @@ -404,6 +404,25 @@ def __init__(self, self.bias.stop_gradient = self._bias_attr != None and self._bias_attr.learning_rate == 0. def forward(self, input): + mean_out = self._helper.create_variable_for_type_inference( + dtype=input.dtype, stop_gradient=True) + variance_out = self._helper.create_variable_for_type_inference( + dtype=input.dtype, stop_gradient=True) + + if _non_static_mode(): + pre_act, _, _ = _C_ops.group_norm( + input, + self.weight, + self.bias, + mean_out, + variance_out, + 'epsilon', + self._epsilon, + 'groups', + self._num_groups, ) + return dygraph_utils._append_activation_in_dygraph( + pre_act, act=None) + inputs = {'X': input} if self.bias is not None: inputs['Bias'] = self.bias @@ -411,10 +430,6 @@ def forward(self, input): inputs['Scale'] = self.weight # create output - mean_out = self._helper.create_variable_for_type_inference( - dtype=input.dtype, stop_gradient=True) - variance_out = self._helper.create_variable_for_type_inference( - dtype=input.dtype, stop_gradient=True) group_norm_out = self._helper.create_variable_for_type_inference( dtype=input.dtype) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index d3430ba81b859..e37ca981f851c 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -1568,14 +1568,19 @@ def assign(x, output=None): if output is None: output = helper.create_variable_for_type_inference( dtype=input.dtype) - helper.append_op( - type='assign_value', - outputs={'Out': [output]}, - attrs={ - 'dtype': dtype, - 'shape': list(input.shape), - value_name: values - }) + if _non_static_mode(): + _C_ops.assign_value(output, 'shape', + list(input.shape), 'dtype', dtype, value_name, + values) + else: + helper.append_op( + type='assign_value', + outputs={'Out': [output]}, + attrs={ + 'dtype': dtype, + 'shape': list(input.shape), + value_name: values + }) if is_inplace and _in_legacy_dygraph(): output._bump_inplace_version() From 632027d74a3199e89bde2568a6ab344777fd7be3 Mon Sep 17 00:00:00 2001 From: BrilliantYuKaimin <91609464+BrilliantYuKaimin@users.noreply.github.com> Date: Tue, 31 May 2022 15:54:47 +0800 Subject: [PATCH 093/109] test=document_fix (#42922) --- python/paddle/nn/initializer/normal.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/paddle/nn/initializer/normal.py b/python/paddle/nn/initializer/normal.py index 6fee5058057cb..2d0cd77ee17e9 100644 --- a/python/paddle/nn/initializer/normal.py +++ b/python/paddle/nn/initializer/normal.py @@ -60,19 +60,19 @@ def __init__(self, mean=0.0, std=1.0, name=None): class TruncatedNormal(TruncatedNormalInitializer): - """The Random TruncatedNormal (Gaussian) distribution initializer. + """The truncated normal distribution (Gaussian distribution) initializer. Args: - mean (float, optional): mean of the normal distribution. The default value is 0.0. - std (float, optional): standard deviation of the normal distribution. The default value is 1.0. - name(str, optional): The default value is None. Normally there is no need for user to set this - property. For more information, please refer to :ref:`api_guide_Name`. + mean (float, optional): Mean of the normal distribution. The default value is :math:`0.0`. + std (float, optional): Standard deviation of the normal distribution. The default value is :math:`1.0`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. 
Returns: - A parameter initialized by Random TruncatedNormal (Gaussian) distribution. + A parameter initialized by truncated normal distribution (Gaussian distribution). Examples: .. code-block:: python + :name: initializer_TruncatedNormal-example import paddle From 48409529b68b5767e2465222a235700ec25a367d Mon Sep 17 00:00:00 2001 From: David Nicolas <37790151+liyongchao911@users.noreply.github.com> Date: Tue, 31 May 2022 15:54:55 +0800 Subject: [PATCH 094/109] update RandomCrop class code annotation; test=document_fix (#42428) * update RandomCrop class code annotation; test=document_fix * update adjust_brightness api in functional.py test=document_fix * udpate uniform api in random.py * update transforms.py --- python/paddle/tensor/random.py | 28 ++++++-------- python/paddle/vision/transforms/functional.py | 9 ++++- python/paddle/vision/transforms/transforms.py | 38 ++++++++++++++----- 3 files changed, 47 insertions(+), 28 deletions(-) diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 1194d81a360db..49671d65b6d44 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -30,7 +30,7 @@ def bernoulli(x, name=None): """ - This OP returns a Tensor filled with random binary(0 or 1) number from a Bernoulli distribution. + Returns a Tensor filled with random binary(0 or 1) number from a Bernoulli distribution. The input ``x`` is a tensor with probabilities for generating the random binary number. Each element in ``x`` should be in [0, 1], and the out is generated by: @@ -86,7 +86,7 @@ def bernoulli(x, name=None): def poisson(x, name=None): r""" - This OP returns a tensor filled with random number from a Poisson Distribution. + Returns a tensor filled with random number from a Poisson Distribution. .. math:: @@ -129,7 +129,7 @@ def poisson(x, name=None): def multinomial(x, num_samples=1, replacement=False, name=None): """ - This OP returns a Tensor filled with random values sampled from a Multinomical + Returns a Tensor filled with random values sampled from a Multinomical distribution. The input ``x`` is a tensor with probabilities for generating the random number. Each element in ``x`` should be larger or equal to 0, but not all 0. ``replacement`` indicates whether it is a replaceable sample. If ``replacement`` @@ -278,7 +278,7 @@ def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None): def standard_normal(shape, dtype=None, name=None): """ - This OP returns a Tensor filled with random values sampled from a standard + Returns a Tensor filled with random values sampled from a standard normal distribution with mean 0 and standard deviation 1, with ``shape`` and ``dtype``. @@ -387,7 +387,7 @@ def randn(shape, dtype=None, name=None): def normal(mean=0.0, std=1.0, shape=None, name=None): """ - This OP returns a Tensor filled with random values sampled from a normal + Returns a Tensor filled with random values sampled from a normal distribution with ``mean`` and ``std`` (standard deviation) . If ``mean`` is a Tensor, the output Tensor has the same shape and data type as ``mean``. @@ -475,7 +475,7 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): """ - This OP returns a Tensor filled with random values sampled from a uniform + Returns a Tensor filled with random values sampled from a uniform distribution in the range [``min``, ``max``), with ``shape`` and ``dtype``. 
Examples: @@ -505,20 +505,16 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): it will use the seed of the global default generator (which can be set by paddle.seed). Note that if seed is not 0, this operator will always generate the same random numbers every time. Default is 0. - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. + name(str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: A Tensor filled with random values sampled from a uniform distribution in the range [``min``, ``max``), with ``shape`` and ``dtype``. - Raises: - TypeError: If ``shape`` is not list, tuple, Tensor. - TypeError: If ``dtype`` is not float32, float64. - Examples: .. code-block:: python + :name: code-example1 import paddle @@ -625,7 +621,7 @@ def uniform_(x, min=-1.0, max=1.0, seed=0, name=None): def randint(low=0, high=None, shape=[1], dtype=None, name=None): """ - This OP returns a Tensor filled with random integers from a discrete uniform + Returns a Tensor filled with random integers from a discrete uniform distribution in the range [``low``, ``high``), with ``shape`` and ``dtype``. If ``high`` is None (the default), the range is [0, ``low``). @@ -731,7 +727,7 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): def randint_like(x, low=0, high=None, dtype=None, name=None): """ - This OP returns a Tensor filled with random integers from a discrete uniform + Returns a Tensor filled with random integers from a discrete uniform distribution in the range [``low``, ``high``), with the same shape as ``x``. (use ``dtype`` if ``dtype`` is not None) If ``high`` is None (the default), the range is [0, ``low``). @@ -957,7 +953,7 @@ def randperm(n, dtype="int64", name=None): def rand(shape, dtype=None, name=None): """ - This OP returns a Tensor filled with random values sampled from a uniform + Returns a Tensor filled with random values sampled from a uniform distribution in the range [0, 1), with ``shape`` and ``dtype``. Args: diff --git a/python/paddle/vision/transforms/functional.py b/python/paddle/vision/transforms/functional.py index 90fba1c4130e5..7927e9faee370 100644 --- a/python/paddle/vision/transforms/functional.py +++ b/python/paddle/vision/transforms/functional.py @@ -380,6 +380,7 @@ def adjust_brightness(img, brightness_factor): Examples: .. code-block:: python + :name: code-example1 import numpy as np from PIL import Image @@ -388,9 +389,13 @@ def adjust_brightness(img, brightness_factor): fake_img = (np.random.rand(256, 300, 3) * 255.).astype('uint8') fake_img = Image.fromarray(fake_img) + print(fake_img.size) # (300, 256) + print(fake_img.load()[1,1]) # (95, 127, 202) + converted_img = F.adjust_brightness(fake_img, 0.5) + print(converted_img.size) # (300, 256) + print(converted_img.load()[1,1]) # (47, 63, 101) + - converted_img = F.adjust_brightness(fake_img, 0.4) - print(converted_img.size) """ if not (_is_pil_image(img) or _is_numpy_image(img) or _is_tensor_image(img)): diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index fea2efb1fb2b1..31f56e890558c 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -1042,14 +1042,32 @@ class RandomCrop(BaseTransform): size (sequence|int): Desired output size of the crop. 
If size is an int instead of sequence like (h, w), a square crop (size, size) is made. - padding (int|sequence|optional): Optional padding on each border + padding (int|sequence, optional): Optional padding on each border of the image. If a sequence of length 4 is provided, it is used to pad left, - top, right, bottom borders respectively. Default: 0. - pad_if_needed (boolean|optional): It will pad the image if smaller than the + top, right, bottom borders respectively. Default: None, without padding. + pad_if_needed (boolean, optional): It will pad the image if smaller than the desired size to avoid raising an exception. Default: False. + fill (float|tuple, optional): Pixel fill value for constant fill. If a tuple of + length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant. Default: 0. + padding_mode: Type of padding. Should be: constant, edge, reflect or symmetric. Default: 'constant'. + + - constant: pads with a constant value, this value is specified with fill + + - edge: pads with the last value on the edge of the image + + - reflect: pads with reflection of image (without repeating the last value on the edge) + + padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + + - symmetric: pads with reflection of image (repeating the last value on the edge) + + padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. - Shape: + Shape - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C). - output(PIL.Image|np.ndarray|Paddle.Tensor): A random cropped image. @@ -1059,17 +1077,17 @@ class RandomCrop(BaseTransform): Examples: .. 
code-block:: python + :name: code-example1 - import numpy as np - from PIL import Image + import paddle from paddle.vision.transforms import RandomCrop - transform = RandomCrop(224) - fake_img = Image.fromarray((np.random.rand(324, 300, 3) * 255.).astype(np.uint8)) + fake_img = paddle.randint(0, 255, shape=(3, 324,300), dtype = 'int32') + print(fake_img.shape) # [3, 324, 300] - fake_img = transform(fake_img) - print(fake_img.size) + crop_img = transform(fake_img) + print(crop_img.shape) # [3, 224, 224] """ def __init__(self, From e680d581c4ff906e84ae273d2c2b3dbee96ee9db Mon Sep 17 00:00:00 2001 From: yaozhixin Date: Tue, 31 May 2022 16:25:40 +0800 Subject: [PATCH 095/109] [IPU] support paddle.distributed.launch with IPUs (#43087) * [IPU] support paddle.distributed.launch with IPUs * add device_num to env_args_mapping --- .../distributed/launch/context/args_envs.py | 7 + .../distributed/launch/context/device.py | 12 +- .../launch/controllers/collective.py | 6 +- python/paddle/distributed/launch/main.py | 4 +- .../distributed/launch/plugins/__init__.py | 18 +- .../distributed/launch/utils/ipu_launch.py | 167 ++++++++++++++++ .../unittests/ipu/distributed/run_dist_ipu.sh | 80 ++++++++ .../test_dist_data_parallel_ipu.py | 184 ++++++++++++++++++ .../distributed/test_dist_pod128_sample.py | 111 +++++++++++ .../ipu/distributed/test_dist_sample.py | 177 +++++++++++++++++ 10 files changed, 761 insertions(+), 5 deletions(-) create mode 100644 python/paddle/distributed/launch/utils/ipu_launch.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/distributed/run_dist_ipu.sh create mode 100644 python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_data_parallel_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_pod128_sample.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py diff --git a/python/paddle/distributed/launch/context/args_envs.py b/python/paddle/distributed/launch/context/args_envs.py index ea8bf3d597a79..b70dd7d3f759f 100644 --- a/python/paddle/distributed/launch/context/args_envs.py +++ b/python/paddle/distributed/launch/context/args_envs.py @@ -35,6 +35,7 @@ 'PADDLE_TRAINERS_ENDPOINTS': 'trainers', 'PADDLE_GLOO_PORT': 'gloo_port', 'PADDLE_WITH_GLOO': 'with_gloo', + 'PADDLE_DEVICE_NUM': 'device_num' } @@ -100,6 +101,12 @@ def parse_args(): default=None, help="accelerate devices. 
as --gpus,npus,xps") + base_group.add_argument( + "--device_num", + type=int, + default=None, + help="the number of accelerate devices.") + base_group.add_argument("--host", type=str, default=None, help="host ip") base_group.add_argument( diff --git a/python/paddle/distributed/launch/context/device.py b/python/paddle/distributed/launch/context/device.py index 30b8cc1538590..61ffe8e809564 100644 --- a/python/paddle/distributed/launch/context/device.py +++ b/python/paddle/distributed/launch/context/device.py @@ -21,6 +21,7 @@ class DeviceType: XPU = 'xpu' NPU = 'npu' MLU = 'mlu' + IPU = 'ipu' class Device(object): @@ -68,12 +69,18 @@ def get_selected_device_key(self): return 'FLAGS_selected_xpus' if self._dtype == DeviceType.MLU: return 'FLAGS_selected_mlus' + if self._dtype == DeviceType.IPU: + return 'FLAGS_selected_ipus' return 'FLAGS_selected_devices' - def get_selected_devices(self, devices=''): + def get_selected_devices(self, devices='', device_num=None): ''' return the device label/id relative to the visible devices ''' + if self._dtype == DeviceType.IPU: + if not device_num: + raise RuntimeError("The \'device_num\' is required by IPUs.") + return [str(device_num)] if not devices: return [str(x) for x in range(0, len(self._labels))] else: @@ -129,6 +136,9 @@ def detect_device(self): dev._dtype = DeviceType.MLU num = fluid.core.get_mlu_device_count() visible_devices = os.getenv("MLU_VISIBLE_DEVICES") + elif fluid.core.is_compiled_with_ipu(): + dev._dtype = DeviceType.IPU + num = fluid.core.get_ipu_device_count() if num == 0: dev._dtype = DeviceType.CPU diff --git a/python/paddle/distributed/launch/controllers/collective.py b/python/paddle/distributed/launch/controllers/collective.py index 5225fd6e81ff1..166eb3a4f9dfd 100644 --- a/python/paddle/distributed/launch/controllers/collective.py +++ b/python/paddle/distributed/launch/controllers/collective.py @@ -79,7 +79,8 @@ def build_pod(self): self.pod.reset() selected_dev_key = self.ctx.node.device.get_selected_device_key() selected_dev_list = self.ctx.node.device.get_selected_devices( - self.ctx.args.devices) + self.ctx.args.devices, self.ctx.args.device_num) + for i in range(self.pod.replicas): e = { "PADDLE_MASTER": collective_master, @@ -95,7 +96,8 @@ def build_pod(self): "PADDLE_TRAINERS_NUM": "{}".format(global_size), "PADDLE_RANK_IN_NODE": str(i), } - if self.pod.replicas == 1: + + if self.pod.replicas == 1 or self.ctx.node.device.dtype == "ipu": e.update({selected_dev_key: ",".join(selected_dev_list)}) else: e.update({selected_dev_key: selected_dev_list[i]}) diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py index b2c87e737c82d..92585c9e7657a 100644 --- a/python/paddle/distributed/launch/main.py +++ b/python/paddle/distributed/launch/main.py @@ -52,7 +52,9 @@ def launch(): - ``--job_id``: The job unique id, it affects the log files' name. e.g., ``--job_id=job1``. Default ``--job_id=default``. - - ``--devices``: The selected accelerate devices on nodes, can be gpu/xpu/npu/mlu etc.. e.g., ``--devices=0,1,2,3`` will launch four training processes each bound to one device. + - ``--devices``: The selected accelerate devices on nodes, can be gpu/xpu/npu/mlu/ipu etc.. e.g., ``--devices=0,1,2,3`` will launch four training processes each bound to one device. + + - ``--device_num``: The number of selected accelerate devices on nodes, can be gpu/xpu/npu/mlu/ipu etc.. e.g., ``--device_num=4`` will require four devices per node. 
- ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``training.py`` diff --git a/python/paddle/distributed/launch/plugins/__init__.py b/python/paddle/distributed/launch/plugins/__init__.py index 13c09b4c27c26..faa8f2823733c 100644 --- a/python/paddle/distributed/launch/plugins/__init__.py +++ b/python/paddle/distributed/launch/plugins/__init__.py @@ -25,6 +25,20 @@ def log(ctx): ctx.logger.info("--------------------------------------------------") +def rewrite_ipu_script(ctx): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_ipu(): + import os + if ctx.args.training_script != "ipu": + raise RuntimeError( + "Only support to run the script \'ipu\' for IPU distributed computing." + ) + ctx.args.training_script = os.path.abspath( + os.path.join( + os.path.dirname(os.path.dirname(__file__)), + "utils/ipu_launch.py")) + + def process_args(ctx): # reset device by args #argdev = ctx.args.gpus or ctx.args.xpus or ctx.args.npus @@ -60,4 +74,6 @@ def rewrite_host_ip(ctx): ctx.node.ip = ctx.args.host -enabled_plugins = [collective_compatible, rewrite_host_ip, process_args] +enabled_plugins = [ + collective_compatible, rewrite_host_ip, process_args, rewrite_ipu_script +] diff --git a/python/paddle/distributed/launch/utils/ipu_launch.py b/python/paddle/distributed/launch/utils/ipu_launch.py new file mode 100644 index 0000000000000..595243cdf9d9c --- /dev/null +++ b/python/paddle/distributed/launch/utils/ipu_launch.py @@ -0,0 +1,167 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid + +import subprocess +import argparse +import os +import logging +import sys + + +class IPULaunch(object): + def __init__(self, hosts, ipus_per_replica, nproc_per_host, ipu_partition, + vipu_server, training_script, training_script_args): + if not fluid.core.is_compiled_with_ipu(): + raise RuntimeError( + "Can not call ipu_launch.py in non IPU compiled environment, please re-compile with WITH_IPU=ON." 
+ ) + self._hosts = hosts + self._ipus_per_replica = ipus_per_replica + self._nproc_per_host = nproc_per_host + self._ipu_partition = ipu_partition + self._vipu_server = vipu_server + self._training_script = training_script + self._training_script_args = training_script_args + + self._num_ipus = int(os.getenv("FLAGS_selected_ipus")) + self.logger = self.get_logger() + + @classmethod + def parse_ipu_args(self): + parser = argparse.ArgumentParser() + parser.add_argument( + "--hosts", + type=str, + help="The hosts for IPU PopRun distributd computing.") + parser.add_argument( + "--ipus_per_replica", + type=int, + help="The number of IPUs per replica.") + parser.add_argument( + "--nproc_per_host", + type=int, + help="The number of processes per host.") + parser.add_argument( + "--ipu_partition", type=str, help="The partition name of IPU.") + parser.add_argument( + "--vipu_server", + type=str, + help="The vipu server host to enable vipu.") + parser.add_argument( + "training_script", + type=str, + help="The full path to the single IPU replica training program/script to be launched in parallel." + ) + parser.add_argument('training_script_args', nargs=argparse.REMAINDER) + args = parser.parse_args() + + ipu_launch = IPULaunch( + hosts=args.hosts, + ipus_per_replica=args.ipus_per_replica, + nproc_per_host=args.nproc_per_host, + ipu_partition=args.ipu_partition, + vipu_server=args.vipu_server, + training_script=args.training_script, + training_script_args=args.training_script_args, ) + + return ipu_launch + + def get_logger(self, level=logging.INFO): + logger = logging.getLogger("LAUNCH") + logger.setLevel(level) + formatter = logging.Formatter( + fmt='%(name)s %(levelname)s %(asctime)s %(message)s') + ch = logging.StreamHandler() + ch.setFormatter(formatter) + logger.addHandler(ch) + return logger + + def launch(self): + # The number of replicas for data parallel + assert (self._num_ipus % self._ipus_per_replica) == 0, \ + "The number of IPUs:{} mod the number of IPUs per replica:{} must == 0".format(self._num_ipus, self._ipus_per_replica) + num_replicas = self._num_ipus // self._ipus_per_replica + self.logger.info("The number of total replicas is {}.".format( + num_replicas)) + + # The number of processes + num_nodes = len(self._hosts.split(',')) + num_procs = num_nodes * self._nproc_per_host + self.logger.info("The number of total processes is {}.".format( + num_procs)) + assert (num_replicas % num_procs) == 0, \ + "The number of replicas:{} mod the number of processes:{} must == 0".format(num_replicas, num_procs) + + # hosts and endpoints + hosts = self._hosts.replace(' ', '').split(',') + endpoints = [x + ":8090" for x in hosts] + + # args for poprun + poprun_command = ['poprun'] + + poprun_command.append('--num-instances={}'.format(num_procs)) + poprun_command.append('--num-replicas={}'.format(num_replicas)) + poprun_command.append('--ipus-per-replica={}'.format( + self._ipus_per_replica)) + poprun_command.append('--host={}'.format(','.join(hosts))) + poprun_command.append('--vipu-partition={}'.format(self._ipu_partition)) + poprun_command.append('--vipu-server-host={}'.format(self._vipu_server)) + + poprun_command.extend([ + '--update-partition=no', '--vipu-server-timeout=120', + '--print-topology=yes', '--numa-aware=yes' + ]) + + # global envs + global_envs = '--mpi-local-args=\'' + log_level = os.getenv('POPART_LOG_LEVEL', None) + if log_level: + global_envs += '-x POPART_LOG_LEVEL={} '.format(log_level) + global_envs += '-x PADDLE_TRAINERS_NUM={} -x PADDLE_TRAINER_ENDPOINTS={}'.format( + 
num_procs, ','.join(endpoints)) + global_envs += '\'' + poprun_command.append(global_envs) + + # local envs + for idx in range(num_procs): + cur_endpoint = endpoints[idx // self._nproc_per_host] + rank_in_node = idx % self._nproc_per_host + poprun_command.append( + '--instance-mpi-local-args={}:\"-x PADDLE_TRAINER_ID={} -x PADDLE_CURRENT_ENDPOINT={} -x PADDLE_RANK_IN_NODE={}\"'. + format(idx, idx, cur_endpoint, rank_in_node)) + + # executor + poprun_command.append(sys.executable) + + # script and script args + poprun_command.append(self._training_script) + for arg in self._training_script_args: + poprun_command.append(arg) + + # for debug + print("----------- PopRun Command -----------") + for i in range(len(poprun_command) - 1): + print("%s \\" % (poprun_command[i])) + print("%s" % (poprun_command[len(poprun_command) - 1])) + print("---------------------------------------") + + # Launch + subprocess.run(" ".join(poprun_command), shell=True) + + +if __name__ == '__main__': + ipu_launch = IPULaunch.parse_ipu_args() + ipu_launch.launch() diff --git a/python/paddle/fluid/tests/unittests/ipu/distributed/run_dist_ipu.sh b/python/paddle/fluid/tests/unittests/ipu/distributed/run_dist_ipu.sh new file mode 100644 index 0000000000000..6f491ef107104 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/distributed/run_dist_ipu.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +partition_name=pod64 +vipu_server=10.137.96.62 +allclose_script=" +import sys +import numpy as np +data1 = np.loadtxt(\"ipu_res.txt\") +data2 = np.loadtxt(\"cpu_res.txt\") +if np.allclose(data1[::16], data2, atol=1e-6): + sys.exit(0) +else: + sys.exit(1) +" + +for opt in lamb sgd adam ; +do + for onchip in False True ; + do + for rts in False True ; + do + echo "Testcase: opt: ${opt}, onchip: ${onchip}, rts: ${rts}" + echo "paddle.distributed.fleet.launch test with IPUs..." + python3.7 -m paddle.distributed.launch \ + --device_num=8 \ + ipu \ + --hosts=localhost \ + --nproc_per_host=2 \ + --ipus_per_replica=2 \ + --ipu_partition=${partition_name} \ + --vipu_server=${vipu_server} \ + test_dist_data_parallel_ipu.py ${opt} ipu_res.txt ${onchip} ${rts} > ipu.log + echo "paddle.distributed.fleet.launch test with IPUs...Done" + + echo "paddle normal test with CPU..." + export POPLAR_IPUMODEL=1 + python3.7 test_dist_data_parallel_ipu.py ${opt} cpu_res.txt > cpu.log + unset POPLAR_IPUMODEL + echo "paddle normal test with CPU...Done" + + echo "Compare results..." + python3.7 -c """${allclose_script}""" + if [ $? -eq 0 ];then + echo "Compare results...Done" + else + echo "Error occurs. 
Please check ipu.log, cpu.log, ipu_res.txt and cpu_res.txt" + exit 0 + fi + done + done +done + +if [ -f "ipu.log" ]; then + rm "ipu.log" +fi +if [ -f "cpu.log" ]; then + rm "cpu.log" +fi +if [ -f "ipu_res.txt" ]; then + rm "ipu_res.txt" +fi +if [ -f "cpu_res.txt" ]; then + rm "cpu_res.txt" +fi diff --git a/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_data_parallel_ipu.py b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_data_parallel_ipu.py new file mode 100644 index 0000000000000..6054f2be7579e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_data_parallel_ipu.py @@ -0,0 +1,184 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import sys +import os +import random +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + +mpi_comm = None + + +@unittest.skip('Disable distributed tests on auto CI.') +class TestBase(IPUOpTest): + def set_attrs(self, enable_ipu, optimizer, log, onchip=False, rts=False): + self.ipu_options = { + "enable_pipelining": True, + "batches_per_step": 1, + "enable_gradient_accumulation": True, + "accumulation_factor": 4, + "enable_replicated_graphs": True, + "replicated_graph_count": 2, + "location_optimizer": { + "on_chip": onchip, + "use_replicated_tensor_sharding": rts + } + } + + self.cpu_bs = 16 + self.ipu_bs = 1 + self.optimizer = optimizer + self.log = log + self.enable_ipu = enable_ipu + + def test(self): + seed = 2021 + np.random.seed(seed) + random.seed(seed) + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = seed + startup_prog.random_seed = seed + + bs = self.ipu_bs if self.enable_ipu else self.cpu_bs + data = np.random.rand(1, 3, 10, 10).astype(np.float32) + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + image = paddle.static.data( + name='image', shape=[bs, 3, 10, 10], dtype='float32') + with paddle.static.ipu_shard_guard(index=0, stage=0): + conv1 = paddle.static.nn.conv2d( + image, num_filters=3, filter_size=3, bias_attr=False) + with paddle.static.ipu_shard_guard(index=1, stage=1): + conv2 = paddle.static.nn.conv2d( + conv1, num_filters=3, filter_size=3, bias_attr=False) + # should consider influence of bs + loss = paddle.mean(conv2) + + if self.optimizer == 'sgd': + opt = paddle.optimizer.SGD(learning_rate=1e-2) + elif self.optimizer == 'adam': + opt = paddle.optimizer.Adam(learning_rate=1e-2) + elif self.optimizer == 'lamb': + opt = paddle.optimizer.Lamb(learning_rate=1e-2) + else: + raise Exception('optimizer must be sgd, adam or lamb') + + opt.minimize(loss) + + if self.enable_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + executor = paddle.static.Executor(place) + executor.run(startup_prog) + + if self.enable_ipu: + feed_list = [image.name] + fetch_list = [loss.name] + 
ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config( + num_ipus=2 * self.ipu_options['replicated_graph_count'], + is_training=True, + enable_manual_shard=True) + ipu_strategy.set_options(self.ipu_options) + ipu_strategy.set_options({ + "enable_distribution": True, + "enable_distributed_replicated_graphs": True, + "global_replica_offset": + int(os.environ.get("PADDLE_TRAINER_ID")) * 2, + "global_replication_factor": 4 + }) + program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy).compile( + feed_list, fetch_list) + feed = { + "image": np.tile(data, [ + self.ipu_options['replicated_graph_count'] * + self.ipu_options['batches_per_step'] * + self.ipu_options['accumulation_factor'], 1, 1, 1 + ]) + } + + else: + program = main_prog + feed = {"image": np.tile(data, [self.cpu_bs, 1, 1, 1])} + + epoch = 10 + if not self.enable_ipu: + # global replication factor + epoch *= 4 + epoch *= self.ipu_options['batches_per_step'] + epoch *= self.ipu_options['accumulation_factor'] + epoch = epoch / (self.cpu_bs / self.ipu_bs) + + results = [] + for i in range(int(epoch)): + res = executor.run(program, feed=feed, fetch_list=[loss]) + if self.enable_ipu: + res = mpi_comm.gather(res, root=0) + results.append(res) + if self.enable_ipu: + if int(os.environ.get("PADDLE_TRAINER_ID")) == 0: + np.savetxt(self.log, np.array(results).flatten()) + else: + np.savetxt(self.log, np.array(results).flatten()) + + +if __name__ == "__main__": + paddle.enable_static() + # Run distributed tests + if len(sys.argv) == 5: + from mpi4py import MPI + + DISTRIBUTED_COMM = MPI.COMM_WORLD + + def _get_comm(): + global DISTRIBUTED_COMM + if DISTRIBUTED_COMM is None: + raise RuntimeError( + "Distributed Commumication not setup. Please run setup_comm(MPI.COMM_WORLD) first." + ) + return DISTRIBUTED_COMM + + mpi_comm = _get_comm() + + optimizer = sys.argv[1] + log = sys.argv[2] + onchip = True if sys.argv[3] == "True" else False + rts = True if sys.argv[4] == "True" else False + test = TestBase() + test.set_attrs( + enable_ipu=True, + optimizer=optimizer, + log=log, + onchip=onchip, + rts=rts) + test.test() + # Run cpu tests for compare + elif len(sys.argv) == 3: + test = TestBase() + test.set_attrs(enable_ipu=False, optimizer=sys.argv[1], log=sys.argv[2]) + test.test() + else: + raise ValueError( + "Only support 3 or 5 args. 3 for cpu test, 5 for ipu distributed test" + ) diff --git a/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_pod128_sample.py b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_pod128_sample.py new file mode 100644 index 0000000000000..44c26d123ba39 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_pod128_sample.py @@ -0,0 +1,111 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+''' +python3.7 -m paddle.distributed.launch \ +--device_num=128 \ +ipu \ +--hosts=host1,host2 \ +--ipus_per_host=2 \ +--nproc_per_host=1 \ +--ipu_partition=pod128 \ +--vipu_server=lr17-1-ctrl \ +python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_pod128_ipu.py + +Equal to: + +poprun \ +--host=localhost,host2 \ +--num-instances=2 \ +--num-replicas=64 \ +--ipus-per-replica=2 \ +--print-topology=yes \ +--vipu-partition=pod128_bert \ +--vipu-server-host=lr17-1-ctrl \ +--update-partition=yes \ +python3.7 python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_pod128_ipu.py +''' + +import os +import numpy as np +import paddle + + +def TestDistTraining(): + paddle.enable_static() + + attrs = {"size": [128, 16], "padding_idx": -1, "dtype": 'float32'} + + scope = paddle.fluid.core.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = 42 + startup_prog.random_seed = 42 + + np.random.seed(42) + input_data = np.random.uniform(0, 127, size=[128, 3, 2, 1]).astype(np.int32) + + with paddle.fluid.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data(name="x", shape=[3, 2, 1], dtype='int64') + with paddle.static.ipu_shard_guard(index=0, stage=0): + out = paddle.fluid.layers.embedding(x, **attrs) + with paddle.static.ipu_shard_guard(index=1, stage=1): + loss = paddle.mean(out) + opt = paddle.optimizer.Adam(learning_rate=1e-1) + opt.minimize(loss) + + feed_list = ["x"] + fetch_list = [loss.name] + + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config( + num_ipus=64, is_training=True, enable_manual_shard=True) + ipu_strategy.set_pipelining_config( + enable_pipelining=True, + batches_per_step=1, + enable_gradient_accumulation=True, + accumulation_factor=4) + ipu_strategy.set_options({ + "enable_distribution": True, + "enable_replicated_graphs": True, + "replicated_graph_count": 32, + "enable_distributed_replicated_graphs": True, + "global_replica_offset": + # Paddle : int(os.environ.get("PADDLE_TRAINER_ID")) * 32 + # PopRun : int(os.environ.get("POPDIST_REPLICA_INDEX_OFFSET")) + int(os.environ.get("PADDLE_TRAINER_ID")) * 32, + "global_replication_factor": 64, + "location_optimizer": { + "on_chip": False, + "use_replicated_tensor_sharding": True + } + }) + + ipu_program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy) + program = ipu_program.compile(feed_list, fetch_list) + + for i in range(10): + res = exe.run(program, + feed={"x": input_data}, + fetch_list=fetch_list) + print("index: {}, result: {}".format(i, res)) + + +if __name__ == "__main__": + TestDistTraining() diff --git a/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py new file mode 100644 index 0000000000000..6ca9222d914de --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py @@ -0,0 +1,177 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Single host: + +python3.7 -m paddle.distributed.launch \ +--device_num=4 \ +ipu \ +--hosts=localhost \ +--nproc_per_host=2 \ +--ipus_per_replica=1 \ +--ipu_partition=pod64 \ +--vipu_server=10.137.96.62 \ +python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_sample.py + +Equal to: + +poprun \ +--host=localhost \ +--num-instances=2 \ +--num-replicas=4 \ +--ipus-per-replica=1 \ +--print-topology=yes \ +python3.7 python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_sample.py +''' +''' +Multi hosts: + +python3.7 -m paddle.distributed.launch \ +--device_num=4 \ +ipu \ +--hosts=host1,host2 \ +--nproc_per_host=1 \ +--ipus_per_replica=1 \ +--ipu_partition=pod64 \ +--vipu_server=10.137.96.62 \ +python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_sample.py + +Equal to: + +poprun \ +--host=host1,host2 \ +--num-instances=2 \ +--num-replicas=4 \ +--ipus-per-replica=1 \ +--print-topology=yes \ +python3.7 python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_sample.py +''' + +import os +import sys +import paddle +import numpy as np + +mpi_comm = None + + +def Test(use_dist, file_name): + paddle.enable_static() + + attrs = {"size": [128, 16], "padding_idx": -1, "dtype": 'float32'} + + scope = paddle.fluid.core.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = 42 + startup_prog.random_seed = 42 + + with paddle.fluid.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data(name="x", shape=[3, 2, 1], dtype='int64') + + out = paddle.fluid.layers.embedding(x, **attrs) + loss = paddle.mean(out) + opt = paddle.optimizer.Adam(learning_rate=1e-1) + opt.minimize(loss) + + feed_list = ["x"] + fetch_list = [loss.name] + + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + ipu_strategy = paddle.static.IpuStrategy() + if use_dist: + ipu_strategy.set_graph_config(num_ipus=2, is_training=True) + # Set distributed envs + ipu_strategy.set_options({ + "enable_distribution": True, + "enable_replicated_graphs": True, + "replicated_graph_count": 2, + "enable_distributed_replicated_graphs": True, + "global_replica_offset": + int(os.environ.get("PADDLE_TRAINER_ID")) * 2, + "global_replication_factor": 4 + }) + else: + ipu_strategy.set_graph_config(num_ipus=4, is_training=True) + ipu_strategy.set_options({ + "enable_replicated_graphs": True, + "replicated_graph_count": 4, + }) + + ipu_program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy) + program = ipu_program.compile(feed_list, fetch_list) + + if use_dist: + if os.environ.get("PADDLE_TRAINER_ID") == "0": + input_data = np.concatenate([ + np.array([[[1], [3]], [[2], [4]], [[4], [127]]]) + .astype(np.int32), np.array( + [[[1], [3]], [[2], [4]], [[4], [127]]]).astype( + np.int32) + ]) + else: + input_data = np.concatenate([ + np.array([[[8], [60]], [[50], [77]], + [[90], [13]]]).astype(np.int32), + np.array([[[8], [60]], [[50], [77]], + [[90], [13]]]).astype(np.int32) + ]) + else: + input_data = np.concatenate([ + np.array([[[1], [3]], [[2], [4]], [[4], 
[127]]]).astype( + np.int32), np.array([[[1], [3]], [[2], [4]], + [[4], [127]]]).astype(np.int32), + np.array([[[8], [60]], [[50], [77]], [[90], [13]]]).astype( + np.int32), np.array([[[8], [60]], [[50], [77]], + [[90], [13]]]).astype(np.int32) + ]) + feed_data = {"x": input_data} + + for step in range(10): + res = exe.run(program, feed=feed_data, fetch_list=fetch_list) + + if use_dist: + if os.getenv("PADDLE_TRAINER_ID") == "0": + res = mpi_comm.gather(res, root=0) + np.savetxt(file_name, res) + else: + np.savetxt(file_name, res) + + +if __name__ == "__main__": + file_name = sys.argv[1] + + use_dist = False + if 'PADDLE_TRAINER_ID' in os.environ: + from mpi4py import MPI + + DISTRIBUTED_COMM = MPI.COMM_WORLD + + def _get_comm(): + global DISTRIBUTED_COMM + if DISTRIBUTED_COMM is None: + raise RuntimeError( + "Distributed Commumication not setup. Please run setup_comm(MPI.COMM_WORLD) first." + ) + return DISTRIBUTED_COMM + + mpi_comm = _get_comm() + use_dist = True + + Test(use_dist, file_name) From cb195fa0c349b0592974dbb206f0f708552db943 Mon Sep 17 00:00:00 2001 From: cambriconhsq <106155938+cambriconhsq@users.noreply.github.com> Date: Tue, 31 May 2022 16:28:38 +0800 Subject: [PATCH 096/109] [MLU] add mlu kernel for abs op (#43099) --- paddle/fluid/operators/abs_op_mlu.cc | 75 +++++++++++++++ .../tests/unittests/mlu/test_abs_op_mlu.py | 95 +++++++++++++++++++ 2 files changed, 170 insertions(+) create mode 100644 paddle/fluid/operators/abs_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_abs_op_mlu.py diff --git a/paddle/fluid/operators/abs_op_mlu.cc b/paddle/fluid/operators/abs_op_mlu.cc new file mode 100644 index 0000000000000..3a3a484ea775e --- /dev/null +++ b/paddle/fluid/operators/abs_op_mlu.cc @@ -0,0 +1,75 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class AbsMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + + output->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_desc(*input); + MLUCnnlTensorDesc output_desc(*output); + + MLUCnnl::Abs(ctx, input_desc.get(), GetBasePtr(input), output_desc.get(), + GetBasePtr(output)); + } +}; + +template +class AbsGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + + dx->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_desc(*x); + MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + + Tensor sign_x; + sign_x.mutable_data(x->dims(), ctx.GetPlace()); + + MLUCnnl::Sign(ctx, input_desc.get(), GetBasePtr(x), input_desc.get(), + GetBasePtr(&sign_x)); + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), input_desc.get(), + GetBasePtr(&sign_x), input_desc.get(), GetBasePtr(dout), + input_desc.get(), GetBasePtr(dx), ToCnnlDataType()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(abs, ops::AbsMLUKernel, + ops::AbsMLUKernel); + +REGISTER_OP_MLU_KERNEL(abs_grad, ops::AbsGradMLUKernel, + ops::AbsGradMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_abs_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_abs_op_mlu.py new file mode 100644 index 0000000000000..0c33bd6b1ade8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_abs_op_mlu.py @@ -0,0 +1,95 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append('..') +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +import paddle +import paddle.nn.functional as F + +paddle.enable_static() +np.random.seed(10) + + +class TestAbs(OpTest): + def setUp(self): + self.op_type = "abs" + self.set_mlu() + self.dtype = 'float32' + self.shape = [4, 25] + + np.random.seed(1024) + x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + # Because we set delta = 0.005 in calculating numeric gradient, + # if x is too small, such as 0.002, x_neg will be -0.003 + # x_pos will be 0.007, so the numeric gradient is inaccurate. 
+ # we should avoid this + x[np.abs(x) < 0.005] = 0.02 + out = np.abs(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], ['Out'], check_eager=False) + + +class TestAbsHalf(OpTest): + def setUp(self): + self.op_type = "abs" + self.set_mlu() + self.dtype = 'float16' + self.shape = [7, 9, 13, 19] + + np.random.seed(1024) + x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + # Because we set delta = 0.005 in calculating numeric gradient, + # if x is too small, such as 0.002, x_neg will be -0.003 + # x_pos will be 0.007, so the numeric gradient is inaccurate. + # we should avoid this + x[np.abs(x) < 0.005] = 0.02 + out = np.abs(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], ['Out'], check_eager=False) + + +if __name__ == "__main__": + unittest.main() From 172739d4935c727d9a20c54236ed08691e8f4d1d Mon Sep 17 00:00:00 2001 From: BrilliantYuKaimin <91609464+BrilliantYuKaimin@users.noreply.github.com> Date: Tue, 31 May 2022 17:21:31 +0800 Subject: [PATCH 097/109] test=document_fix Verified (#42919) --- python/paddle/nn/initializer/uniform.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/paddle/nn/initializer/uniform.py b/python/paddle/nn/initializer/uniform.py index cac03b5948071..f07883adbb0ae 100644 --- a/python/paddle/nn/initializer/uniform.py +++ b/python/paddle/nn/initializer/uniform.py @@ -18,19 +18,19 @@ class Uniform(UniformInitializer): - """The random uniform distribution initializer. + """The uniform distribution initializer. Args: - low (float, optional): lower boundary of the uniform distribution. The default value is -1.0. - high (float, optional): upper boundary of the uniform distribution. The default value is 1.0. - name(str, optional): The default value is None. Normally there is no need for user to set this - property. For more information, please refer to :ref:`api_guide_Name`. + low (float, optional): Lower boundary of the uniform distribution. The default value is :math:`-1.0`. + high (float, optional): Upper boundary of the uniform distribution. The default value is :math:`1.0`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: - A parameter initialized by random uniform distribution. + A parameter initialized by uniform distribution. Examples: .. 
code-block:: python + :name: initializer_Uniform-example import paddle From c9e7c407612e3746c4a218344d0b2be8916a7a6f Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 31 May 2022 18:37:45 +0800 Subject: [PATCH 098/109] [Phi] Polish assign kernel copy impl (#43061) * fix assign kernel copy impl * fix test failed --- paddle/phi/kernels/assign_kernel.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index 5ed9d72a503a5..2349bf990acd3 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -26,7 +26,7 @@ template void AssignKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { - Copy(dev_ctx, x, x.place(), false, out); + paddle::framework::TensorCopy(x, x.place(), out); } template From d70e45bc51e607677069d9cf3cc154dac5934bdf Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Tue, 31 May 2022 21:34:59 +0800 Subject: [PATCH 099/109] put set error_code infront to avoid being skipped (#43014) --- paddle/scripts/paddle_build.bat | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 0f70f9a8f3564..2a18d2f7e0195 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -85,6 +85,9 @@ if not defined NEW_RELEASE_JIT set NEW_RELEASE_JIT=OFF set task_name=%1 set UPLOAD_TP_FILE=OFF +set error_code=0 +type %cache_dir%\error_code.txt + rem ------initialize set git config------ git config --global core.longpaths true @@ -118,8 +121,6 @@ if "%WITH_CACHE%"=="OFF" ( goto :mkbuild ) -set error_code=0 -type %cache_dir%\error_code.txt : set /p error_code=< %cache_dir%\error_code.txt if %error_code% NEQ 0 ( rmdir %BUILD_DIR% /s/q From 462ae0054a7be6708d631b888523aed76f376c1a Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Tue, 31 May 2022 21:47:39 +0800 Subject: [PATCH 100/109] [Eager] Fix Full Zero (#43048) * fix full zero * fix full zero * fix full zero * fix full zero * refine * refine * refine --- .../auto_code_generator/eager_generator.cc | 60 ++++++++++++------- .../tests/task_tests/eager_utils_test.cc | 4 +- .../eager/to_static/run_program_op_node.h | 4 +- paddle/fluid/eager/utils.cc | 28 +++------ paddle/fluid/eager/utils.h | 8 +-- 5 files changed, 53 insertions(+), 51 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 3a9bac833d588..817a0de6e0ca9 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -2043,6 +2043,32 @@ static std::string GenerateSingleOpBase( const std::string& ins_name = "ins" + std::to_string(*outs_size); const std::string& outs_name = "outs" + std::to_string(*outs_size); const std::string& attrs_name = "attrs_map" + std::to_string(*outs_size); + const std::string& hooked_grads = "hooked_grads" + std::to_string(*outs_size); + + // [Generation] Get Full Zero + std::string fill_zero_str = ""; + if (ops_to_fill_zero_for_empty_grads.count(fwd_op_type)) { + for (auto iter : grad_ins) { + const std::string& grad_input_name = iter.first; + if (grad_ins_grad_slotname_map.count(grad_input_name)) { + size_t fwd_output_position = fwd_outputs_name_pos_map.at( + grad_ins_grad_slotname_map.at(grad_input_name)); + const char* FILL_ZERO_TEMPLATE = + "egr::EagerUtils::FillZeroForEmptyOptionalGradInput(&grads[%d], 
" + "this->InputMeta()[%d]);\n"; + fill_zero_str += paddle::string::Sprintf( + FILL_ZERO_TEMPLATE, fwd_output_position, fwd_output_position); + } + } + } + generated_grad_function_body += fill_zero_str; + generated_grad_function_body += + " paddle::small_vector, " + "egr::kSlotSmallVectorSize> " + + hooked_grads + + " = " + "GradNode" + + fwd_op_type + "::ApplyGradientHooks(grads);\n"; // [Generation] Get Ins Map std::unordered_set dispensable_input_name_set; @@ -2117,16 +2143,16 @@ static std::string GenerateSingleOpBase( size_t fwd_output_position = fwd_outputs_name_pos_map.at( grad_ins_grad_slotname_map.at(grad_input_name)); const char* GRAD_INS_GRAD_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::TrySyncToVars(hooked_grads[%d]) },"; + "{ \"%s\", egr::EagerUtils::TrySyncToVars(%s[%d]) },"; ins_contents_str += paddle::string::Sprintf( - GRAD_INS_GRAD_CONTENT_TEMPLATE, grad_input_name, fwd_output_position); + GRAD_INS_GRAD_CONTENT_TEMPLATE, grad_input_name, hooked_grads, + fwd_output_position); if (!backward_inplace_map.empty() && backward_inplace_map.count(grad_input_name)) { process_backward_inplace = true; - const char* GRAD_INS_HOOKED_GRAD_TEMPLATE = - "auto& %s = hooked_grads[%d][0];"; + const char* GRAD_INS_HOOKED_GRAD_TEMPLATE = "auto& %s = %s[%d][0];"; std::string hooked_grads_tensor_str = paddle::string::Sprintf( - GRAD_INS_HOOKED_GRAD_TEMPLATE, bwd_inplace_input_name, + GRAD_INS_HOOKED_GRAD_TEMPLATE, bwd_inplace_input_name, hooked_grads, fwd_output_position); const char* GRAD_INS_GRAD_TENSOR_TEMPLATE = "grads[%d][0]"; std::string grads_tensor_str = paddle::string::Sprintf( @@ -2239,10 +2265,10 @@ static std::string GenerateSingleOpBase( const char* GRAD_OUTS_CONTENT_TEMPLATE = " if((!out_metas[%d].empty()) && " "(!(out_metas[%d][0].IsStopGradient()))){ \n %s.insert({ \"%s\", " - "egr::EagerUtils::TrySyncToVars(hooked_grads[%d])});} \n "; + "egr::EagerUtils::TrySyncToVars(%s[%d])});} \n "; outs_contents_str += paddle::string::Sprintf( GRAD_OUTS_CONTENT_TEMPLATE, grads_position, grads_position, - outs_name, grad_output_name, grads_position); + outs_name, grad_output_name, hooked_grads, grads_position); } else { if (dispensable_input_name_set.count(fwd_name) && @@ -2561,9 +2587,6 @@ static std::string GenerateGradNodeCCContents( } const char* BWD_RETURN_TEMPLATE = - " paddle::small_vector, " - "egr::kSlotSmallVectorSize> hooked_grads = " - "GradNode%s::ApplyGradientHooks(grads);\n" " const auto& out_metas = OutputMeta();\n" " paddle::small_vector, " "egr::kSlotSmallVectorSize> outputs(%d);\n" @@ -2571,9 +2594,8 @@ static std::string GenerateGradNodeCCContents( " if(NeedComplexToRealConversion()) " "HandleComplexGradToRealGrad(&outputs);\n" " return outputs;\n"; - generated_grad_function_body = - paddle::string::Sprintf(BWD_RETURN_TEMPLATE, fwd_op_type, in_vars.size(), - generated_grad_function_body); + generated_grad_function_body = paddle::string::Sprintf( + BWD_RETURN_TEMPLATE, in_vars.size(), generated_grad_function_body); // [Generation] Get Full Grad Function const char* GRAD_FUNCTION_TEMPLATE = @@ -2584,17 +2606,9 @@ static std::string GenerateGradNodeCCContents( "egr::kSlotSmallVectorSize>& grads, bool " "create_graph, bool is_new_grad) {\n" "%s" - "%s" "\n}"; - std::string fill_zero_str = ""; - if (ops_to_fill_zero_for_empty_grads.count(fwd_op_type)) { - fill_zero_str = - "egr::EagerUtils::FillZeroForEmptyGradInputs(&grads, " - "this->InputMeta());\n"; - } - std::string grad_function_str = - paddle::string::Sprintf(GRAD_FUNCTION_TEMPLATE, fwd_op_type, - fill_zero_str, 
generated_grad_function_body); + std::string grad_function_str = paddle::string::Sprintf( + GRAD_FUNCTION_TEMPLATE, fwd_op_type, generated_grad_function_body); VLOG(6) << "Generated returns"; diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc index bcb9820419d0f..551262d259e08 100644 --- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc @@ -250,7 +250,7 @@ TEST(EagerUtils, GetGradAccumulationNode) { ASSERT_ANY_THROW(egr::EagerUtils::GetGradAccumulationNode(t0)); } -TEST(EagerUtils, FillZeroForEmptyGradInputs) { +TEST(EagerUtils, FillZeroForEmptyOptionalGradInput) { paddle::small_vector, egr::kSlotSmallVectorSize> grads = {std::vector(1)}; @@ -263,7 +263,7 @@ TEST(EagerUtils, FillZeroForEmptyGradInputs) { slot_metas[0][0].SetTensorMeta(tensor_meta); slot_metas[0][0].SetPlace(phi::CPUPlace()); - EagerUtils::FillZeroForEmptyGradInputs(&grads, slot_metas); + EagerUtils::FillZeroForEmptyOptionalGradInput(&grads[0], slot_metas[0]); eager_test::CompareTensorWithValue(grads[0][0], 0.0); } diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index fe1cdefb7d572..5a730e4dbf164 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -379,8 +379,8 @@ class GradNodeRunProgram : public egr::GradNodeBase { "The hooked_grads.size() of RunProgramGradOp should " "be equal to 1.")); - egr::EagerUtils::FillZeroForEmptyGradInputs(&hooked_grads, - this->InputMeta()); + egr::EagerUtils::FillZeroForEmptyOptionalGradInput(&hooked_grads[0], + this->InputMeta()[0]); VLOG(3) << "hooked_grads[0].size() : " << hooked_grads[0].size(); std::vector x_grad; std::vector params_grad; diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 9ccd91ca65733..7d9554c52eb6c 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -467,26 +467,16 @@ std::shared_ptr EagerUtils::GetGradAccumulationNode( } } -void EagerUtils::FillZeroForEmptyGradInputs( - paddle::small_vector, - kSlotSmallVectorSize>* in_grads, - const paddle::small_vector, kSlotSmallVectorSize>& - grad_in_metas) { +void EagerUtils::FillZeroForEmptyOptionalGradInput( + std::vector* in_grads, + const std::vector& grad_in_metas) { for (size_t i = 0; i < in_grads->size(); i++) { - for (size_t j = 0; j < (*in_grads)[i].size(); j++) { - paddle::experimental::Tensor& grad = (*in_grads)[i][j]; - if (!grad.initialized()) { - const GradSlotMeta& grad_in_meta = grad_in_metas[i][j]; - PADDLE_ENFORCE( - grad_in_meta.HasTensorMeta(), - paddle::platform::errors::Fatal( - "Unable to fill empty grad inputs due to empty GradSlotMeta")); - const auto& tensor_meta = grad_in_meta.GetTensorMeta(); - auto tensor_with_zero = paddle::experimental::full( - phi::vectorize(tensor_meta.dims), 0.0, tensor_meta.dtype, - grad_in_meta.GetPlace()); - grad.set_impl(tensor_with_zero.impl()); - } + paddle::experimental::Tensor& grad = (*in_grads)[i]; + if (!grad.initialized() && grad_in_metas[i].HasTensorMeta()) { + auto tensor_with_zero = paddle::experimental::full( + phi::vectorize(grad_in_metas[i].GetTensorMeta().dims), 0.0, + grad_in_metas[i].GetTensorMeta().dtype, grad_in_metas[i].GetPlace()); + grad.set_impl(tensor_with_zero.impl()); } } } diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index 63baebca53c37..c6389e998315c 100644 --- a/paddle/fluid/eager/utils.h +++ 
b/paddle/fluid/eager/utils.h @@ -236,11 +236,9 @@ class EagerUtils { /** * Fill Zero * **/ - static void FillZeroForEmptyGradInputs( - paddle::small_vector, - kSlotSmallVectorSize>* out_grads, - const paddle::small_vector, - kSlotSmallVectorSize>& grad_out_metas); + static void FillZeroForEmptyOptionalGradInput( + std::vector* in_grads, + const std::vector& grad_in_metas); static void FillZeroForEmptyGradInput(paddle::experimental::Tensor* in_grad, const GradSlotMeta& grad_in_meta); static void FillZeroForEmptyOptionalGradInput( From 941942755d2bc650360dfda1e48cd057c27ecbdc Mon Sep 17 00:00:00 2001 From: pangyoki Date: Tue, 31 May 2022 21:50:43 +0800 Subject: [PATCH 101/109] add double_grad and triple_grad inplace info in backward.yaml (#43124) * add double_grad and triple_grad inplace info in backward.yaml * only generate inplace api in forward --- python/paddle/utils/code_gen/api_base.py | 28 +--------------------- python/paddle/utils/code_gen/api_gen.py | 27 +++++++++++++++++++++ python/paddle/utils/code_gen/backward.yaml | 19 +++++++++++++++ 3 files changed, 47 insertions(+), 27 deletions(-) diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 1638f6afab20c..1f19dec992d2f 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -48,8 +48,7 @@ def __init__(self, api_item_yaml): 'func']) == 1 or not self.kernel['func'][1].endswith( '_sr') else True self.data_transform = self.parse_data_transform(api_item_yaml) - self.inplace_map, self.view_map = self.parse_inplace_and_view( - api_item_yaml) + self.inplace_map, self.view_map = {}, {} def get_api_name(self, api_item_yaml): return api_item_yaml['api'] @@ -303,31 +302,6 @@ def parse_data_transform(self, api_item_yaml): return data_transform - def parse_inplace_and_view(self, api_item_yaml): - inplace_map, view_map = {}, {} - for mode in ['inplace', 'view']: - if mode in api_item_yaml: - if mode == 'inplace': - inplace_map = {} - else: - view_map = {} - in_out_mapping_list = api_item_yaml[mode].split(',') - for item in in_out_mapping_list: - result = re.search(r"(?P\w+)\s*->\s*(?P\w+)", item) - in_val = result.group('in') - out_val = result.group('out') - assert in_val in self.inputs['names'], \ - f"{self.api} : {mode} input error: the input var name('{in_val}') is not found in the input args of {self.api}." - assert out_val in self.outputs['names'], \ - f"{self.api} : {mode} output error: the output var name('{out_val}') is not found in the output args of {self.api}." 
- - if mode == 'inplace': - inplace_map[out_val] = in_val - else: - view_map[out_val] = in_val - - return inplace_map, view_map - # Override by child class def get_return_type(self, inplace_flag=False): return None diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index c0923adf39c46..1721da19295d5 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -30,6 +30,8 @@ def __init__(self, api_item_yaml): super(ForwardAPI, self).__init__(api_item_yaml) self.is_dygraph_api, self.intermediate_outs = self.parse_intermediate( api_item_yaml) + self.inplace_map, self.view_map = self.parse_inplace_and_view( + api_item_yaml) def get_api_func_name(self): if self.is_dygraph_api: @@ -47,6 +49,31 @@ def parse_intermediate(self, api_item_yaml): else: return False, [] + def parse_inplace_and_view(self, api_item_yaml): + inplace_map, view_map = {}, {} + for mode in ['inplace', 'view']: + if mode in api_item_yaml: + if mode == 'inplace': + inplace_map = {} + else: + view_map = {} + in_out_mapping_list = api_item_yaml[mode].split(',') + for item in in_out_mapping_list: + result = re.search(r"(?P\w+)\s*->\s*(?P\w+)", item) + in_val = result.group('in') + out_val = result.group('out') + assert in_val in self.inputs['names'], \ + f"{self.api} : {mode} input error: the input var name('{in_val}') is not found in the input args of {self.api}." + assert out_val in self.outputs['names'], \ + f"{self.api} : {mode} output error: the output var name('{out_val}') is not found in the output args of {self.api}." + + if mode == 'inplace': + inplace_map[out_val] = in_val + else: + view_map[out_val] = in_val + + return inplace_map, view_map + def get_return_type_with_intermediate(self, inplace_flag=False): out_type_list = [] for i, out_type in enumerate(self.outputs['types']): diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 7183d822e15c0..e6e26d7e5ecac 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -56,6 +56,7 @@ func : add_double_grad optional : grad_x_grad, grad_y_grad backward : add_triple_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : add_grad forward : add (Tensor x, Tensor y) -> Tensor(out) @@ -86,6 +87,7 @@ param : [grad_grad_x, grad_grad_y] kernel : func : add_triple_grad + inplace : (grad_grad_out_grad -> grad_grad_x_grad) - backward_api : addmm_grad forward : addmm (Tensor input, Tensor x, Tensor y, float alpha, float beta) -> Tensor(out) @@ -193,6 +195,7 @@ func : batch_norm_grad_grad data_type : x optional : out_mean, out_variance + inplace : (grad_out -> grad_out_grad) - backward_api : batch_norm_grad forward : batch_norm (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) @@ -261,6 +264,7 @@ param : [x, x] kernel : func : celu_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : celu_grad forward : celu(Tensor x, float alpha) -> Tensor(out) @@ -532,6 +536,7 @@ func : divide_double_grad data_type : out optional : grad_x_grad, grad_y_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : divide_grad forward : divide (Tensor x, Tensor y) -> Tensor(out) @@ -596,6 +601,7 @@ param : [x, x] kernel : 
func : elu_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : elu_grad forward : elu (Tensor x, float alpha) -> Tensor(out) @@ -947,6 +953,7 @@ param : [grad_x_grad] kernel : func : leaky_relu_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : leaky_relu_grad forward : leaky_relu (Tensor x, float alpha) -> Tensor(out) @@ -1022,6 +1029,7 @@ param : [x, x] kernel : func : log_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : log_grad forward : log (Tensor x) -> Tensor(out) @@ -1310,6 +1318,7 @@ func : multiply_double_grad optional : grad_x_grad, grad_y_grad backward : multiply_triple_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : multiply_grad forward : multiply (Tensor x, Tensor y) -> Tensor(out) @@ -1557,6 +1566,7 @@ param : [out] kernel : func : relu_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : relu_grad forward : relu (Tensor x) -> Tensor(out) @@ -1580,6 +1590,7 @@ kernel : func : reshape_double_grad no_need_buffer : grad_out + inplace : (grad_x_grad -> grad_out_grad) - backward_api : reshape_grad forward : reshape (Tensor x, IntArray shape) -> Tensor(out), Tensor(xshape) @@ -1654,6 +1665,7 @@ param : [out, out] kernel : func : rsqrt_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : rsqrt_grad forward : rsqrt (Tensor x) -> Tensor(out) @@ -1753,6 +1765,7 @@ kernel : func : sigmoid_double_grad backward : sigmoid_triple_grad + inplace : (grad_x_grad -> fwd_grad_out_grad) - backward_api : sigmoid_grad forward : sigmoid (Tensor x) -> Tensor(out) @@ -1776,6 +1789,7 @@ kernel : func : sigmoid_triple_grad optional : grad_grad_out_grad + inplace : (grad_grad_x -> fwd_grad_out_grad) - backward_api : silu_grad forward : silu (Tensor x) -> Tensor(out) @@ -1859,6 +1873,7 @@ param : [out, out] kernel : func : sqrt_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : sqrt_grad forward : sqrt (Tensor x) -> Tensor(out) @@ -1881,6 +1896,7 @@ param : [x, x] kernel : func : square_double_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : square_grad forward : square (Tensor x) -> Tensor(out) @@ -1946,6 +1962,7 @@ func : subtract_double_grad optional : grad_x_grad, grad_y_grad no_need_buffer : y, grad_out + inplace : (grad_x_grad -> grad_out_grad) - backward_api : subtract_grad forward : subtract (Tensor x, Tensor y) -> Tensor(out) @@ -2027,6 +2044,7 @@ kernel : func : tanh_double_grad backward : tanh_triple_grad + inplace : (grad_x_grad -> grad_out_grad) - backward_api : tanh_grad forward : tanh (Tensor x) -> Tensor(out) @@ -2060,6 +2078,7 @@ param : [out, out, grad_x_grad_forward] kernel : func : tanh_triple_grad + inplace : (grad_x_grad_forward -> grad_out_forward_grad) - backward_api : thresholded_relu_grad forward : thresholded_relu (Tensor x, float threshold) -> Tensor(out) From 4b89120bf55e48cdc78ceca8c7dadcf349b14060 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Tue, 31 May 2022 16:13:41 +0200 Subject: [PATCH 102/109] Remove mkldnn attributes from base ops (#42852) * remove attrs from base op * fix typos * remove brelu * undo removing code related to matmul * remove whitespaces * undo changes in matmul * remove empty line --- .../framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc | 8 -------- paddle/fluid/operators/compat/conv2d.pbtxt | 9 --------- paddle/fluid/operators/compat/depthwise_conv2d.pbtxt | 9 --------- paddle/fluid/operators/conv_op.cc | 8 -------- .../contrib/slim/quantization/quant2_int8_mkldnn_pass.py 
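For reference, each new inplace entry above declares that a gradient output may reuse the buffer of the listed gradient input. Below is a minimal sketch of how such an annotation string is turned into an output-to-input map, using the same regex as parse_inplace_and_view in api_gen.py; the helper name is illustrative and not part of the patch.

import re

def parse_inplace_entry(entry):
    # e.g. entry = "(grad_x_grad -> grad_out_grad)" from backward.yaml;
    # each "in -> out" pair says the output named `out` may reuse the
    # storage of the input named `in`.
    inplace_map = {}
    for item in entry.strip("() ").split(","):
        result = re.search(r"(?P<in>\w+)\s*->\s*(?P<out>\w+)", item)
        inplace_map[result.group("out")] = result.group("in")
    return inplace_map

print(parse_inplace_entry("(grad_x_grad -> grad_out_grad)"))
# {'grad_out_grad': 'grad_x_grad'}

Per the commit note, inplace API variants themselves are now generated only for forward APIs; the backward entries above just record which gradient buffers a kernel may reuse.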
| 7 ------- .../contrib/slim/tests/test_quant2_int8_mkldnn_pass.py | 6 +----- .../tests/unittests/mkldnn/test_conv2d_mkldnn_op.py | 2 -- 7 files changed, 1 insertion(+), 48 deletions(-) diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc index 55470db312f81..63e402cb52983 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -489,14 +489,6 @@ void QuantDequantMkldnnPass::UpdateActivations(ir::Graph* graph) const { std::string activation; if (op_desc->GetAttrIfExists("fuse_relu")) { activation = "relu"; - } else if (op_desc->GetAttrIfExists("fuse_brelu")) { - activation = "relu6"; - float alpha = 6.0; - if (op_desc->HasAttr("fuse_brelu_threshold")) { - alpha = BOOST_GET_CONST(float, - op_desc->GetAttr("fuse_brelu_threshold")); - } - op_node->Op()->SetAttr("fuse_alpha", alpha); } op_node->Op()->SetAttr("fuse_activation", activation); } diff --git a/paddle/fluid/operators/compat/conv2d.pbtxt b/paddle/fluid/operators/compat/conv2d.pbtxt index ca07d4a36ff3c..8de061a3cc2f6 100644 --- a/paddle/fluid/operators/compat/conv2d.pbtxt +++ b/paddle/fluid/operators/compat/conv2d.pbtxt @@ -77,14 +77,6 @@ extra { name: "fuse_relu" type: BOOLEAN } - attrs { - name: "fuse_brelu" - type: BOOLEAN - } - attrs { - name: "fuse_brelu_threshold" - type: FLOAT - } attrs { name: "fuse_activation" type: STRING @@ -134,4 +126,3 @@ extra { type: BOOLEAN } } - diff --git a/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt b/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt index ded143986159f..1fbb99c03e833 100644 --- a/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt +++ b/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt @@ -69,14 +69,6 @@ extra { name: "fuse_relu" type: BOOLEAN } - attrs { - name: "fuse_brelu" - type: BOOLEAN - } - attrs { - name: "fuse_brelu_threshold" - type: FLOAT - } attrs { name: "fuse_activation" type: STRING @@ -126,4 +118,3 @@ extra { type: BOOLEAN } } - diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 405794783812b..f084862b419d5 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -348,14 +348,6 @@ void Conv2DOpMaker::Make() { AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false) .AsExtra(); - AddAttr("fuse_brelu", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false) - .AsExtra(); - AddAttr("fuse_brelu_threshold", - "(float, default false 6.0) Only used in mkldnn kernel") - .SetDefault(6.0f) - .AsExtra(); AddAttr("fuse_activation", "(string, default \"\") Only used in mkldnn kernel") .SetDefault("") diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index e543bc1e17b2c..348d914943521 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -376,13 +376,6 @@ def _update_activations(self, graph): activation = "" if op.op().has_attr("fuse_relu") and op.op().attr("fuse_relu"): activation = "relu" - elif op.op().has_attr("fuse_brelu") and op.op().attr( - "fuse_brelu"): - activation = "relu6" - alpha = 6.0 - if op.op().has_attr("fuse_brelu_threshold"): - alpha = op.op().attr("fuse_brelu_threshold") - op.set_attr("fuse_alpha", alpha) 
op.set_attr("fuse_activation", activation) return graph diff --git a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py index f0dae081dd48f..04e1decd4af68 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py @@ -177,8 +177,7 @@ def prepare_program_conv2d(self, program): 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, 'use_mkldnn': self.use_mkldnn, - 'data_format': self.data_format, - 'fuse_brelu': True + 'data_format': self.data_format }) def remove_fuse_activation_attribute(self, graph): @@ -196,9 +195,6 @@ def check_graph_after_pass(self, graph): self.assertTrue(op.op().has_attr("fuse_activation")) if op.op().has_attr("fuse_relu") and op.op().attr("fuse_relu"): self.assertTrue(op.op().attr("fuse_activation") == "relu") - if op.op().has_attr("fuse_brelu") and op.op().attr( - "fuse_brelu"): - self.assertTrue(op.op().attr("fuse_activation") == "relu6") def test_quant_update_activation(self): program = fluid.Program() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py index 487a69807e2b0..39f55fb45b87b 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py @@ -61,7 +61,6 @@ def setUp(self): self.fuse_activation = "" self.fuse_alpha = 0 self.fuse_beta = 0 - self.fuse_brelu_threshold = 6.0 self.fuse_residual_connection = False self.input_residual_size = None @@ -99,7 +98,6 @@ def setUp(self): self.attrs['fuse_activation'] = self.fuse_activation self.attrs['fuse_alpha'] = self.fuse_alpha self.attrs['fuse_beta'] = self.fuse_beta - self.attrs['fuse_brelu_threshold'] = self.fuse_brelu_threshold self.attrs['fuse_residual_connection'] = self.fuse_residual_connection self.outputs['Output'] = output From 5f2c251c75b11b6bb311a68482a9bd7fe5107d83 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Wed, 1 Jun 2022 10:03:49 +0800 Subject: [PATCH 103/109] [Yaml]add conv3d, depthwise_conv2d yaml (#42807) * add conv3d yaml * add conv3d_grad, conv3d_double_grad * add final_state_conv3d test case * add conv3d double test case * add depthwise_conv2d grad yaml * add depthwise_conv2d double grad test case * modify the order of args * add depthwise_conv2d_grad_grad config --- .../final_state_generator/codegen_utils.py | 3 +- paddle/phi/api/lib/api_custom_impl.cc | 203 ++++++++++++++++++ paddle/phi/api/lib/api_custom_impl.h | 27 +++ paddle/phi/kernels/conv_grad_grad_kernel.h | 10 +- .../phi/kernels/cpu/conv_grad_grad_kernel.cc | 10 +- .../kernels/gpudnn/conv_grad_grad_kernel.cu | 10 +- paddle/phi/ops/compat/conv3d_sig.cc | 4 +- .../tests/unittests/test_conv3d_layer.py | 15 +- .../tests/unittests/test_conv_nn_grad.py | 69 ++++++ python/paddle/nn/functional/conv.py | 29 +++ python/paddle/utils/code_gen/api.yaml | 17 ++ python/paddle/utils/code_gen/backward.yaml | 38 ++++ 12 files changed, 414 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 8467a6d7dfb6a..57681be58ae47 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -30,7 +30,8 @@ 
"divide_double_grad", "log_double_grad", "elu_double_grad", "leaky_relu_double_grad", "sqrt_double_grad", "rsqrt_double_grad", "square_double_grad", "celu_double_grad", "pad_double_grad", - "pad3d_double_grad", "squeeze_double_grad", "unsqueeze_double_grad" + "pad3d_double_grad", "squeeze_double_grad", "unsqueeze_double_grad", + "conv3d_double_grad", "depthwise_conv2d_grad_grad" ]) # For API dispatch used at python-level diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 14746abf59494..3ef7763d57e8b 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -531,6 +531,108 @@ Tensor conv2d_impl(const Tensor& input, return api_output; } +Tensor conv3d_impl(const Tensor& input, + const Tensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search) { + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + + kernel_data_type = ParseDataType(input); + + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + auto kernel_key_set = ParseKernelKeyByInputArgs(input, filter); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + + VLOG(6) << "conv3d API kernel key: [" << kernel_backend << ", " + << kernel_layout << ", " << kernel_data_type << "]"; + const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "conv3d", {kernel_backend, kernel_layout, kernel_data_type}, true); + VLOG(6) << "conv3d API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + + phi::TensorArgDef args0 = kernel.InputAt(0); + phi::TensorArgDef args1 = kernel.InputAt(1); + if (kernel_backend == Backend::GPU) { + args0.backend = Backend::GPU; + args1.backend = Backend::GPU; + } + + auto input_input = PrepareData(input, args0, {}); + auto input_filter = PrepareData(filter, args1, {}); + + Tensor api_output; + auto kernel_out = SetKernelOutput(kernel_backend, &api_output); + phi::MetaTensor meta_out(kernel_out); + + phi::ConvInferMeta(MakeMetaTensor(*input_input), + MakeMetaTensor(*input_filter), + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + &meta_out); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const std::vector&, + const std::vector&, + const std::string&, + int, + const std::vector&, + const std::string&, + bool, + int, + bool, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + + { + (*kernel_fn)(*dev_ctx, + *input_input, + *input_filter, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + kernel_out); + } + + return api_output; +} + void conv2d_grad_impl(const Tensor& input, const Tensor& filter, const Tensor& out_grad, @@ -632,6 +734,107 @@ void 
conv2d_grad_impl(const Tensor& input, } } +void conv3d_grad_impl(const Tensor& input, + const Tensor& filter, + const Tensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + Tensor* input_grad, + Tensor* filter_grad) { + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + auto kernel_key_set = ParseKernelKeyByInputArgs(input, filter, out_grad); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + + VLOG(6) << "conv3d_grad API kernel key: [" << kernel_backend << ", " + << kernel_layout << ", " << kernel_data_type << "]"; + const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "conv3d_grad", {kernel_backend, kernel_layout, kernel_data_type}, true); + VLOG(6) << "conv3d_grad API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); + + phi::TensorArgDef args0 = kernel.InputAt(0); + phi::TensorArgDef args1 = kernel.InputAt(1); + phi::TensorArgDef args2 = kernel.InputAt(2); + if (kernel_backend == Backend::GPU) { + args0.backend = Backend::GPU; + args1.backend = Backend::GPU; + args2.backend = Backend::GPU; + } + + auto input_input = PrepareData(input, args0, {}); + auto input_filter = PrepareData(filter, args1, {}); + auto input_out_grad = PrepareData(out_grad, args2, {}); + + auto kernel_out_0 = SetKernelOutput(kernel_backend, input_grad); + auto kernel_out_1 = SetKernelOutput(kernel_backend, filter_grad); + phi::MetaTensor meta_out_0(kernel_out_0); + phi::MetaTensor meta_out_1(kernel_out_1); + + phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_input), + MakeMetaTensor(*input_filter), + kernel_out_0 ? &meta_out_0 : nullptr, + kernel_out_1 ? 
&meta_out_1 : nullptr); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const std::vector&, + const std::vector&, + const std::string&, + int, + const std::vector&, + const std::string&, + bool, + int, + bool, + phi::DenseTensor*, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + + { + (*kernel_fn)(*dev_ctx, + *input_input, + *input_filter, + *input_out_grad, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + kernel_out_0, + kernel_out_1); + } +} + Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) { Tensor out; copy(x, place, blocking, &out); diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index f700345f4699d..22c5d193a2bcd 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -96,6 +96,18 @@ Tensor conv2d_impl(const Tensor& input, int workspace_size_MB, bool exhaustive_search); +Tensor conv3d_impl(const Tensor& input, + const Tensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search); + Tensor copy_to_impl(const Tensor& x, Place place, bool blocking); Tensor embedding_impl(const Tensor& x, @@ -148,6 +160,21 @@ void conv2d_grad_impl(const Tensor& input, Tensor* input_grad, Tensor* filter_grad); +void conv3d_grad_impl(const Tensor& input, + const Tensor& filter, + const Tensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + Tensor* input_grad, + Tensor* filter_grad); + void imag_grad_impl(const Tensor& out_grad, Tensor* x_grad); void embedding_grad_impl(const Tensor& x, diff --git a/paddle/phi/kernels/conv_grad_grad_kernel.h b/paddle/phi/kernels/conv_grad_grad_kernel.h index 799c8721c3cff..f25cbe384c213 100644 --- a/paddle/phi/kernels/conv_grad_grad_kernel.h +++ b/paddle/phi/kernels/conv_grad_grad_kernel.h @@ -40,11 +40,11 @@ void ConvGradGradKernel(const Context& dev_ctx, template void Conv3DGradGradKernel(const Context& dev_ctx, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const DenseTensor& out_grad, const DenseTensor& input, const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const std::vector& strides, const std::vector& paddings, const std::string& paddding_algorithm, @@ -54,8 +54,8 @@ void Conv3DGradGradKernel(const Context& dev_ctx, bool use_addto, int workspace_size_MB, bool exhaustive_search, - DenseTensor* out_grad_grad, DenseTensor* input_grad, - DenseTensor* filter_grad); + DenseTensor* filter_grad, + DenseTensor* out_grad_grad); } // namespace phi diff --git a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc index c52f2614150d8..4538ccf9433f9 100644 --- a/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/conv_grad_grad_kernel.cc @@ -21,11 +21,11 @@ namespace phi { template void Conv3DGradGradKernel(const Context& ctx, - const 
paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const DenseTensor& out_grad, const DenseTensor& input, const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, @@ -35,9 +35,9 @@ void Conv3DGradGradKernel(const Context& ctx, bool use_addto, int workspace_size_MB, bool exhaustive_search_t, - DenseTensor* out_grad_grad, DenseTensor* input_grad, - DenseTensor* filter_grad) { + DenseTensor* filter_grad, + DenseTensor* out_grad_grad) { ConvGradGradKernel(ctx, input, filter, diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu index b396e8fa6b0eb..53e4c39d8bcee 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu @@ -711,11 +711,11 @@ void DepthwiseConvCudnnGradGradKernel( template void Conv3DCudnnGradGradKernel( const Context& ctx, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const DenseTensor& out_grad, const DenseTensor& input, const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, @@ -725,9 +725,9 @@ void Conv3DCudnnGradGradKernel( bool use_addto, int workspace_size_MB, bool exhaustive_search_t, - DenseTensor* out_grad_grad, DenseTensor* input_grad, - DenseTensor* filter_grad) { + DenseTensor* filter_grad, + DenseTensor* out_grad_grad) { ConvCudnnGradGradKernel(ctx, input, filter, diff --git a/paddle/phi/ops/compat/conv3d_sig.cc b/paddle/phi/ops/compat/conv3d_sig.cc index c6aae1bf5bb54..49f31288d00f6 100644 --- a/paddle/phi/ops/compat/conv3d_sig.cc +++ b/paddle/phi/ops/compat/conv3d_sig.cc @@ -49,7 +49,7 @@ KernelSignature Conv3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature Conv3dDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("conv3d_grad_grad", - {"DDInput", "DDFilter", "DOutput", "Input", "Filter"}, + {"Input", "Filter", "DOutput", "DDInput", "DDFilter"}, {"strides", "paddings", "padding_algorithm", @@ -59,7 +59,7 @@ KernelSignature Conv3dDoubleGradOpArgumentMapping( "use_addto", "workspace_size_MB", "exhaustive_search"}, - {"DDOutput", "DInput", "DFilter"}); + {"DInput", "DFilter", "DDOutput"}); } } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py index 707991352fa5e..dd6dcf6d5e9ae 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py @@ -17,6 +17,8 @@ import paddle.fluid.dygraph as dg import paddle.nn.functional as F import paddle.fluid.initializer as I +import paddle +from paddle.fluid.framework import _test_eager_guard import unittest @@ -134,7 +136,8 @@ def functional(self, place): return y_np def paddle_nn_layer(self): - x_var = dg.to_variable(self.input) + x_var = paddle.to_tensor(self.input) + x_var.stop_gradient = False conv = nn.Conv3D( self.num_channels, self.num_filters, @@ -148,17 +151,23 @@ def paddle_nn_layer(self): if not self.no_bias: conv.bias.set_value(self.bias) y_var = conv(x_var) + y_var.backward() y_np = y_var.numpy() - return y_np + t1 = x_var.gradient() 
+ return y_np, t1 def _test_equivalence(self, place): place = fluid.CPUPlace() result1 = self.fluid_layer(place) result2 = self.functional(place) with dg.guard(place): - result3 = self.paddle_nn_layer() + result3, g1 = self.paddle_nn_layer() + with _test_eager_guard(): + res_eager, g2 = self.paddle_nn_layer() np.testing.assert_array_almost_equal(result1, result2) np.testing.assert_array_almost_equal(result2, result3) + self.assertTrue(np.allclose(result3, res_eager)) + self.assertTrue(np.allclose(g1, g2)) def runTest(self): place = fluid.CPUPlace() diff --git a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py index 784d89b93f985..5bff8b3142106 100644 --- a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py @@ -503,6 +503,75 @@ def test_grad(self): self.func(p) +class TestDepthWiseConvDoubleGradCheckCase1(unittest.TestCase): + def depthwise_conv2d_wrapper(self, x): + return paddle.nn.functional.conv2d(x[0], x[1], groups=4) + + @prog_scope() + def func(self, place): + x_shape = [2, 4, 3, 3] + w_shape = [4, 1, 3, 3] + eps = 0.005 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 + x = layers.data('x', x_shape, False, dtype) + w = layers.data('w', w_shape, False, dtype) + + # condition of depthwise conv: + # use_cudnn == False + # groups == filters + # num_filters % num_channels == 0 + + y = paddle.nn.functional.conv2d(x, w, groups=4) + x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype) + w_arr = np.random.uniform(-1, 1, w_shape).astype(dtype) + + gradient_checker.double_grad_check( + [x, w], y, x_init=[x_arr, w_arr], place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.depthwise_conv2d_wrapper, [x, w], + y, + x_init=[x_arr, w_arr], + place=place) + + def test_grad(self): + places = [] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestConv3DDoubleGradCheck_NN(unittest.TestCase): + def conv3d_wrapper(self, x): + return paddle.nn.functional.conv3d(x[0], x[1]) + + @prog_scope() + def func(self, place): + x_shape = [2, 3, 8, 8, 8] + w_shape = [6, 3, 3, 3, 3] + eps = 0.005 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 + x = layers.data('x', x_shape, False, dtype) + w = layers.data('w', w_shape, False, dtype) + x.persistable = True + w.persistable = True + y = paddle.nn.functional.conv3d(x, w) + x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype) + w_arr = np.random.uniform(-1, 1, w_shape).astype(dtype) + + gradient_checker.double_grad_check( + [x, w], y, x_init=[x_arr, w_arr], place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.conv3d_wrapper, [x, w], y, x_init=[x_arr, w_arr], place=place) + + def test_grad(self): + places = [] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 6c7f09091ff3c..419014daf64e4 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -138,6 +138,35 @@ def _conv_nd(x, return _C_ops.final_state_add(pre_bias, bias) else: return pre_bias + + if in_dygraph_mode() and op_type == "depthwise_conv2d": + pre_bias = _C_ops.final_state_depthwise_conv2d( + x, weight, stride, padding, padding_algorithm, groups, 
dilation, + data_format, False, -1, False, False) + if bias is not None: + channel_dim = channel_dim + len( + x.shape) if channel_dim < 0 else channel_dim + tmp_bias = _C_ops.final_state_reshape( + bias, bias.shape + + [1 for i in range(len(x.shape) - channel_dim - 1)]) + return _C_ops.final_state_add(pre_bias, tmp_bias) + else: + return pre_bias + + if in_dygraph_mode() and op_type == "conv3d": + pre_bias = _C_ops.final_state_conv3d( + x, weight, stride, padding, padding_algorithm, groups, dilation, + data_format, False, -1, False) + if bias is not None: + channel_dim = channel_dim + len( + x.shape) if channel_dim < 0 else channel_dim + tmp_bias = _C_ops.final_state_reshape( + bias, bias.shape + + [1 for i in range(len(x.shape) - channel_dim - 1)]) + return _C_ops.final_state_add(pre_bias, tmp_bias) + else: + return pre_bias + if in_dynamic_mode(): attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation, 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index c3a8e68ca7b0b..44865940adb44 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -407,6 +407,12 @@ use_gpudnn : true backward : conv2d_transpose_grad +- api : conv3d + args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + output : Tensor + invoke : conv3d_impl(input, filter, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search) + backward : conv3d_grad + - api : conv3d_transpose args : (Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) output : Tensor(out) @@ -492,6 +498,17 @@ optional : mask backward : deformable_conv_grad +- api : depthwise_conv2d + args : (Tensor x, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) + output : Tensor(out) + invoke : conv2d_impl(x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search) + backward : depthwise_conv2d_grad + # infer_meta : + # func : ConvTransposeInferMeta + # prams: [x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search] + # kernel : + # func : depthwise_conv2d + - api : depthwise_conv2d_transpose args : (Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) output : Tensor(out) diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index e6e26d7e5ecac..d6c148e6ca925 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -392,6 +392,25 @@ use_gpudnn : true backward : conv2d_transpose_double_grad +- backward_api : conv3d_grad + forward : conv3d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(out) + args : (Tensor input, Tensor filter, Tensor out_grad, int[] 
strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + output : Tensor(input_grad), Tensor(filter_grad) + invoke : conv3d_grad_impl(input, filter, out_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, input_grad, filter_grad) + backward : conv3d_grad_grad + +- backward_api : conv3d_grad_grad + forward : conv3d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(grad_input), Tensor(grad_filter) + args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param: [input, filter, grad_out] + kernel : + func : conv3d_grad_grad + use_gpudnn : true + optional : grad_input_grad, grad_filter_grad + - backward_api : conv3d_transpose_grad forward : conv3d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) args : (Tensor x, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) @@ -475,6 +494,25 @@ data_type : x optional : mask +- backward_api : depthwise_conv2d_grad + forward : depthwise_conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) -> Tensor(out) + args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) + output : Tensor(input_grad), Tensor(filter_grad) + invoke : conv2d_grad_impl(input, filter, out_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, input_grad, filter_grad) + backward : depthwise_conv2d_grad_grad + +- backward_api : depthwise_conv2d_grad_grad + forward : depthwise_conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) -> Tensor(grad_input), Tensor(grad_filter) + args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param: [input, filter, grad_out] + kernel : + func : conv2d_grad_grad + use_gpudnn : true + optional : grad_input_grad, grad_filter_grad + - backward_api : depthwise_conv2d_transpose_grad forward : 
depthwise_conv2d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) args : (Tensor x, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) From 010aba33ee5655555ce1e9bf92e9596828d446ae Mon Sep 17 00:00:00 2001 From: Yulong Ao Date: Wed, 1 Jun 2022 10:18:26 +0800 Subject: [PATCH 104/109] [Auto Parallel] Add miscellaneous improvements (#43108) * [Auto Parallel] Add the parallel tuner * [Auto Parallel] Improve the parallel tuner and fix some bugs * upodate cost model * update import Resharder by dist op * update cost model * fix comp cost bug * update cost model * [Auto Parallel] Amend the dist attr for #processses=1 * update cost model and tuner * update cost model and tuner * update cost model and tuner * update cluster * update reshard * [Auto Parallel] Add the estimation from the cost model * [Auto Parallel] Reimplement the backup and restore functions * [Auto Parallel] Fix the bugs of the parallel tuner * [Auto Parallel] Update the engine api and dist context * [Auto Parallel] Work around the high order grad problem * [Auto Parallel] Add some miscellaneous improvements * [Auto Parallel] Add a unittest for DistributedContext Co-authored-by: caozhou --- .../distributed/auto_parallel/completion.py | 77 +++-- .../auto_parallel/dist_attribute.py | 49 +-- .../distributed/auto_parallel/dist_context.py | 303 +++++++++++++----- .../distributed/auto_parallel/dist_tensor.py | 7 +- .../distributed/auto_parallel/engine.py | 34 +- .../auto_parallel/operators/__init__.py | 2 +- .../auto_parallel/operators/common.py | 4 +- .../auto_parallel/operators/dist_default.py | 4 +- .../auto_parallel/operators/dist_pnorm.py | 3 +- .../auto_parallel/parallelizer_v2.py | 6 +- .../distributed/auto_parallel/planner_v2.py | 13 +- .../paddle/distributed/auto_parallel/utils.py | 11 +- .../unittests/auto_parallel/CMakeLists.txt | 1 + .../auto_parallel/test_dist_context.py | 204 ++++++++++++ .../auto_parallel/test_dist_slice.py | 3 +- .../auto_parallel/test_while_op_completion.py | 2 +- .../auto_parallel/test_while_op_partition.py | 2 +- 17 files changed, 574 insertions(+), 151 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 03996ec350da4..465c450c0b076 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -20,7 +20,7 @@ from paddle.fluid import framework from .utils import print_program_with_dist_attr -from .operators import find_best_compatible_distributed_operator_impl +from .operators import find_compatible_distributed_operator_impls from .dist_context import get_default_distributed_context, _node_id from .dist_tensor import DistributedTensor from .dist_op import DistributedOperator @@ -238,13 +238,17 @@ def _update_op_node_dims_mapping(self, op_node, fwd=True): tensor_desc.name()) compatible_dims_mapping = compute_compatible_dims_mapping( [op_dims_mapping, tensor_dims_mapping]) + if not _validate_dims_mapping( + compatible_dims_mapping, + op_dist_attr.process_mesh): + continue if (compatible_dims_mapping is not None) and \ (compatible_dims_mapping != op_dims_mapping): op_dist_attr.set_input_dims_mapping( 
tensor_desc.name(), compatible_dims_mapping) changed = True # Find the most compatible implemenetations from the distributed operator - op_dist_impls = find_best_compatible_distributed_operator_impl( + op_dist_impls = find_compatible_distributed_operator_impls( dist_op, fwd=True) if op_dist_impls is not None: not_compatible = True @@ -254,7 +258,8 @@ def _update_op_node_dims_mapping(self, op_node, fwd=True): dim_changed = op_dist_impl.update_dims_mapping(dist_op) if dim_changed: changed = True - if op_dist_impl.is_auto_compatible(dist_op): + if op_dist_impl.is_auto_compatible(dist_op) \ + and dist_op.validate_dist_attr(): if op_dist_impl.type == "elementwise": op_dist_attr.impl_type = "default" else: @@ -289,13 +294,17 @@ def _update_op_node_dims_mapping(self, op_node, fwd=True): tensor_desc.name()) compatible_dims_mapping = compute_compatible_dims_mapping( [op_dims_mapping, tensor_dims_mapping]) + if not _validate_dims_mapping( + compatible_dims_mapping, + op_dist_attr.process_mesh): + continue if (compatible_dims_mapping is not None) and \ (compatible_dims_mapping != op_dims_mapping): op_dist_attr.set_output_dims_mapping( tensor_desc.name(), compatible_dims_mapping) changed = True # Find the most compatible implemenetations from the distributed operator - op_dist_impls = find_best_compatible_distributed_operator_impl( + op_dist_impls = find_compatible_distributed_operator_impls( dist_op, fwd=False) if op_dist_impls is not None: not_compatible = True @@ -305,8 +314,8 @@ def _update_op_node_dims_mapping(self, op_node, fwd=True): dim_changed = op_dist_impl.update_dims_mapping(dist_op) if dim_changed: changed = True - if op_dist_impl.is_auto_compatible(dist_op): - not_compatible = False + if op_dist_impl.is_auto_compatible(dist_op) \ + and dist_op.validate_dist_attr(): if op_dist_impl.type == "elementwise": op_dist_attr.impl_type = "default" else: @@ -352,6 +361,23 @@ def _update_dims_mapping_between_graphs(self): changed = True return changed + def _update_dims_mapping_for_special(self): + # Set the dims_mapping of a tensor to the dims_mapping inside the op which produces it + op_nodes = self._dist_context._serial_ordered_op_nodes + for op_node in op_nodes: + op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node) + for tensor_node in op_node.outputs: + if tensor_node.is_var() and tensor_node.var() is not None: + if tensor_node.var().type() == core.VarDesc.VarType.READER: + continue + tensor_desc = tensor_node.var() + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + if op_dist_attr.process_mesh == tensor_dist_attr.process_mesh: + op_dims_mapping = op_dist_attr.get_output_dims_mapping( + tensor_desc.name()) + tensor_dist_attr.dims_mapping = op_dims_mapping + def _update_dims_mapping(self): # Complete dims_mapping for each node reach_fix_point = False @@ -378,6 +404,7 @@ def _update_dims_mapping(self): reach_fix_point = False else: reach_fix_point = True + self._update_dims_mapping_for_special() def _update_process_mesh_by_nearest(self, op_node, nearest_op_node): op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node) @@ -685,7 +712,7 @@ def _update_process_mesh(self): # Step 3: adjust the process meshes for special ops self._update_process_mesh_for_specials() - # Step 4: adjust the process meshes between graphs + # Step 4: adjust the process meshes between graphs self._update_process_mesh_between_graphs() def _prepare(self): @@ -727,14 +754,14 @@ def complete_forward_annotation(self, serial_main_program=None): """ Complete 
annotation for the partial annotated serial_main_program. Arguments: serial_main_program: partial annotated serial_main_program. - Returns: + Returns:e serial_main_program: completed annotated serial_main_program. """ if serial_main_program is None: serial_main_program = self._dist_context.serial_main_program else: - self._dist_context.serial_main_program = serial_main_program + self._dist_context._serial_main_program = serial_main_program self._dist_context.initialize() @@ -757,13 +784,18 @@ def complete_forward_annotation(self, serial_main_program=None): return serial_main_program - def _complete_high_order_grad_annotation(self, serial_main_program): + def _complete_high_order_grad_annotation(self, serial_main_program=None): """ NOTE: [HighOrderGrad] Complete the annotation of vars and ops only for high order gradient. This function is temporary to support high order gradient, and will be removed in the future. """ + if serial_main_program is None: + serial_main_program = self._dist_context.serial_main_program + else: + self._dist_context._serial_main_program = serial_main_program + def _is_grad_var_name(name): if "@GRAD" in name: return True @@ -917,12 +949,13 @@ def _get_op_by_id(ops, id): self._dist_context.set_op_dist_attr_for_program( grad_op, grad_op_dist_attr) - def complete_backward_annotation(self, serial_main_program): + def complete_backward_annotation(self, serial_main_program=None): """Complete the annotation of vars and ops in the backward phase for parallel program.""" + if serial_main_program is None: serial_main_program = self._dist_context.serial_main_program else: - self._dist_context.serial_main_program = serial_main_program + self._dist_context._serial_main_program = serial_main_program def _is_grad_var_name(name): if "@GRAD" in name: @@ -1032,6 +1065,9 @@ def _get_op_by_id(ops, id): grad_op_dist_attr.process_mesh = ref_mesh self._dist_context.set_op_dist_attr_for_program( grad_op, grad_op_dist_attr) + grad_op_dist_attr.impl_type = fwd_op_dist_attr.impl_type + grad_op_dist_attr.impl_idx = fwd_op_dist_attr.impl_idx + continue fwd_op_dist_attr = self._dist_context.get_op_dist_attr_for_program( @@ -1078,6 +1114,8 @@ def _get_op_by_id(ops, id): grad_op_dist_attr.set_output_dims_mapping(output_name, ref_dims_mapping) + grad_op_dist_attr.impl_type = fwd_op_dist_attr.impl_type + grad_op_dist_attr.impl_idx = fwd_op_dist_attr.impl_idx self._dist_context.set_op_dist_attr_for_program( grad_op, grad_op_dist_attr) @@ -1111,6 +1149,8 @@ def _get_op_by_id(ops, id): var_name, ref_fwd_dims_mapping) grad_op_dist_attr.set_output_dims_mapping( output_name, ref_fwd_dims_mapping) + grad_op_dist_attr.impl_type = "default" + grad_op_dist_attr.impl_idx = 0 elif grad_op.type == 'fill_zeros_like': ref_var_name = grad_op.input_arg_names[0] @@ -1142,12 +1182,13 @@ def _get_op_by_id(ops, id): self._dist_context.set_op_dist_attr_for_program( grad_op, grad_op_dist_attr) - def complete_update_annotation(self, serial_main_program=None): + def complete_update_annotation(self, serial_main_program): """Complete the annotation of vars and ops in the update phase for parallel program.""" - if serial_main_program is None: - serial_main_program = self._dist_context.serial_main_program - else: - self._dist_context.serial_main_program = serial_main_program + + # Notice: serial_main_program is actually a dist_main_program of current rank, + # and must be passed into this function. + # TODO: We should fix this behavior. 
+ ops = list(serial_main_program.global_block().ops) vars = serial_main_program.global_block().vars learning_rate_completed = False @@ -1304,7 +1345,7 @@ def _init_global_mesh_for_program(self): dist_op.dist_attr.process_mesh = world_ranks # Find the most compatible implemenetations from the distributed operator - op_dist_impls = find_best_compatible_distributed_operator_impl( + op_dist_impls = find_compatible_distributed_operator_impls( dist_op, fwd=True) if op_dist_impls is not None: backup_op_dist_attr = copy.deepcopy(dist_op.dist_attr) diff --git a/python/paddle/distributed/auto_parallel/dist_attribute.py b/python/paddle/distributed/auto_parallel/dist_attribute.py index 6fa5b756c75c3..3dbdb79f48541 100644 --- a/python/paddle/distributed/auto_parallel/dist_attribute.py +++ b/python/paddle/distributed/auto_parallel/dist_attribute.py @@ -132,15 +132,17 @@ def init(self, dist_attr): key, dist_attr) self._is_annotated = copy.deepcopy(dist_attr._is_annotated) - # def reset(self, skip_dist_attr_field_names): - # if skip_dist_attr_field_names is not None \ - # and "process_mesh" not in skip_dist_attr_field_names: - # self._process_mesh = None - # if skip_dist_attr_field_names is not None \ - # and "dims_mapping" not in skip_dist_attr_field_names: - # for i in enumerate(self._dims_mapping): - # self._dims_mapping[i] = -1 - # self._is_annotated = {} + def reset(self, skip_dist_attr_field_names=None): + if skip_dist_attr_field_names is None or \ + (skip_dist_attr_field_names is not None \ + and "process_mesh" not in skip_dist_attr_field_names): + self._process_mesh = None + if skip_dist_attr_field_names is None or \ + (skip_dist_attr_field_names is not None \ + and "dims_mapping" not in skip_dist_attr_field_names): + for i, _ in enumerate(self._dims_mapping): + self._dims_mapping[i] = -1 + self._is_annotated = {} def is_annotated(self, dist_attr_field_name): return self._is_annotated.get(dist_attr_field_name, False) @@ -272,6 +274,9 @@ def set_input_dist_attr(self, name, dist_attr): dist_attr_object.init(dist_attr) self._inputs_dist_attrs[name] = dist_attr_object + # def del_input_dist_attr(self, name): + # del self._inputs_dist_attrs[name] + def get_output_dist_attr(self, name): return self._outputs_dist_attrs.get(name, None) @@ -280,6 +285,9 @@ def set_output_dist_attr(self, name, dist_attr): dist_attr_object.init(dist_attr) self._outputs_dist_attrs[name] = dist_attr_object + # def del_output_dist_attr(self, name): + # del self._inputs_dist_attrs[name] + def get_input_dims_mapping(self, name): input_dist_attr = self.get_input_dist_attr(name) if input_dist_attr: @@ -374,17 +382,18 @@ def init(self, dist_attr): "ProcessMeshes in DistributedOperator must be the same." 
self.process_mesh = shared_process_mesh - # def reset(self, skip_dist_attr_field_names): - # for tensor_dist_attr in self.inputs_dist_attrs.values(): - # tensor_dist_attr.reset(skip_dist_attr_field_names) - # for tensor_dist_attr in self.outputs_dist_attrs.values(): - # tensor_dist_attr.reset(skip_dist_attr_field_names) - # if skip_dist_attr_field_names is not None \ - # and "process_mesh" not in skip_dist_attr_field_names: - # self.process_mesh = None - # self.impl_type = "default" - # self.impl_idx = 0 - # self._is_annotated = {} + def reset(self, skip_dist_attr_field_names=None): + for tensor_dist_attr in self.inputs_dist_attrs.values(): + tensor_dist_attr.reset(skip_dist_attr_field_names) + for tensor_dist_attr in self.outputs_dist_attrs.values(): + tensor_dist_attr.reset(skip_dist_attr_field_names) + if skip_dist_attr_field_names is None or \ + (skip_dist_attr_field_names is not None \ + and "process_mesh" not in skip_dist_attr_field_names): + self._process_mesh = None + self.impl_type = "default" + self.impl_idx = 0 + self._is_annotated = {} def is_annotated(self, attr_name): return self._is_annotated.get(attr_name, False) diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index a47ef66ee848a..6a38b53cf2c10 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -57,33 +57,30 @@ def __init__(self, serial_startup_prog=None, serial_optimizer=None, serial_loss=None, - feed_vars=None, - fetch_vars=None, + feed_vars={}, + fetch_vars={}, + cluster=None, strategy=None): # Data members related to original programs (unchanged) self._original_serial_main_program = serial_main_prog self._original_serial_startup_program = serial_startup_prog + self._original_serial_optimizer = serial_optimizer self._original_serial_loss = serial_loss + self._original_serial_feed_vars = feed_vars + self._original_serial_fetch_vars = fetch_vars self._original_serial_optimizer = serial_optimizer - if self._original_serial_main_program is None: - self._original_serial_main_program = paddle.fluid.default_main_program( - ) - if self._original_serial_startup_program is None: - self._original_serial_startup_program = paddle.fluid.default_startup_program( - ) # Data members related to programs (changed) self._serial_main_program = None self._serial_startup_program = None - self._serial_loss = serial_loss - self._serial_optimizer = serial_optimizer - self._serial_feed_vars = feed_vars - self._serial_fetch_vars = fetch_vars + self._serial_loss = None + self._serial_optimizer = None + self._serial_feed_vars = {} + self._serial_fetch_vars = {} # Data members related to the program self._dist_tensors_for_program = {} self._dist_ops_for_program = {} - self._block_state = BlockState() # Data members related to the graph self._serial_graph = None @@ -96,24 +93,30 @@ def __init__(self, # Distributed programs self._dist_main_programs = {} self._dist_startup_programs = {} + self._dist_op_context = DistributedOperatorContext() + self._process_meshes = [] - # Distributed Strategy + self._cluster = cluster self._strategy = strategy # Pass Context self._pass_context = PassContext() - - # Distributed Operator Context - self._dist_op_context = DistributedOperatorContext() + self._block_state = BlockState() # Other data members - self._process_meshes = [] self._serial_ordered_tensor_nodes = [] self._serial_ordered_op_nodes = [] self._serial_ordered_nodes = [] # 
self._tensor_id_to_tensor_node_ids = {} self._is_initialized = False + self._need_copy_dist_attr_to_graph = False + self._backup_pass_context_stack = [] + self._backup_block_state_stack = [] + self._backup_dist_tensors_for_program_stack = [] + self._backup_dist_ops_for_program_stack = [] + self._backup_serial_main_program_stack = [] + self._backup_serial_startup_program_stack = [] # flag whether scale gradient with dp size self._gradient_scale = True @@ -122,13 +125,6 @@ def __init__(self, def serial_main_program(self): return self._serial_main_program - @serial_main_program.setter - def serial_main_program(self, program): - # if self._serial_main_program: - # print("WARNING: The program attached to this distributed context will be replaced by the new one.") - self._original_serial_main_program = program - self._serial_main_program = program - @property def serial_startup_program(self): return self._serial_startup_program @@ -149,6 +145,18 @@ def serial_feed_vars(self): def serial_fetch_vars(self): return self._serial_fetch_vars + @property + def dist_main_programs(self): + return self._dist_main_programs + + @property + def dist_startup_programs(self): + return self._dist_startup_programs + + @property + def cluster(self): + return self._cluster + @property def strategy(self): return self._strategy @@ -177,14 +185,6 @@ def dist_op_context(self): def block_state(self): return self._block_state - @property - def dist_main_programs(self): - return self._dist_main_programs - - @property - def dist_startup_programs(self): - return self._dist_startup_programs - @property def has_annotation(self): return len(self._dist_tensors_for_program) or len( @@ -198,21 +198,168 @@ def gradient_scale(self): def gradient_scale(self, gs): self._gradient_scale = gs - def initialize(self): - if not self._is_initialized: + def _backup_serial_info(self, mode): + self._backup_serial_main_program_stack.append( + self._serial_main_program.clone()) + self._backup_serial_startup_program_stack.append( + self._serial_startup_program.clone()) + self._backup_pass_context_stack.append( + copy.deepcopy(self._pass_context)) + self._backup_block_state_stack.append(copy.deepcopy(self._block_state)) + + def _backup_dist_info(self, mode): + self._backup_dist_tensors_for_program_stack.append( + copy.deepcopy(self._dist_tensors_for_program)) + self._backup_dist_ops_for_program_stack.append( + copy.deepcopy(self._dist_ops_for_program)) + + def _backup(self, serial=True, serial_mode=None, dist=True, dist_mode=None): + # Use this function carefully + if serial: + self._backup_serial_info(serial_mode) + if dist: + self._backup_dist_info(dist_mode) + + def _restore_serial_info(self, mode="to_backup"): + if mode == "to_backup": + self._serial_main_program = self._backup_serial_main_program_stack.pop( + ) + self._serial_startup_program = self._backup_serial_startup_program_stack.pop( + ) + elif mode == "to_original": + assert self._original_serial_main_program is not None + assert self._original_serial_startup_program is not None self._serial_main_program = self._original_serial_main_program.clone( ) self._serial_startup_program = self._original_serial_startup_program.clone( ) - # self._serial_main_program = self._original_serial_main_program - # self._serial_startup_program = self._original_serial_startup_program - if self._original_serial_loss: - self._serial_loss = self._serial_main_program.global_block( - ).vars[self._original_serial_loss[0].name] + + self._serial_optimizer = self._original_serial_optimizer + + if 
self._original_serial_loss: + if isinstance(self._original_serial_loss, list): + assert len(self._original_serial_loss) == 1 + loss = self._original_serial_loss[0] + block_idx = loss.block.idx + var_name = loss.name + var = self._serial_main_program.blocks[ + block_idx]._var_recursive(var_name) + self._serial_loss = var else: - self._serial_loss = self._original_serial_loss - self._serial_optimizer = self._original_serial_optimizer + block_idx = self._original_serial_loss.block.idx + var_name = self._original_serial_loss.name + var = self._serial_main_program.blocks[ + block_idx]._var_recursive(var_name) + self._serial_loss = var + + for key, var_list in self._original_serial_feed_vars.items(): + new_var_list = [] + for var in var_list: + block_idx = var.block.idx + var_name = var.name + var = self._serial_main_program.blocks[ + block_idx]._var_recursive(var_name) + new_var_list.append(var) + self._serial_feed_vars[key] = new_var_list + + for key, var_list in self._original_serial_fetch_vars.items(): + new_var_list = [] + for var in var_list: + block_idx = var.block.idx + var_name = var.name + var = self._serial_main_program.blocks[ + block_idx]._var_recursive(var_name) + new_var_list.append(var) + self._serial_fetch_vars[key] = new_var_list + + self._pass_context = self._backup_pass_context_stack.pop() + self._block_state = self._backup_block_state_stack.pop() + + def _restore_dist_info(self, mode="to_backup"): + if mode == "to_backup": + self._dist_tensors_for_program = self._backup_dist_tensors_for_program_stack.pop( + ) + self._dist_ops_for_program = self._backup_dist_ops_for_program_stack.pop( + ) + elif mode == "to_original": + assert self._original_dist_tensors_for_program + assert self._original_dist_ops_for_program + self._dist_tensors_for_program = copy.deepcopy( + self._original_dist_tensors_for_program) + self._dist_ops_for_program = copy.deepcopy( + self._original_dist_ops_for_program) + elif mode == "to_default": + new_tensors_ids = [] + for tensor_id, dist_tensor in self._dist_tensors_for_program.items( + ): + if tensor_id in self._tensors_ids: + dist_tensor.dist_attr.reset() + else: + new_tensors_ids.append(tensor_id) + for tensor_id in new_tensors_ids: + self._dist_tensors_for_program.pop(tensor_id) + new_ops_ids = [] + for op_id, dist_op in self._dist_ops_for_program.items(): + if op_id in self._ops_ids: + dist_op.dist_attr.reset() + else: + new_ops_ids.append(op_id) + for op_id in new_ops_ids: + self._dist_ops_for_program.pop(op_id) + else: + new_tensors_ids = [] + for tensor_id, dist_tensor in self._dist_tensors_for_program.items( + ): + new_tensors_ids.append(tensor_id) + for tensor_id in new_tensors_ids: + self._dist_tensors_for_program.pop(tensor_id) + new_ops_ids = [] + for op_id, dist_op in self._dist_ops_for_program.items(): + new_ops_ids.append(op_id) + for op_id in new_ops_ids: + self._dist_ops_for_program.pop(op_id) + self._dist_main_programs = {} + self._dist_startup_programs = {} + self._dist_op_context = DistributedOperatorContext() + self._need_copy_dist_attr_to_graph = True + self._process_meshes = [] + + def _restore(self, + serial=True, + serial_mode="to_backup", + dist=True, + dist_mode="to_backup"): + # Use this function carefully + if serial: + self._restore_serial_info(serial_mode) + if dist: + self._restore_dist_info(dist_mode) + + def initialize(self): + if not self._is_initialized: + if not self._serial_main_program: + self._serial_main_program = self._original_serial_main_program + if not self._serial_startup_program: + 
self._serial_startup_program = self._original_serial_startup_program + if not self._serial_loss: + if isinstance(self._original_serial_loss, list): + assert len(self._original_serial_loss) == 1 + self._serial_loss = self._original_serial_loss[0] + else: + self._serial_loss = self._original_serial_loss + if not self._serial_optimizer: + self._serial_optimizer = self._original_serial_optimizer + if not self._serial_feed_vars: + self._serial_feed_vars = self._original_serial_feed_vars + if not self._serial_fetch_vars: + self._serial_fetch_vars = self._original_serial_fetch_vars + self._init_dist_attr_for_program() + # Backup the original distributed information for later restore + self._original_dist_tensors_for_program = copy.deepcopy( + self._dist_tensors_for_program) + self._original_dist_ops_for_program = copy.deepcopy( + self._dist_ops_for_program) self._tensors_ids = list(self._dist_tensors_for_program.keys()) self._ops_ids = list(self._dist_ops_for_program.keys()) set_flags({"FLAGS_convert_all_blocks": True}) @@ -220,41 +367,9 @@ def initialize(self): core.Graph(self._serial_main_program.desc)) self._init_dist_attr_for_graph() self._is_initialized = True - - # def reset(self, - # skip_dist_tensors=None, - # skip_dist_ops=None, - # skip_tensor_dist_attr_fields=None, - # skip_op_dist_attr_fields=None): - # self._serial_main_program = self._original_serial_main_program.clone() - # self._serial_startup_program = self._original_serial_startup_program.clone() - # new_tensors_ids = [] - # for tensor_id, dist_tensor in self._dist_tensors_for_program.items(): - # if tensor_id in self._tensors_ids: - # dist_tensor.dist_attr.reset(skip_tensor_dist_attr_fields) - # else: - # new_tensors_ids.append(tensor_id) - # for tensor_id in new_tensors_ids: - # self._dist_tensors_for_program.pop(tensor_id) - # new_ops_ids = [] - # for op_id, dist_op in self._dist_ops_for_program.items(): - # if op_id in self._ops_ids: - # dist_op.dist_attr.reset(skip_op_dist_attr_fields) - # else: - # new_ops_ids.append(op_id) - # for op_id in new_ops_ids: - # self._dist_ops_for_program.pop(op_id) - - # self.copy_dist_attr_from_program_to_graph() - - # self._dist_main_programs = {} - # self._dist_startup_programs = {} - - # self._pass_context = PassContext() - - # self._dist_op_context = DistributedOperatorContext() - - # self._process_meshes = [] + self._need_copy_dist_attr_to_graph = False + if self._need_copy_dist_attr_to_graph: + self.copy_dist_attr_from_program_to_graph() def add_process_mesh(self, process_mesh): assert isinstance(process_mesh, ProcessMesh), \ @@ -423,6 +538,10 @@ def _init_dist_attr_for_program(self, no_default=False): if current_dist_op is None: dist_op = DistributedOperator(op) self.add_dist_op_for_program(dist_op) + self._original_dist_tensors_for_program = copy.deepcopy( + self._dist_tensors_for_program) + self._original_dist_ops_for_program = copy.deepcopy( + self._dist_ops_for_program) def _order_nodes_by_program_order(self): def _contains(nodes, target_node): @@ -592,7 +711,7 @@ def copy_dist_attr_from_graph_to_program(self): op_dist_attr_for_graph = self.get_op_dist_attr_for_graph(node) dist_op_for_program = self._dist_ops_for_program[op_id] dist_op_for_program.dist_attr = op_dist_attr_for_graph - # TODO: the completion algorithm will skip orphan tensors, + # TODO: the completion algorithm will skipped orphan tensors, # here we just set there process_mesh to the first one. 
for orphan_node in self._serial_orphan_tensor_nodes: serial_tensor_id = orphan_node.var().id() @@ -618,16 +737,21 @@ def amend_dist_attr_for_program(self): tensor_shape = serial_tensor.shape dims_mapping = dist_attr.dims_mapping process_mesh_shape = dist_attr.process_mesh.topology + process_mesh_processes = dist_attr.process_mesh.processes # If the dimension of tensor is less than the sharding dimension of process mesh, # we just amend the dimension mapping to -1. (Is this really OK?) for i in range(len(tensor_shape)): if dims_mapping[i] != -1 and tensor_shape[i] > 0 \ and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]: dims_mapping[i] = -1 + if dims_mapping[i] != -1 and len(process_mesh_processes) == 1: + dims_mapping[i] = -1 for dist_op in self._dist_ops_for_program.values(): serial_op = dist_op.serial_op dist_attr = dist_op.dist_attr + process_mesh_shape = dist_attr.process_mesh.topology + process_mesh_processes = dist_attr.process_mesh.processes for arg_name in serial_op.input_arg_names: if dist_op.get_serial_input(arg_name) is None: tensor_shape = [] @@ -639,13 +763,15 @@ def amend_dist_attr_for_program(self): else: tensor_shape = dist_op.get_serial_input(arg_name).shape dims_mapping = dist_attr.get_input_dims_mapping(arg_name) - process_mesh_shape = dist_attr.process_mesh.topology # If the dimension of tensor is less than the sharding dimension of process mesh, # we just amend the dimension mapping to -1. (Is this really OK?) for i in range(len(tensor_shape)): if dims_mapping[i] != -1 and tensor_shape[i] > 0 \ and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]: dims_mapping[i] = -1 + if dims_mapping[i] != -1 and len( + process_mesh_processes) == 1: + dims_mapping[i] = -1 for arg_name in serial_op.output_arg_names: if dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.READER \ or dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ @@ -654,13 +780,18 @@ def amend_dist_attr_for_program(self): else: tensor_shape = dist_op.get_serial_output(arg_name).shape dims_mapping = dist_attr.get_output_dims_mapping(arg_name) - process_mesh_shape = dist_attr.process_mesh.topology # If the dimension of tensor is less than the sharding dimension of process mesh, # we just amend the dimension mapping to -1. (Is this really OK?) 
for i in range(len(tensor_shape)): if dims_mapping[i] != -1 and tensor_shape[i] > 0 \ and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]: dims_mapping[i] = -1 + if dims_mapping[i] != -1 and len( + process_mesh_processes) == 1: + dims_mapping[i] = -1 + if len(process_mesh_processes) == 1: + dist_op.dist_attr.impl_type = "default" + dist_op.dist_attr.impl_idx = 0 def validate_dist_attr_for_program(self): if not self._is_initialized: @@ -674,16 +805,20 @@ def validate_dist_attr_for_program(self): dist_tensor.serial_tensor.name) if (dist_tensor is not None) and ( not dist_tensor.validate_dist_attr()): - assert False, "Tensor {} has a wrong distributed attributes {}.".format( - dist_tensor.serial_tensor.name, dist_tensor.dist_attr) + assert False, "Tensor {} (id: {}, original_id: {}) has a wrong distributed attributes {}.".format( + dist_tensor.serial_tensor.name, + dist_tensor.desc.id(), + dist_tensor.desc.original_id(), dist_tensor.dist_attr) for op in block.ops: dist_op = self.get_dist_op_for_program(op) assert dist_op is not None, \ "Operator {} does not have a distributed attribute.".format( dist_op.serial_op.type) if (dist_op is not None) and (not dist_op.validate_dist_attr()): - assert False, "Operator {} has a wrong distributed attributes {}.".format( - dist_op.serial_op.type, dist_op.dist_attr) + assert False, "Operator {} (id: {}, original_id: {}) has a wrong distributed attributes {} .".format( + dist_op.serial_op.type, + dist_op.serial_op.desc.id(), + dist_op.serial_op.desc.original_id(), dist_op.dist_attr) return True def __deepcopy__(self, memo): diff --git a/python/paddle/distributed/auto_parallel/dist_tensor.py b/python/paddle/distributed/auto_parallel/dist_tensor.py index a42ce863492b3..e3f06da275182 100644 --- a/python/paddle/distributed/auto_parallel/dist_tensor.py +++ b/python/paddle/distributed/auto_parallel/dist_tensor.py @@ -41,7 +41,7 @@ def _validate_sizes_and_dist_attr(sizes, rank=None, shard_sizes=None): if not (isinstance(sizes, (list, tuple)) and - all(map(lambda x: isinstance(x, int) and x > 0, sizes))): + all(map(lambda x: isinstance(x, int) and x >= 0, sizes))): raise ValueError( "The sizes must be list or tuple and item in sizes must be non-negative integer, but got {}". 
format(sizes)) @@ -79,8 +79,11 @@ def get_local_sizes(global_sizes, local_sizes = [] # for even sharding, the local sizes of every rank are equal + for idx, item in enumerate(global_sizes): - if dims_mapping[idx] == -1: + # This is a trick to avoid dims_mapping is [] + val = dims_mapping[idx] if idx < len(dims_mapping) else -1 + if val == -1: local_sizes.append(item) else: local_sizes.append(item // topology[dims_mapping[idx]]) diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index c38953ca9e64d..ab9391cf66fdb 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -31,10 +31,11 @@ from paddle.fluid.framework import Operator from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.dygraph.parallel import ParallelEnv +from paddle.distributed import fleet from paddle.distributed.utils import get_logger from paddle.distributed.passes import new_pass, PassContext -from .cluster import Cluster +# from .cluster import Cluster, get_default_cluster from .planner_v2 import Planner from .parallelizer_v2 import Parallelizer from .dist_op import DistributedOperator @@ -57,7 +58,11 @@ def __init__(self, self.inputs_spec = self._validate_spec(inputs_spec) self.labels_spec = self._validate_spec(labels_spec) self.cluster = cluster + # if self.cluster is None: + # self.cluster = get_default_cluster() self.strategy = strategy + if self.strategy is None: + self.strategy = fleet.DistributedStrategy() self._executor = None self._cur_rank = paddle.distributed.get_rank() @@ -69,11 +74,11 @@ def __init__(self, self._orig_main_prog = fluid.default_main_program() self._orig_startup_prog = fluid.default_startup_program() self._orig_dist_context = get_default_distributed_context() + self._dist_contexts = {} self._serial_main_progs = {} self._serial_startup_progs = {} self._dist_main_progs = defaultdict(dict) # dist main programs self._dist_startup_progs = defaultdict(dict) # dist startup programs - self._dist_contexts = {} self._feed_vars = {} self._fetch_vars = {} @@ -104,11 +109,17 @@ def prepare(self, parallelizer.parallel(self._cur_rank) else: parallelizer.parallel_all() - # Get the distributed main programs and startup programs + # Get the current content from the distributed context + self._serial_main_progs[mode] = self._dist_contexts[ + mode].serial_main_program + self._serial_startup_progs[mode] = self._dist_contexts[ + mode].serial_startup_program self._dist_main_progs[mode] = self._dist_contexts[ mode].dist_main_programs self._dist_startup_progs[mode] = self._dist_contexts[ mode].dist_startup_programs + self._feed_vars[mode] = self._dist_contexts[mode].serial_feed_vars + self._fetch_vars[mode] = self._dist_contexts[mode].serial_fetch_vars # Init comm and startup program self._initialize(mode) @@ -135,20 +146,23 @@ def _build(self, mode): inputs = [self._set_data_parallel(var) for var in inputs] labels = [self._set_data_parallel(var) for var in labels] - self._feed_vars[mode] = {"inputs": inputs, "labels": labels} + # self._feed_vars[mode] = {"inputs": inputs, "labels": labels} + feed_vars = {"inputs": inputs, "labels": labels} - self._fetch_vars[mode] = { + # self._fetch_vars[mode] = { + # "outputs": flatten(outputs), + # "loss": losses, + # "metrics": metrics + # } + fetch_vars = { "outputs": flatten(outputs), "loss": losses, "metrics": metrics } - self._serial_main_progs[mode] = serial_main_prog - self._serial_startup_progs[mode] = 
serial_startup_prog self._dist_contexts[mode] = DistributedContext( - self._serial_main_progs[mode], self._serial_startup_progs[mode], - self._optimizer, losses, self._feed_vars[mode], - self._fetch_vars[mode], self.strategy) + serial_main_prog, serial_startup_prog, self._optimizer, losses, + feed_vars, fetch_vars, self.cluster, self.strategy) self._dist_contexts[mode].gradient_scale = self._gradient_scale def _initialize(self, mode): diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index 3ff474697205e..295e3557df27d 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -16,7 +16,7 @@ from .common import DistributedOperatorImpl from .common import register_distributed_operator_impl_container from .common import register_distributed_operator_impl -from .common import find_best_compatible_distributed_operator_impl +from .common import find_compatible_distributed_operator_impls from . import dist_embedding from . import dist_matmul from . import dist_reshape diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index 441eb88a9f1ee..6b3c655f293bd 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -157,9 +157,7 @@ def register_distributed_operator_impl(op_type, dist_impl): assert False, "Must register distributed operator registry first." -def find_best_compatible_distributed_operator_impl(dist_op, - fwd=True, - partial=True): +def find_compatible_distributed_operator_impls(dist_op, fwd=True, partial=True): """ Here just return the first compatible implemention. This will be improved by cost model in the future. 
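
Note (illustrative, not part of the patch): with the rename above, callers look up compatible distributed operator implementations via find_compatible_distributed_operator_impls rather than the old find_best_compatible_distributed_operator_impl, as the completion pass earlier in this series already does. A minimal sketch of the call shape, assuming a DistributedOperator instance dist_op has already been constructed elsewhere (its construction is omitted and hypothetical; only the import path and signature come from this patch):

    from paddle.distributed.auto_parallel.operators import (
        find_compatible_distributed_operator_impls,
    )

    def pick_first_compatible_impl(dist_op):
        # Query the registered distributed operator implementations that are
        # compatible with this DistributedOperator in the forward direction.
        impls = find_compatible_distributed_operator_impls(dist_op, fwd=True)
        # Return the first compatible impl, or None if nothing matches.
        return impls[0] if impls else None
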
diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py index 6d9b48ea1e87c..78f30422e742f 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_default.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -187,7 +187,7 @@ def is_auto_compatible(self, dist_op): for arg_name in op_desc.input_arg_names(): serial_tensor = dist_op.get_serial_input(arg_name) dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if serial_tensor.is_parameter: + if serial_tensor is not None and serial_tensor.is_parameter: for mapping in dims_mapping: if mapping != -1: return False @@ -217,7 +217,7 @@ def is_auto_compatible(self, dist_op): for arg_name in op_desc.output_arg_names(): serial_tensor = dist_op.get_serial_output(arg_name) dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) - if serial_tensor.is_parameter: + if serial_tensor is not None and serial_tensor.is_parameter: for mapping in dims_mapping: if mapping != -1: return False diff --git a/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py b/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py index 89cd2c9d9e41a..4d52e5a94beb1 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py @@ -22,7 +22,6 @@ from .common import register_distributed_operator_impl from .common import set_comm_op_dist_attr_for_program from .dist_default import DistributedDefaultImpl0 -from ..reshard import Resharder from ..process_group import new_process_group from ..utils import is_dim_shard, is_dim_replicate, _get_corresponding_rank from ..utils import compute_compatible_dim_mapping, set_dist_op_desc_original_id, _get_comm_group @@ -324,6 +323,8 @@ def backward(ctx, *args, **kwargs): process_mesh_shape = op_dist_attr.process_mesh.topology process_mesh_group = op_dist_attr.process_mesh.processes dims_mapping = [0] + [-1 for _ in range(len(new_X_grad.shape) - 1)] + from ..reshard import Resharder + partition_idx = Resharder.compute_partition_index( rank_id, new_X_grad.shape, dims_mapping, process_mesh_shape, process_mesh_group) diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/parallelizer_v2.py index 4d73632761026..218513323dffb 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/parallelizer_v2.py @@ -35,7 +35,7 @@ def __init__(self, mode, completer, dist_context): self._mode = mode self._completer = completer self._dist_context = dist_context - self._dist_context.initialize() + assert self._dist_context._is_initialized self._pass_context = self._dist_context.pass_context self._strategy = self._dist_context.strategy @@ -43,7 +43,9 @@ def parallel_all(self): world_process_group = get_world_process_group() all_ranks = world_process_group.ranks for rank in all_ranks: + # self._dist_context._backup(serial=True, dist=True) self.parallel(rank) + # self._dist_context._restore(serial=True, dist=True) def parallel(self, rank): serial_main_program = self._dist_context.serial_main_program @@ -58,6 +60,7 @@ def parallel(self, rank): self._apply_pre_optimization(serial_main_program, serial_startup_program, serial_loss, serial_optimizer, params_grads) + # Do logical partition partitioner = Partitioner(self._dist_context, rank) dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( 
@@ -85,7 +88,6 @@ def parallel(self, rank): resharder = Resharder(dist_main_prog, dist_startup_prog, rank, self._dist_context, [], 1) resharder.reshard() - # Clone program for test if self._mode != 'train': dist_main_prog = dist_main_prog.clone(for_test=True) diff --git a/python/paddle/distributed/auto_parallel/planner_v2.py b/python/paddle/distributed/auto_parallel/planner_v2.py index 7db17e98d07ee..3625a25d74e0e 100755 --- a/python/paddle/distributed/auto_parallel/planner_v2.py +++ b/python/paddle/distributed/auto_parallel/planner_v2.py @@ -16,6 +16,8 @@ from .dist_context import get_default_distributed_context from .utils import print_program_with_dist_attr +# from .tuner.parallel_tuner import ParallelTuner + class Planner: def __init__(self, mode, dist_context): @@ -24,19 +26,28 @@ def __init__(self, mode, dist_context): # NOTE: [HighOrderGrad]. There are grad ops in forward phase, and it need # dependency of backward-forward ops in forward completion. + # TODO: The id mapping will be lost if we clone the original program. default_ctx = get_default_distributed_context() self._dist_context._dist_op_context = default_ctx.dist_op_context self._dist_context.initialize() self._completer = Completer(self._dist_context) + self._strategy = dist_context.strategy + # if self._strategy.auto_search: + # self._parallel_tuner = ParallelTuner( + # self._dist_context, mode=self._mode) + @property def completer(self): return self._completer def plan(self): self._completer.complete_forward_annotation() + # if self._strategy.auto_search: + # self._parallel_tuner.tune() + # else: + # self._completer.complete_forward_annotation() # parse forward sub block self._dist_context.block_state.parse_forward_blocks( self._dist_context.serial_main_program) - # TODO: add the auto searcher diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index fbe3a43a7917a..42d90b0d4d619 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -324,10 +324,13 @@ def _get_corresponding_rank(dist_context, target_mesh, rank): mesh.processes.index(rank)) break - assert coordinate is not None, "could NOT found rank [{}] in any registered mesh".format( - rank) - return target_mesh.processes[_coordinate2linear_idx(mesh.topology, - coordinate)] + # assert coordinate is not None, "could NOT found rank [{}] in any registered mesh".format( + # rank) + if coordinate is not None: + return target_mesh.processes[_coordinate2linear_idx(mesh.topology, + coordinate)] + else: + return target_mesh.processes[0] def _get_unshard_dist_shape(var, dist_attr): diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 346939fb5ce28..381461130ed5c 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -31,4 +31,5 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_cluster MODULES test_cluster ENVS ${dist_ENVS}) py_test_modules(test_comm_cost MODULES test_comm_cost ENVS ${dist_ENVS}) py_test_modules(test_comp_cost MODULES test_comp_cost ENVS ${dist_ENVS}) + py_test_modules(test_dist_context MODULES test_dist_context ENVS ${dist_ENVS}) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py new file mode 100644 index 
0000000000000..f7718e584f5e1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py @@ -0,0 +1,204 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import os +import json + +import paddle +import numpy as np +import paddle.nn as nn +import paddle.utils as utils +import paddle.static as static +import paddle.nn.functional as F + +from paddle.distributed import fleet +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.dist_context import DistributedContext +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr + +paddle.enable_static() + +batch_size = 4 +hidden_size = 1024 +sequence_len = 512 +_g_process_mesh = [[0, 1], [2, 3]] + + +def get_random_inputs_and_labels(input_shape, label_shape): + input = np.random.random(size=input_shape).astype('float32') + label = np.random.random(size=label_shape).astype('float32') + return input, label + + +def batch_generator_creator(): + def __reader__(): + for _ in range(batch_size): + batch_input, batch_label = get_random_inputs_and_labels( + [batch_size, sequence_len, hidden_size], + [batch_size, sequence_len, 1]) + yield batch_input, batch_label + + return __reader__ + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + param_initializer = nn.initializer.Normal( + mean=0.0, std=initializer_range) + + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + self.linear0 = nn.Linear( + d_model, + dim_feedforward, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + self.linear1 = nn.Linear( + dim_feedforward, + d_model, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + + def forward(self, input): + out = self.norm(input) + auto.shard_tensor( + self.linear0.weight, + dist_attr={ + "process_mesh": _g_process_mesh[0], + "dims_mapping": [-1, 0] + }) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + auto.shard_tensor( + self.linear1.weight, + dist_attr={ + "process_mesh": _g_process_mesh[1], + "dims_mapping": [0, -1] + }) + out = self.linear1(out) + + return out + + +def get_program(): + dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True + # fleet.init(is_collective=True, strategy=dist_strategy) + + train_program = static.Program() + start_program = static.Program() + with static.program_guard(train_program, start_program): + # input + input = static.data( + name="input", + shape=[batch_size, sequence_len, hidden_size], + dtype='float32') + label = static.data( + name="label", shape=[batch_size, sequence_len, 1], dtype='float32') + data_holder = [input, label] + # dataloader + dataloader = paddle.io.DataLoader.from_generator( + feed_list=data_holder, capacity=4 * batch_size, iterable=False) + 
dataloader.set_batch_generator( + batch_generator_creator(), places=paddle.static.cuda_places()) + # data dist_attr + auto.shard_tensor( + input, + dist_attr={ + "process_mesh": _g_process_mesh[0], + "dims_mapping": [0, -1, -1] + }) + auto.shard_tensor( + label, + dist_attr={ + "process_mesh": _g_process_mesh[0], + "dims_mapping": [0, -1, -1] + }) + + mlp_start = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + pred = mlp_start(input) + + mlp_mid = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + pred = mlp_mid(pred) + + mlp_end = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + pred = mlp_end(pred) + + error_cost = paddle.nn.functional.square_error_cost(pred, label) + loss = paddle.mean(error_cost) + + optimizer = paddle.optimizer.Adam( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + + feed_vars = {"inputs": [input], "labels": [label]} + fetch_vars = {"loss": [loss]} + + return train_program, start_program, dataloader, loss, optimizer, feed_vars, fetch_vars + + +class TestDistributedContext(unittest.TestCase): + def test_backup_restore(self): + train_program, start_program, dataloader, loss, optimizer, feed_vars, fetch_vars = get_program( + ) + dist_context = DistributedContext(train_program, start_program, + optimizer, loss, feed_vars, + fetch_vars) + dist_context.initialize() + + dist_context._backup(serial=True, dist=True) + dist_context._restore( + serial=True, + serial_mode="to_backup", + dist=True, + dist_mode="to_backup") + + dist_context._backup(serial=True, dist=True) + dist_context._restore( + serial=True, + serial_mode="to_original", + dist=True, + dist_mode="to_original") + + dist_context._backup(serial=True, dist=True) + dist_context._restore(serial=True, dist=True, dist_mode="to_default") + + dist_context._backup(serial=True, dist=True) + dist_context._restore(serial=True, dist=True, dist_mode="to_nothing") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py index aa0bf719fab29..8af055a09a343 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py @@ -94,7 +94,8 @@ def test_dist_slice_serial(self): ops = dist_main_prog.global_block().ops for op in ops: op_dist_attr = dist_context.get_op_dist_attr_for_program(op) - assert op_dist_attr.impl_type == "slice" + # We amend this impl_type after completion + assert op_dist_attr.impl_type == "default" for out in op.output_arg_names: var_dims_mapping = op_dist_attr.get_output_dims_mapping(out) ref_dims_mapping = [-1 for i in range(len(var_dims_mapping))] diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py index 1179fd9a9f088..9989f5bbdc605 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py @@ -27,7 +27,7 @@ from paddle.distributed.auto_parallel.utils import make_data_unshard from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute 
from paddle.distributed.auto_parallel.dist_context import DistributedContext, get_default_distributed_context -from paddle.distributed.auto_parallel.operators import find_best_compatible_distributed_operator_impl +from paddle.distributed.auto_parallel.operators import find_compatible_distributed_operator_impls from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py index 894bed7108a1d..d296d9433302d 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py @@ -28,7 +28,7 @@ from paddle.distributed.auto_parallel.utils import make_data_unshard from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute from paddle.distributed.auto_parallel.dist_context import DistributedContext, get_default_distributed_context -from paddle.distributed.auto_parallel.operators import find_best_compatible_distributed_operator_impl +from paddle.distributed.auto_parallel.operators import find_compatible_distributed_operator_impls from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr paddle.enable_static() From bd01836016137dc9564f6c26bf4fb5c3b19ff950 Mon Sep 17 00:00:00 2001 From: caozhou <48191911+Caozhou1995@users.noreply.github.com> Date: Wed, 1 Jun 2022 10:22:06 +0800 Subject: [PATCH 105/109] add some comp op costs (#43114) --- .../auto_parallel/cost/comp_op_cost.py | 610 ++++++++++++++++-- .../unittests/auto_parallel/test_comp_cost.py | 178 +++++ 2 files changed, 752 insertions(+), 36 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py index 28d2e2d5a3088..8958c4bf905c2 100644 --- a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py @@ -23,7 +23,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(AssignOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -41,7 +41,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(AssignValueOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -59,7 +59,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(BeamSearchOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -77,7 +77,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(BeamSearchDecodeOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a 
concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -95,7 +95,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(CastOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -113,7 +113,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ConcatOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -131,7 +131,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseAddOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -149,7 +149,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseAddGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -167,7 +167,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseDivOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -185,7 +185,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseDivGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -203,7 +203,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseMulOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -221,7 +221,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseMulGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def 
calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -239,7 +239,25 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(ElementwiseSubOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ElementwiseSubGradOpCost(CompOpCost): + OP_TYPE = "elementwise_sub_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ElementwiseSubGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -257,7 +275,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(EmbeddingOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -275,7 +293,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(EmbeddingGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -293,7 +311,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(FillConstantOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -311,7 +329,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(FillConstantBatchSizeLikeOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -329,7 +347,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(FillConstantBatchSizeLikeGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -347,7 +365,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(GatherOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will 
be filled in the future return 0 @@ -365,7 +383,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(GeluOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -383,7 +401,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(GeluGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -401,7 +419,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(GreaterEqualOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -419,7 +437,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(IncrementOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -433,7 +451,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(IsEmptyOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -447,7 +465,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LayerNormOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -465,7 +483,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LayerNormGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -483,7 +501,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LessThanOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -501,7 +519,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LogicalNotOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete 
COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -519,7 +537,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LogicalAndOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -537,7 +555,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LodResetOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -554,7 +572,7 @@ class LogOpCost(CompOpCost): def __init__(self, op=None, op_desc=None, cluster=None): super(LogOpCost, self).__init__(op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -572,7 +590,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LookupTableV2OpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -590,7 +608,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(LookupTableV2GradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -608,7 +626,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(MatmulOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -626,7 +644,7 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(MatmulGradOpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 @@ -644,7 +662,527 @@ def __init__(self, op=None, op_desc=None, cluster=None): super(MatmulV2OpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) - # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def 
calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class MatmulV2GradOpCost(CompOpCost): + OP_TYPE = "matmul_v2_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(MatmulV2GradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class MemcpyOpCost(CompOpCost): + OP_TYPE = "memcpy" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(MemcpyOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class MulOpCost(CompOpCost): + OP_TYPE = "mul" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(MulOpCost, self).__init__(op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class MulGradOpCost(CompOpCost): + OP_TYPE = "mul_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(MulGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class OneHotOpCost(CompOpCost): + OP_TYPE = "one_hot" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(OneHotOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ReadFromArrayOpCost(CompOpCost): + OP_TYPE = "read_from_array" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ReadFromArrayOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ReduceSumOpCost(CompOpCost): + OP_TYPE = "reduce_sum" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ReduceSumOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual 
formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ReduceSumGradOpCost(CompOpCost): + OP_TYPE = "reduce_sum_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ReduceSumGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class Reshape2OpCost(CompOpCost): + OP_TYPE = "reshape2" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(Reshape2OpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class Reshape2GradOpCost(CompOpCost): + OP_TYPE = "reshape2_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(Reshape2GradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ReduceMeanOpCost(CompOpCost): + OP_TYPE = "reduce_mean" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ReduceMeanOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ReduceMeanGradOpCost(CompOpCost): + OP_TYPE = "reduce_mean_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ReduceMeanGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SamplingIdOpCost(CompOpCost): + OP_TYPE = "sampling_id" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SamplingIdOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ScaleOpCost(CompOpCost): + OP_TYPE = "scale" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ScaleOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The 
actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SliceOpCost(CompOpCost): + OP_TYPE = "slice" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SliceOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SoftmaxOpCost(CompOpCost): + OP_TYPE = "softmax" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SoftmaxOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SoftmaxGradOpCost(CompOpCost): + OP_TYPE = "softmax_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SoftmaxGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SoftmaxWithCrossEntropyOpCost(CompOpCost): + OP_TYPE = "softmax_with_cross_entropy" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SoftmaxWithCrossEntropyOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SoftmaxWithCrossEntropyGradOpCost(CompOpCost): + OP_TYPE = "softmax_with_cross_entropy_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SoftmaxWithCrossEntropyGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SplitOpCost(CompOpCost): + OP_TYPE = "split" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SplitOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class Squeeze2OpCost(CompOpCost): + OP_TYPE = "squeeze2" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(Squeeze2OpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to 
be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SquareOpCost(CompOpCost): + OP_TYPE = "square" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SquareOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SquareGradOpCost(CompOpCost): + OP_TYPE = "square_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SquareGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class SumOpCost(CompOpCost): + OP_TYPE = "sum" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(SumOpCost, self).__init__(op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class TopKOpCost(CompOpCost): + OP_TYPE = "top_k" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(TopKOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class Transpose2OpCost(CompOpCost): + OP_TYPE = "transpose2" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(Transpose2OpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class Transpose2GradOpCost(CompOpCost): + OP_TYPE = "transpose2_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(Transpose2GradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class Unsqueeze2OpCost(CompOpCost): + OP_TYPE = "unsqueeze2" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(Unsqueeze2OpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # 
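# Illustrative sketch (assumptions): the cost classes added in this file keep
# calc_flops/calc_time as placeholders that return 0. A filled-in cost class
# would presumably follow the same register_op_cost/OP_TYPE pattern; the op
# type, element count and throughput below are hypothetical values and are
# not taken from this patch.
@register_op_cost
class ExampleElementwiseOpCost(CompOpCost):
    OP_TYPE = "example_elementwise"  # hypothetical op type

    def __init__(self, op=None, op_desc=None, cluster=None):
        super(ExampleElementwiseOpCost, self).__init__(
            op=op, op_desc=op_desc, cluster=cluster)

    def calc_flops(self):
        # assume one FLOP per output element; a real implementation would
        # derive the element count from the operator description
        output_numel = 1024 * 1024
        return output_numel

    def calc_time(self):
        # assume a flat device throughput instead of querying the cluster
        # description passed to the constructor
        flops_per_microsecond = 1.0e7
        return self.calc_flops() / flops_per_microsecond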
NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class WriteToArrayOpCost(CompOpCost): + OP_TYPE = "write_to_array" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(WriteToArrayOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): # NOTE: The actual formula will be filled in the future return 0 diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py index 4cdd51e42adf0..af7a44b5aaa23 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py @@ -54,6 +54,35 @@ from paddle.distributed.auto_parallel.cost.comp_op_cost import MatmulOpCost from paddle.distributed.auto_parallel.cost.comp_op_cost import MatmulGradOpCost from paddle.distributed.auto_parallel.cost.comp_op_cost import MatmulV2OpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import MatmulV2GradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import MemcpyOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import MulOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import MulGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import OneHotOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ReadFromArrayOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ReduceSumOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ReduceSumGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import Reshape2OpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import Reshape2GradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ReduceMeanOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ReduceMeanGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SamplingIdOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import ScaleOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SliceOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SoftmaxOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SoftmaxGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SoftmaxWithCrossEntropyOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SoftmaxWithCrossEntropyGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SplitOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import Squeeze2OpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SquareOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SquareGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import SumOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import TopKOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import Transpose2OpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import Transpose2GradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import Unsqueeze2OpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import WriteToArrayOpCost from test_cluster import cluster_json @@ 
-244,6 +273,155 @@ def test_comp_cost(self): self.assertTrue(op_cost.time >= 0) self.assertTrue(op_cost.memory >= 0) + op_cost = MatmulV2GradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = MemcpyOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = MulOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = MulGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = OneHotOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ReadFromArrayOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ReduceSumOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ReduceSumGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = Reshape2OpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = MatmulV2OpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = Reshape2GradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ReduceMeanOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ReduceMeanGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SamplingIdOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = ScaleOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SliceOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SoftmaxOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SoftmaxGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SoftmaxWithCrossEntropyOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SoftmaxWithCrossEntropyGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SplitOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = 
Squeeze2OpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SquareOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SquareGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = SumOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = TopKOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = Transpose2OpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = Transpose2GradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = Unsqueeze2OpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = WriteToArrayOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) # Remove unnecessary files if os.path.exists(cluster_json_path): os.remove(cluster_json_path) From 81622708a7c904092185ef04897b1e81629f51a6 Mon Sep 17 00:00:00 2001 From: huzhiqiang <912790387@qq.com> Date: Wed, 1 Jun 2022 10:32:03 +0800 Subject: [PATCH 106/109] [revert] revert inference accelarate #43125 --- paddle/fluid/framework/operator.cc | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 7dc885f54ab6c..69f14d7903c0b 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1259,11 +1259,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, RuntimeContext ctx(Inputs(), Outputs(), scope); RunImpl(scope, place, &ctx); pre_scope_ = cur_scope; - } else if (run_phi_kernel_ && impl_ != nullptr && !need_prepare_data_ && - !need_prepare_phi_data_) { - if (!all_kernels_must_compute_runtime_shape_) - this->Info().infer_shape_(impl_->getRuntimeInferShapeContext()); - (*pt_kernel_)(impl_->getKernelContext()); } else { if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) { std::lock_guard lock(cache_update_mutex_); @@ -1528,22 +1523,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); if (run_phi_kernel_) { + phi::KernelContext pt_kernel_context; + // Do data transform before building KernelContext + // TODO(zhiqiu): support TransferInplaceVarsBack PreparePhiData(exec_scope, *pt_kernel_, *kernel_signature_, runtime_ctx); - if (enable_cache_runtime_context_ && !need_prepare_phi_data_ && - !need_prepare_data_) { - impl_ = - new CacheImpl(new phi::KernelContext(), - new RuntimeInferShapeContext(*this, *runtime_ctx)); - BuildPhiKernelContext(*runtime_ctx, dev_ctx, impl_->getKernelContext()); - - (*pt_kernel_)(impl_->getKernelContext()); - } else { - phi::KernelContext pt_kernel_context; - // Do data transform before building KernelContext - // TODO(zhiqiu): support TransferInplaceVarsBack - BuildPhiKernelContext(*runtime_ctx, dev_ctx, 
&pt_kernel_context); - (*pt_kernel_)(&pt_kernel_context); - } + BuildPhiKernelContext(*runtime_ctx, dev_ctx, &pt_kernel_context); + (*pt_kernel_)(&pt_kernel_context); } else { (*kernel_func_)( ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx)); From 77bae9a45b4870006b1f3b12ee9ffdc319864a89 Mon Sep 17 00:00:00 2001 From: Guoxia Wang Date: Wed, 1 Jun 2022 11:36:24 +0800 Subject: [PATCH 107/109] fix the bug of adamw which set the attribute in param group not working (#43013) * fix the bug of adamw which set the attribute in param group not working * fix undefined variable * fix api example typo * add unittest * fix unittest typo --- .../fluid/tests/unittests/test_adamw_op.py | 109 +++++ python/paddle/optimizer/adamw.py | 412 ++++++++++++++---- 2 files changed, 431 insertions(+), 90 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index 3e2f112e964bb..225bd35a8ec9d 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -271,6 +271,115 @@ def test_adamw_op_dygraph(self): adam.clear_gradients() +class TestAdamWOpMultiPrecison(unittest.TestCase): + def _test_adamw_op_dygraph_place_amp(self, place, use_amp=False): + paddle.disable_static() + paddle.seed(10) + paddle.set_device(place) + + input = paddle.randn((5, 5)) + + model = paddle.nn.Linear(5, 5) + + optimizer = paddle.optimizer.AdamW( + parameters=[{ + 'params': model.parameters(), + 'weight_decay': 0.001, + 'beta1': 0.1, + 'beta2': 0.99 + }], + multi_precision=use_amp) + + for idx in range(2): + if place == 'gpu' and use_amp == True: + model = paddle.amp.decorate(models=model, level='O2') + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + + if place == 'gpu' and use_amp == True: + with paddle.amp.auto_cast(level='O2'): + output = model(input) + loss = paddle.mean(output) + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + optimizer.clear_grad() + else: + output = model(input) + loss = paddle.mean(output) + loss.backward() + optimizer.step() + optimizer.clear_grad() + + def _get_places(self): + places = ['cpu'] + if paddle.is_compiled_with_cuda(): + places.append('gpu') + return places + + def test_main(self): + for place in self._get_places(): + use_amp_list = [True, False] + for use_amp in use_amp_list: + self._test_adamw_op_dygraph_place_amp(place, use_amp) + + +class TestAdamWOpError(unittest.TestCase): + def test_api_errors(self): + def test_weight_decay_dtype(): + linear = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.AdamW( + learning_rate=0.01, + parameters=linear.parameters(), + weight_decay=1) + + def test_parameters_dtype1(): + adam = paddle.optimizer.AdamW( + learning_rate=0.01, + parameters=paddle.randn((5, 5)), + weight_decay=0.1) + + def test_parameters_dtype2(): + linear = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.AdamW( + learning_rate=0.01, + parameters={'params': linear.parameters()}, + weight_decay=0.1) + + def test_parameters_dtype3(): + adam = paddle.optimizer.AdamW( + learning_rate=0.01, parameters=None, weight_decay=0.1) + + def test_parameters_dtype4(): + linear = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.AdamW( + learning_rate=0.01, + parameters={'params': set(linear.parameters())}, + weight_decay=0.1) + + def test_learning_rate_dtype(): + linear = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.AdamW( + learning_rate=1, + parameters=linear.parameters(), + weight_decay=0.1) + + def 
test_grad_clip_dtype(): + linear = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.AdamW( + learning_rate=0.01, + parameters=linear.parameters(), + weight_decay=0.1, + grad_clip=0.1) + + self.assertRaises(TypeError, test_weight_decay_dtype) + self.assertRaises(TypeError, test_parameters_dtype1) + self.assertRaises(TypeError, test_parameters_dtype2) + self.assertRaises(AttributeError, test_parameters_dtype3) + self.assertRaises(TypeError, test_parameters_dtype4) + self.assertRaises(TypeError, test_learning_rate_dtype) + self.assertRaises(TypeError, test_grad_clip_dtype) + + class TestAdamWOpGroupWithLR(TestAdamWOp): def test_adamw_op_dygraph(self): paddle.disable_static() diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 0fa49745a95fb..0b61f3cb9a787 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -12,11 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings +from collections import defaultdict from .optimizer import Optimizer -from .adam import Adam +from .lr import LRScheduler from ..fluid import core from ..fluid import framework -from ..fluid.framework import Variable +from ..fluid.framework import Variable, Parameter +from ..fluid import unique_name +from ..fluid import layers +from ..fluid.layer_helper import LayerHelper +from ..fluid.clip import GradientClipBase from ..fluid.dygraph import base as imperative_base from collections.abc import Callable from .. import _C_ops @@ -25,7 +31,7 @@ __all__ = [] -class AdamW(Adam): +class AdamW(Optimizer): r""" The AdamW optimizer is implemented based on the AdamW Optimization in paper `DECOUPLED WEIGHT DECAY REGULARIZATION `_. @@ -102,14 +108,14 @@ class AdamW(Adam): beta1 = paddle.to_tensor([0.9], dtype="float32") beta2 = paddle.to_tensor([0.99], dtype="float32") - adam = paddle.optimizer.AdamW(learning_rate=0.1, + opt = paddle.optimizer.AdamW(learning_rate=0.1, parameters=linear.parameters(), beta1=beta1, beta2=beta2, weight_decay=0.01) out.backward() - adam.step() - adam.clear_grad() + opt.step() + opt.clear_grad() #Note that the learning_rate of linear_2 is 0.01. 
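# Usage sketch (values are illustrative, mirroring the new unit test above):
# with this fix, per-group options such as weight_decay/beta1/beta2 placed
# inside a parameter group take effect instead of silently falling back to
# the optimizer-level defaults.
import paddle

linear = paddle.nn.Linear(13, 5)
opt = paddle.optimizer.AdamW(
    learning_rate=0.01,
    parameters=[{
        'params': linear.parameters(),
        'weight_decay': 0.001,  # per-group override, now honored
        'beta1': 0.1,
        'beta2': 0.99
    }],
    weight_decay=0.01)  # default used by groups without an override

out = linear(paddle.rand([4, 13]))
loss = paddle.mean(out)
loss.backward()
opt.step()
opt.clear_grad()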
@@ -119,7 +125,7 @@ class AdamW(Adam): out = linear_1(inp) out = linear_2(out) loss = paddle.mean(out) - adam = paddle.optimizer.AdamW( + opt = paddle.optimizer.AdamW( learning_rate=0.1, parameters=[{ 'params': linear_1.parameters() @@ -132,11 +138,16 @@ class AdamW(Adam): weight_decay=0.01, beta1=0.9) out.backward() - adam.step() - adam.clear_grad() + opt.step() + opt.clear_grad() """ + _moment1_acc_str = "moment1" + _moment2_acc_str = "moment2" + _beta1_pow_acc_str = "beta1_pow_acc" + _beta2_pow_acc_str = "beta2_pow_acc" + def __init__(self, learning_rate=0.001, beta1=0.9, @@ -160,37 +171,108 @@ def __init__(self, raise ValueError("Invaild value of beta2, expect beta2 in [0,1).") if not 0 <= epsilon: raise ValueError("Invaild value of epsilon, expect epsilon >= 0.") - coeff = weight_decay - if not isinstance(coeff, float) and \ - not isinstance(coeff, framework.Variable): - raise TypeError("coeff should be float or Tensor.") - self._params_name = set() - self._apply_decay_param_fun = apply_decay_param_fun - self._coeff = coeff - self._lr_to_coeff = dict() + if not isinstance(weight_decay, float) and \ + not isinstance(weight_decay, framework.Variable): + raise TypeError("weight_decay should be float or Tensor.") if lr_ratio is not None: assert isinstance(lr_ratio, Callable) if not core.is_compiled_with_cuda(): raise NotImplementedError( "'lr_ratio' is unimplemented in CPU, XPU and NPU") - self._lr_ratio = lr_ratio - super(AdamW, self).__init__( - learning_rate=learning_rate, - parameters=parameters, - beta1=beta1, - beta2=beta2, - epsilon=epsilon, - grad_clip=grad_clip, - name=name, - lazy_mode=lazy_mode, - multi_precision=multi_precision) - self._default_dict = {'coeff': coeff} + if parameters is not None: + # paddle.Tensor is also iterable, so here we don't check whether + # the input is iterable, if the input is paddle.Tensor, the + # list(paddle.Tensor) will be a error value + if isinstance(parameters, (paddle.Tensor, core.eager.Tensor)): + raise TypeError( + "`parameters` argument given to the optimizer should be " + "an iterable of paddle Tensors, but got argument type is `{}`.". + format(type(parameters))) + if isinstance(parameters, dict): + raise TypeError( + "`parameters` argument should not get dict type, " + "if parameter groups is needed, please set `parameters`" + " as list of dict") + self._parameter_list = list(parameters) + else: + self._parameter_list = None + + self._name = name + if framework._non_static_mode(): + if self._parameter_list is None: + raise AttributeError( + "parameters argument given to the Optimizer should not be None in dygraph mode." + ) + + if not isinstance(learning_rate, (float, LRScheduler)): + raise TypeError( + "learning rate should be float or LRScheduler, got %s here" % + type(learning_rate)) + if grad_clip is not None: + if not isinstance(grad_clip, GradientClipBase): + raise TypeError( + "'grad_clip' should be an instance of GradientClipBase's derived class" + ) + + self._dtype = None + # Infer the dtype form parameter + if self._parameter_list: + if isinstance(self._parameter_list[0], dict): + for param_group in self._parameter_list: + assert 'params' in param_group, \ + 'params should be set in parameters if parameter groups are optimized in different options' + self._dtype = self._parameter_list[0]['params'][0].dtype + else: + self._dtype = self._parameter_list[0].dtype + + # each program should have a independent learning rate + # program -> tensor(learning_rate) + self._learning_rate_map = dict() + # Dictionary of accumulators. 
Some optimizer subclasses need to + # allocate and manage extra tensors associated with the parameters + # to train. These tensors are called accumulators. + # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...} + self._accumulators = defaultdict(lambda: dict()) + self.helper = None + self._opti_name_list = [] + self._accumulators_holder = {} + self._param_device_map = dict() + self.clear_gradients = self.clear_grad self.type = "adamw" + self._learning_rate = learning_rate + self._params_name = set() + self._apply_decay_param_fun = apply_decay_param_fun + self._weight_decay = weight_decay + self._grad_clip = grad_clip + self._lr_ratio = lr_ratio + self._beta1 = beta1 + self._beta2 = beta2 + self._epsilon = epsilon + self._lazy_mode = lazy_mode + self._multi_precision = multi_precision + self._master_weights = {} + + self._default_dict = { + 'weight_decay': weight_decay, + 'beta1': beta1, + 'beta2': beta2, + 'epsilon': epsilon, + 'lazy_mode': lazy_mode, + 'grad_clip': grad_clip + } + + self._param_groups = [] + if self._parameter_list and isinstance(self._parameter_list[0], dict): + for param_group in self._parameter_list: + self._add_param_group(param_group.copy()) + else: + self._param_groups = self._parameter_list - # Use _auxiliary_vars together with _set_auxiliary_var/_get_auxiliary_var to achieve that. - self._auxiliary_vars = dict() + self._use_multi_tensor = None + self.regularization = None + self._auxiliary_vars = {} def _set_auxiliary_var(self, key, val): self._auxiliary_vars[key] = val @@ -201,58 +283,128 @@ def _get_auxiliary_var(self, key): else: return None - def _append_decoupled_weight_decay(self, block, param_and_grad): + def _add_param_group(self, param_group): """ - Add decoupled weight decay op. - parameter = parameter - parameter * coeff * lr + Add a param group to parameter_list. + Args: - block: block in which variable is to be created - param_and_grad: (parameters, gradients) pairs, - the parameters need to decay. - Raises: - Exception: The type of coeff and parameter is not consistent. + param_group (dict): The group of Tensors to be optimzed with + different optimization options. """ - if isinstance(param_and_grad, dict): - param_and_grad = self._update_param_group(param_and_grad) - param, grad = param_and_grad + params = param_group['params'] + if isinstance(params, Parameter): + param_group['params'] = [params] + elif isinstance(params, set): + raise TypeError( + "optimizer parameters should be in ordered collections," + "but received set, please use list instead.") + else: + param_group['params'] = list(params) - if self._apply_decay_param_fun is not None \ - and not self._apply_decay_param_fun(param.name): - return + # Update optimization options for each groups + for k, v in self._default_dict.items(): + param_group.setdefault(k, v) + + param_set = set() + for group in self._param_groups: + param_set.update(set(group['params'])) + + if not param_set.isdisjoint(set(param_group['params'])): + raise ValueError( + "some parameters appear in more than one parameter group") - if isinstance(self._learning_rate, float): - learning_rate = self._learning_rate + for param in param_group['params']: + param.optimize_attr['learning_rate'] = param_group.get( + 'learning_rate', 1.) + + self._param_groups.append(param_group) + + def _create_master_weight(self, param): + if param.name in self._master_weights: + var = self._master_weights[param.name] else: - # NOTE. 
We add this function to the _append_optimize_op(), - # for we must make sure _create_param_lr() be called after - # optimizer._create_global_learning_rate(). - learning_rate = self._create_param_lr(param_and_grad) - - with block.program._optimized_guard( - [param, grad]), framework.name_scope('weight decay'): - self._params_name.add(param.name) - - # If it has been calculated, the result will be reused. - # NOTE(wangxi): In dygraph mode, apply_gradient will be executed - # every step, so need clear _lr_to_coeff every step, - # we do this in _create_optimization_pass - decay_coeff = self._lr_to_coeff.get(learning_rate, None) - if decay_coeff is None: - # NOTE(wangxi): for pipeline to set device:all - with paddle.static.device_guard(None): - decay_coeff = 1.0 - learning_rate * self._coeff - self._lr_to_coeff[learning_rate] = decay_coeff - - find_master = (self._multi_precision and - param.dtype == core.VarDesc.VarType.FP16) - if find_master: - master_weight = self._master_weights[param.name] - scaled_param = master_weight * decay_coeff - paddle.fluid.layers.assign( - input=scaled_param, output=master_weight) - else: - scaled_param = param * decay_coeff - paddle.fluid.layers.assign(input=scaled_param, output=param) + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + "_fp32_master" + var_name = unique_name.generate(var_name) + var = layers.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True) + block = self.helper.startup_program.global_block() + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32 + }) + self._master_weights[param.name] = var + return var + + def _get_accumulator(self, name, param): + """Utility function to fetch an accumulator for a parameter + Args: + name: name of the accumulator + param: parameter variable for which accumulator is to be fetched + Returns: + accumulator variable for the parameter + """ + if self._name is not None: + name = self._name + "_" + name + find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 + target_param = self._master_weights[ + param.name] if find_master else param + target_name = target_param.name + if (name not in self._accumulators or + target_name not in self._accumulators[name]): + raise Exception("Accumulator {} does not exist for parameter {}". 
+ format(name, target_name)) + return self._accumulators[name][target_name] + + def _add_moments_pows(self, p): + acc_dtype = p.dtype + if acc_dtype == core.VarDesc.VarType.FP16: + acc_dtype = core.VarDesc.VarType.FP32 + self._add_accumulator(self._moment1_acc_str, p, dtype=acc_dtype) + self._add_accumulator(self._moment2_acc_str, p, dtype=acc_dtype) + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + dtype=acc_dtype, + fill_value=0.9 if isinstance(self._beta1, Variable) \ + else self._beta1, + shape=[1], + type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + self._add_accumulator( + name=self._beta2_pow_acc_str, + param=p, + dtype=acc_dtype, + fill_value=0.999 if isinstance(self._beta2, Variable) \ + else self._beta2, + shape=[1], + type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + + def _create_accumulators(self, block, parameters): + assert isinstance(block, framework.Block) + if isinstance(parameters, dict): + parameters = self._update_param_group(parameters) + + # Create accumulator tensors for first and second moments + for p in parameters: + if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16: + master_p = self._create_master_weight(p) + self._add_moments_pows(master_p) + continue + if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision: + warnings.warn( + "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." + "Consider using multi_precision=True option of the Adam optimizer." + ) + self._add_moments_pows(p) def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) @@ -295,8 +447,9 @@ def _append_optimize_op(self, block, param_and_grad): _, _, _, _, _, _ = _C_ops.final_state_adamw( param_and_grad[0], param_and_grad[1], lr, moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, found_inf, - _beta1, _beta2, self._epsilon, lr_ratio_, self._coeff, - with_decay, self._lazy_mode, 1000, find_master, False) + _beta1, _beta2, self._epsilon, lr_ratio_, + self._weight_decay, with_decay, self._lazy_mode, 1000, + find_master, False) else: _, _, _, _, _, _ = _C_ops.adamw( param_and_grad[0], param_and_grad[1], lr, moment1, moment2, @@ -306,8 +459,8 @@ def _append_optimize_op(self, block, param_and_grad): 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread', 1000, 'beta1', _beta1, 'beta2', _beta2, "with_decay", with_decay, 'coeff', - self._coeff, 'multi_precision', find_master, 'lr_ratio', - lr_ratio_) + self._weight_decay, 'multi_precision', find_master, + 'lr_ratio', lr_ratio_) return None inputs = { @@ -338,7 +491,7 @@ def _append_optimize_op(self, block, param_and_grad): "min_row_size_to_use_multithread": 1000, "multi_precision": find_master, "with_decay": with_decay, - "coeff": self._coeff, + "coeff": self._weight_decay, "lr_ratio": 1. if self._lr_ratio is None else self._lr_ratio(param_and_grad[0]) } @@ -369,17 +522,96 @@ def _append_optimize_op(self, block, param_and_grad): return adamw_op - def _create_optimization_pass(self, parameters_and_grads): - optimize_ops = super( - AdamW, self)._create_optimization_pass(parameters_and_grads) - # In dygraph mode, clear _lr_to_coeff after applied gradient - self._lr_to_coeff = dict() - return optimize_ops - def __str__(self): return " ".join(["Weight Decay, params:", ",".join(self._params_name)]) + @imperative_base.no_grad + @framework.dygraph_only + def step(self): + """ + Execute the optimizer and update parameters once. + + Returns: + None + + Examples: + .. 
code-block:: python + + import paddle + + a = paddle.rand([2,13], dtype="float32") + linear = paddle.nn.Linear(13, 5) + # This can be any optimizer supported by dygraph. + opt = paddle.optimizer.AdamW(learning_rate = 0.01, + parameters = linear.parameters()) + out = linear(a) + out.backward() + opt.step() + opt.clear_grad() + """ + if not isinstance(self._parameter_list[0], dict): + params_grads = [] + for param in self._parameter_list: + if param.stop_gradient: + continue + if param._grad_ivar() is not None: + grad_var = param._grad_ivar() + if framework.in_dygraph_mode(): + if hasattr(grad_var, "is_selected_rows" + ) and grad_var.is_selected_rows( + ) and self.regularization is not None: + raise RuntimeError( + "AdamW don't support weight_decay with sparse parameters, please set it to None." + ) + else: + if hasattr(grad_var, + "_is_sparse") and grad_var._is_sparse( + ) and self.regularization is not None: + raise RuntimeError( + "AdamW don't support weight_decay with sparse parameters, please set it to None." + ) + params_grads.append((param, grad_var)) + + optimize_ops = self._apply_optimize( + loss=None, startup_program=None, params_grads=params_grads) + else: + # optimize parameters in groups + for param_group in self._param_groups: + params_grads = defaultdict(lambda: list()) + for param in param_group['params']: + if param.stop_gradient: + continue + if param._grad_ivar() is not None: + grad_var = param._grad_ivar() + if framework.in_dygraph_mode(): + if hasattr(grad_var, "is_selected_rows" + ) and grad_var.is_selected_rows( + ) and self.regularization is not None: + raise RuntimeError( + "AdamW don't support weight_decay with sparse parameters, please set it to None." + ) + else: + if hasattr(grad_var, + "_is_sparse") and grad_var._is_sparse( + ) and self.regularization is not None: + raise RuntimeError( + "AdamW don't support weight_decay with sparse parameters, please set it to None." 
+ ) + params_grads['params'].append((param, grad_var)) + params_grads.update( + {k: v + for k, v in param_group.items() if k != 'params'}) + self._apply_optimize( + loss=None, startup_program=None, params_grads=params_grads) + def _update_param_group(self, parameters): - self._coeff = parameters.get('coeff', self._default_dict['coeff']) + self._beta1 = parameters.get('beta1', self._default_dict['beta1']) + self._beta2 = parameters.get('beta2', self._default_dict['beta2']) + self._epsilon = parameters.get('epsilon', self._default_dict['epsilon']) + self._lazy_mode = parameters.get('lazy_mode', + self._default_dict['lazy_mode']) + self._weight_decay = parameters.get('weight_decay', + self._default_dict['weight_decay']) parameters = parameters.get('params') + return parameters From 0e10f247d609f5755e29f5a940d2e43c43fd17a6 Mon Sep 17 00:00:00 2001 From: zhiboniu <31800336+zhiboniu@users.noreply.github.com> Date: Wed, 1 Jun 2022 12:08:33 +0800 Subject: [PATCH 108/109] fluid code transfer in nn.functional (#42808) --- python/paddle/fluid/layers/loss.py | 141 +--- python/paddle/fluid/layers/nn.py | 152 +---- python/paddle/fluid/layers/sequence_lod.py | 31 +- python/paddle/framework/__init__.py | 1 + python/paddle/nn/functional/__init__.py | 4 +- python/paddle/nn/functional/activation.py | 2 +- python/paddle/nn/functional/common.py | 150 ++++- python/paddle/nn/functional/extension.py | 243 ++++++- python/paddle/nn/functional/loss.py | 632 ++++++++++++++++-- .../paddle/tensor/layer_function_generator.py | 11 +- 10 files changed, 986 insertions(+), 381 deletions(-) diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index b78865a0ece4e..99c0a2e70b771 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -336,28 +336,7 @@ def square_error_cost(input, label): # [0.01, 0.01] """ - if _non_static_mode(): - minus_out = _C_ops.elementwise_sub(input, label) - square_out = _C_ops.square(minus_out) - return square_out - - check_variable_and_dtype(input, "input", ['float32', 'float64'], - 'square_error_cost') - check_variable_and_dtype(label, "label", ['float32', 'float64'], - 'square_error_cost') - helper = LayerHelper('square_error_cost', **locals()) - minus_out = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op( - type='elementwise_sub', - inputs={'X': [input], - 'Y': [label]}, - outputs={'Out': [minus_out]}) - - square_out = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op( - type='square', inputs={'X': [minus_out]}, - outputs={'Out': [square_out]}) - return square_out + return paddle.nn.functional.square_error_cost(input, label) def edit_distance(input, @@ -433,45 +412,8 @@ def edit_distance(input, # [4] """ - check_variable_and_dtype(input, 'input', ['int64'], 'edit_distance') - check_variable_and_dtype(label, 'label', ['int64'], 'edit_distance') - helper = LayerHelper("edit_distance", **locals()) - - # remove some tokens from input and labels - if ignored_tokens is not None and len(ignored_tokens) > 0: - erased_input = helper.create_variable_for_type_inference(dtype="int64") - erased_label = helper.create_variable_for_type_inference(dtype="int64") - - helper.append_op( - type="sequence_erase", - inputs={"X": [input]}, - outputs={"Out": [erased_input]}, - attrs={"tokens": ignored_tokens}) - input = erased_input - - helper.append_op( - type="sequence_erase", - inputs={"X": [label]}, - outputs={"Out": [erased_label]}, - attrs={"tokens": ignored_tokens}) - label = 
erased_label - - this_inputs = {"Hyps": [input], "Refs": [label]} - if input_length is not None and label_length is not None: - this_inputs['HypsLength'] = [input_length] - this_inputs['RefsLength'] = [label_length] - - # edit distance op - edit_distance_out = helper.create_variable_for_type_inference(dtype="int64") - sequence_num = helper.create_variable_for_type_inference(dtype="int64") - helper.append_op( - type="edit_distance", - inputs=this_inputs, - outputs={"Out": [edit_distance_out], - "SequenceNum": [sequence_num]}, - attrs={"normalized": normalized}) - - return edit_distance_out, sequence_num + return paddle.nn.functional.loss.edit_distance( + input, label, normalized, ignored_tokens, input_length, label_length) def warpctc(input, @@ -1279,52 +1221,9 @@ def softmax_with_cross_entropy(logits, out = paddle.nn.functional.softmax_with_cross_entropy(logits=x, label=label) print(out) """ - if _non_static_mode(): - if core.is_compiled_with_npu(): - softmax, backprop, loss = _C_ops.softmax_with_cross_entropy( - logits, label, 'soft_label', soft_label, 'ignore_index', - ignore_index, 'numeric_stable_mode', numeric_stable_mode, - 'axis', axis) - else: - if in_dygraph_mode(): - softmax, loss = _C_ops.final_state_cross_entropy_with_softmax( - logits, label, soft_label, True, numeric_stable_mode, - ignore_index, axis) - if _in_legacy_dygraph(): - softmax, loss = _C_ops.softmax_with_cross_entropy( - logits, label, 'soft_label', soft_label, 'ignore_index', - ignore_index, 'numeric_stable_mode', numeric_stable_mode, - 'axis', axis) - if not return_softmax: - return loss - else: - return loss, softmax - - attrs = { - 'soft_label': soft_label, - 'ignore_index': ignore_index, - 'numeric_stable_mode': numeric_stable_mode, - 'axis': axis - } - helper = LayerHelper('softmax_with_cross_entropy', **locals()) - softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) - loss = helper.create_variable_for_type_inference(dtype=logits.dtype) - - outputs = {'Softmax': softmax, 'Loss': loss} - if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): - backprop = helper.create_variable_for_type_inference(dtype=logits.dtype) - outputs['Backprop'] = backprop - helper.append_op( - type='softmax_with_cross_entropy', - inputs={'Logits': logits, - 'Label': label}, - outputs=outputs, - attrs=attrs) - - if return_softmax: - return loss, softmax - - return loss + return paddle.nn.functional.loss.fluid_softmax_with_cross_entropy( + logits, label, soft_label, ignore_index, numeric_stable_mode, + return_softmax, axis) def rank_loss(label, left, right, name=None): @@ -1733,33 +1632,7 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002): print(npair_loss) """ - check_variable_and_dtype(anchor, 'anchor', ['float32', 'float64'], - 'npair_loss') - check_variable_and_dtype(positive, 'positive', ['float32', 'float64'], - 'positive') - check_variable_and_dtype(labels, 'labels', ['float32', 'float64', 'int64'], - 'labels') - Beta = 0.25 - batch_size = labels.shape[0] - - labels = nn.reshape(labels, shape=[batch_size, 1]) - labels = paddle.tile(labels, repeat_times=[1, batch_size]) - - labels = equal(labels, nn.transpose(labels, perm=[1, 0])).astype('float32') - labels = labels / nn.reduce_sum(labels, dim=1, keep_dim=True) - - l2loss = nn.reduce_mean(nn.reduce_sum(square(anchor), 1)) \ - + nn.reduce_mean(nn.reduce_sum(square(positive), 1)) - l2loss = l2loss * Beta * l2_reg - - similarity_matrix = paddle.matmul( - anchor, positive, transpose_x=False, transpose_y=True) - softmax_ce = 
softmax_with_cross_entropy( - logits=similarity_matrix, label=labels, soft_label=True) - cross_entropy = nn.reduce_sum(labels * softmax_ce, 0) - celoss = nn.reduce_mean(cross_entropy) - - return l2loss + celoss + return paddle.nn.functional.npair_loss(anchor, positive, labels, l2_reg) def mse_loss(input, label): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 3391654f93117..7fb9f6057b55a 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -7394,30 +7394,8 @@ def dice_loss(input, label, epsilon=0.00001, name=None): predictions = F.softmax(x) loss = F.dice_loss(input=predictions, label=label) """ - assert input.dtype in (paddle.float32, paddle.float64) - assert label.dtype in (paddle.int32, paddle.int64) - assert len(input.shape) >= 2, \ - "The rank of input should be greater than or equal to 2." - assert len(input.shape) == len(label.shape), ( - "The rank of input and label should be equal, " - "but received input: %d, label: %d." % - (len(input.shape), len(label.shape))) - assert label.shape[-1] == 1, ("The last dimension of label should be 1, " - "but received %d." % label.shape[-1]) - assert input.shape[:-1] == label.shape[:-1], ( - "All dimensions should be equal except the last one.") - assert input.numel() > 0 and label.numel() > 0, \ - "Any dimension of input and label cannot be equal to 0." - - label = squeeze(label, [-1]) - label = paddle.nn.functional.one_hot(label, input.shape[-1]) - reduce_dim = list(range(1, len(input.shape))) - inse = reduce_sum(input * label, dim=reduce_dim) - dice_denominator = reduce_sum( - input, dim=reduce_dim) + reduce_sum( - label, dim=reduce_dim) - dice_score = 1 - inse * 2 / (dice_denominator + epsilon) - return reduce_mean(dice_score) + return paddle.nn.functional.dice_loss( + input, label, epsilon=epsilon, name=name) def image_resize(input, @@ -13603,22 +13581,7 @@ def log_loss(input, label, epsilon=1e-4, name=None): prob = paddle.randn((10,1)) cost = F.log_loss(input=prob, label=label) """ - if in_dygraph_mode(): - return _C_ops.final_state_log_loss(input, label, epsilon) - - helper = LayerHelper('log_loss', **locals()) - check_variable_and_dtype(input, 'input', ['float32'], 'log_loss') - check_variable_and_dtype(label, 'label', ['float32'], 'log_loss') - - loss = helper.create_variable_for_type_inference(dtype=input.dtype) - - helper.append_op( - type='log_loss', - inputs={'Predicted': [input], - 'Labels': [label]}, - outputs={'Loss': [loss]}, - attrs={'epsilon': epsilon}) - return loss + return paddle.nn.functional.log_loss(input, label, epsilon, name) def add_position_encoding(input, alpha, beta, name=None): @@ -13922,33 +13885,8 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"): input = paddle.randn([6, 4, 2, 2]) out = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2) """ - if data_format not in ["NCHW", "NHWC"]: - raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'. 
" - "Received Attr(data_format): {}.".format(data_format)) - if _non_static_mode(): - return _C_ops.temporal_shift(x, 'seg_num', seg_num, 'shift_ratio', - shift_ratio, 'data_format', data_format) - - helper = LayerHelper("temporal_shift", **locals()) - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'temporal_shift') - check_type(seg_num, 'seg_num', int, 'temporal_shift') - check_type(shift_ratio, 'shift_ratio', float, 'temporal_shift') - - out = helper.create_variable_for_type_inference(dtype=x.dtype) - - if not isinstance(seg_num, int): - raise TypeError("seg_num must be int type.") - - helper.append_op( - type="temporal_shift", - inputs={"X": x}, - outputs={"Out": out}, - attrs={ - "seg_num": seg_num, - "shift_ratio": shift_ratio, - "data_format": data_format - }) - return out + return paddle.nn.functional.temporal_shift(x, seg_num, shift_ratio, name, + data_format) class PyFuncRegistry(object): @@ -15076,63 +15014,8 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): y = F.unfold(x, [3, 3], 1, 1, 1) """ - helper = LayerHelper("unfold", **locals()) - - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'unfold') - - assert len(x.shape) == 4, \ - "input should be the format of [N, C, H, W]" - - if isinstance(kernel_sizes, int): - kernel_sizes = [kernel_sizes, kernel_sizes] - else: - assert isinstance(kernel_sizes, list) and (len(kernel_sizes) == 2), \ - "kernel_sizes should either be an integer or a list of two integers" - - if isinstance(strides, int): - strides = [strides, strides] - else: - assert isinstance(strides, list) and (len(strides) == 2), \ - "strides should either be an integer or a list of two integers" - - if isinstance(dilations, int): - dilations = [dilations, dilations] - else: - assert isinstance(dilations, list) and (len(dilations) == 2), \ - "dilations should either be an integer or a list of two integers" - - if isinstance(paddings, int): - paddings = [paddings] * 4 - elif isinstance(paddings, list): - if len(paddings) == 2: - paddings = paddings * 2 - elif len(paddings) == 4: - pass - else: - raise ValueError( - "paddings should either be an integer or a list of 2 or 4 integers" - ) - else: - raise ValueError( - "Unexpected type of paddings, it should be either an integer or a list" - "of 2 or 4 integers") - - if in_dygraph_mode(): - return _C_ops.final_state_unfold(x, kernel_sizes, strides, paddings, - dilations) - - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type="unfold", - inputs={"X": x}, - outputs={"Y": out}, - attrs={ - "kernel_sizes": kernel_sizes, - "strides": strides, - "paddings": paddings, - "dilations": dilations - }) - return out + return paddle.nn.functional.unfold(x, kernel_sizes, strides, paddings, + dilations, name) def deformable_roi_pooling(input, @@ -15584,26 +15467,7 @@ def gather_tree(ids, parents): # [[[2, 2], [1, 6]], [[3, 3], [6, 1]], [[0, 1], [9, 0]]] """ - if in_dygraph_mode(): - return _C_ops.final_state_gather_tree(ids, parents) - else: - if _in_legacy_dygraph(): - return _C_ops.gather_tree(ids, parents) - else: - helper = LayerHelper('gather_tree', **locals()) - check_variable_and_dtype(ids, 'ids', ['int32', 'int64'], - 'gather_tree') - check_variable_and_dtype(parents, 'parents', ['int32', 'int64'], - 'gather_tree') - out = helper.create_variable_for_type_inference(dtype=ids.dtype) - - helper.append_op( - type="gather_tree", - inputs={"Ids": ids, - "Parents": parents}, - outputs={"Out": out}) - - return out + return 
paddle.nn.functional.gather_tree(ids, parents) @deprecated(since="2.0.0", update_to="paddle.uniform") diff --git a/python/paddle/fluid/layers/sequence_lod.py b/python/paddle/fluid/layers/sequence_lod.py index 80dc990af4556..702e38f3d2368 100644 --- a/python/paddle/fluid/layers/sequence_lod.py +++ b/python/paddle/fluid/layers/sequence_lod.py @@ -14,6 +14,7 @@ from __future__ import print_function +import paddle from .layer_function_generator import templatedoc from ..framework import core, Variable, _non_static_mode, in_dygraph_mode, _in_legacy_dygraph, convert_np_dtype_to_dtype_ from ..layer_helper import LayerHelper @@ -1382,35 +1383,7 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): """ - if in_dygraph_mode(): - if not isinstance(dtype, core.VarDesc.VarType): - dtype = convert_np_dtype_to_dtype_(dtype) - if maxlen is not None: - if isinstance(maxlen, core.eager.Tensor): - attrs = ('out_dtype', dtype) - out = _C_ops.sequence_mask(x, maxlen, *attrs) - else: - attrs = ('out_dtype', dtype, 'maxlen', maxlen) - out = _C_ops.sequence_mask(x, None, *attrs) - out.stop_gradient = True - return out - - helper = LayerHelper('sequence_mask', **locals()) - out = helper.create_variable_for_type_inference(dtype=dtype) - - inputs = {'X': [x]} - attrs = {'out_dtype': out.dtype} - if maxlen is not None: - if isinstance(maxlen, Variable): - inputs['MaxLenTensor'] = maxlen - else: - attrs['maxlen'] = maxlen - - helper.append_op( - type='sequence_mask', inputs=inputs, outputs={'Y': out}, attrs=attrs) - - out.stop_gradient = True - return out + return paddle.nn.functional.sequence_mask(x, maxlen, dtype, name) @templatedoc() diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index ffd1607fe87b4..a3584a73dfae1 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -55,5 +55,6 @@ from ..fluid.layer_helper import LayerHelper # noqa: F401 from ..fluid.framework import in_dygraph_mode # noqa: F401 +from ..fluid.framework import _in_legacy_dygraph # noqa: F401 __all__ = [] diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 44acf32894588..fa5a56c468620 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -119,8 +119,8 @@ from .vision import channel_shuffle # noqa: F401 from .input import one_hot # noqa: F401 from .input import embedding # noqa: F401 -from ...fluid.layers import gather_tree # noqa: F401 -from ...fluid.layers import temporal_shift # noqa: F401 +from .extension import gather_tree # noqa: F401 +from .extension import temporal_shift # noqa: F401 from .sparse_attention import sparse_attention diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 0dcc43565f25a..dd314868b69e2 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from ...fluid.layers import sigmoid # noqa: F401 +from ...tensor.ops import sigmoid # noqa: F401 from ...tensor.math import tanh # noqa: F401 from ...tensor.math import tanh_ # noqa: F401 diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index fe37b8fb97c3d..7fed1dbb487fa 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -21,7 +21,6 @@ from paddle.static import Variable from ...fluid import dygraph_utils # TODO: define the common functions to build a neural network -from ...fluid.layers import unfold # noqa: F401 from ...tensor.manipulation import squeeze from ...tensor.manipulation import unsqueeze from ...tensor import clip @@ -31,8 +30,6 @@ from ...fluid.framework import _varbase_creator, _in_legacy_dygraph, in_dygraph_mode, _non_static_mode from ...fluid import dygraph_utils -from ...fluid import layers -from ...fluid.data_feeder import check_variable_and_dtype from paddle import _C_ops from paddle.framework import in_dynamic_mode @@ -44,6 +41,135 @@ __all__ = [] +def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): + r""" + + This op returns a col buffer of sliding local blocks of input x, also known + as im2col for batched 2D image tensors. For each block under the convolution filter, + all element will be rearranged as a column. While the convolution filter sliding over + the input feature map, a series of such columns will be formed. + + For each input :math:`x` with shape [N, C, H, W], the output shape [N, Cout, Lout] + can be calculated as following. + + .. math:: + + dkernel[0] &= dilations[0] \times (kernel\_sizes[0] - 1) + 1 + + dkernel[1] &= dilations[1] \times (kernel\_sizes[1] - 1) + 1 + + hout &= \frac{H + paddings[0] + paddings[2] - dkernel[0]}{strides[0]} + 1 + + wout &= \frac{W + paddings[1] + paddings[3] - dkernel[1]}{strides[1]} + 1 + + Cout &= C \times kernel\_sizes[0] \times kernel\_sizes[1] + + Lout &= hout \times wout + + + Parameters: + x(Tensor): 4-D Tensor, input tensor of format [N, C, H, W], + data type can be float32 or float64 + kernel_sizes(int|list): The size of convolution kernel, should be [k_h, k_w] + or an integer k treated as [k, k]. + strides(int|list): The strides, should be [stride_h, stride_w] + or an integer stride treated as [sride, stride]. + For default, strides will be [1, 1]. + paddings(int|list): The paddings of each dimension, should be + [padding_top, padding_left, padding_bottom, padding_right] + or [padding_h, padding_w] or an integer padding. + If [padding_h, padding_w] was given, it will expanded to + [padding_h, padding_w, padding_h, padding_w]. If an integer + padding was given, [padding, padding, padding, padding] will + be used. For default, paddings will be [0, 0, 0, 0] + dilations(int|list): the dilations of convolution kernel, should be + [dilation_h, dilation_w], or an integer dilation treated as + [dilation, dilation]. For default, it will be [1, 1]. + name(str, optional): The default value is None. + Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` + + + Returns: + The tensor corresponding to the sliding local blocks. + The output shape is [N, Cout, Lout] as decriabled above. + Cout is the total number of values within each block, + and Lout is the total number of such blocks. + The data type of output is the same as the input :math:`x` + + Return Type: + Tensor + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.randn((100,3,224,224)) + y = F.unfold(x, [3, 3], 1, 1, 1) + """ + + helper = LayerHelper("unfold", **locals()) + + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'unfold') + + assert len(x.shape) == 4, \ + "input should be the format of [N, C, H, W]" + + if isinstance(kernel_sizes, int): + kernel_sizes = [kernel_sizes, kernel_sizes] + else: + assert isinstance(kernel_sizes, list) and (len(kernel_sizes) == 2), \ + "kernel_sizes should either be an integer or a list of two integers" + + if isinstance(strides, int): + strides = [strides, strides] + else: + assert isinstance(strides, list) and (len(strides) == 2), \ + "strides should either be an integer or a list of two integers" + + if isinstance(dilations, int): + dilations = [dilations, dilations] + else: + assert isinstance(dilations, list) and (len(dilations) == 2), \ + "dilations should either be an integer or a list of two integers" + + if isinstance(paddings, int): + paddings = [paddings] * 4 + elif isinstance(paddings, list): + if len(paddings) == 2: + paddings = paddings * 2 + elif len(paddings) == 4: + pass + else: + raise ValueError( + "paddings should either be an integer or a list of 2 or 4 integers" + ) + else: + raise ValueError( + "Unexpected type of paddings, it should be either an integer or a list" + "of 2 or 4 integers") + + if in_dygraph_mode(): + return _C_ops.final_state_unfold(x, kernel_sizes, strides, paddings, + dilations) + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type="unfold", + inputs={"X": x}, + outputs={"Y": out}, + attrs={ + "kernel_sizes": kernel_sizes, + "strides": strides, + "paddings": paddings, + "dilations": dilations + }) + return out + + def interpolate(x, size=None, scale_factor=None, @@ -1295,7 +1421,23 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): if mode == "constant" and isinstance(pad, ( list, tuple)) and len(pad) == x_dim * 2: - return layers.pad(x, pad, pad_value=value) + paddings = pad + pad_value = value + check_variable_and_dtype(x, 'x', [ + 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', + 'complex128' + ], "pad") + + helper = LayerHelper('pad', **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='pad', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'paddings': paddings, + 'pad_value': float(pad_value)}) + return out assert x_dim in [ 3, 4, 5 diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py index 2483eab6c053a..5a6bf4c0fa650 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -21,8 +21,12 @@ from ...tensor.creation import assign from ...fluid import dygraph_utils from ...tensor.layer_function_generator import templatedoc -from ...fluid.layers.sequence_lod import sequence_mask #noqa: F401 from paddle import in_dynamic_mode +from paddle import _C_ops +from ...fluid.framework import _non_static_mode, _in_legacy_dygraph, in_dygraph_mode +from ...fluid.data_feeder import check_variable_and_dtype, check_type +from ...framework import core +from ...common_ops_import import convert_np_dtype_to_dtype_ __all__ = [] @@ -140,3 +144,240 @@ def __check_input(input, offset, dim1, dim2): outputs={'Out': [out]}) out.stop_gradient = True return out + + +def sequence_mask(x, maxlen=None, dtype='int64', name=None): + r""" + 
**SequenceMask Layer** + + This layer outputs a mask according to the input :code:`x` and + :code:`maxlen` with data type of :code:`dtype`. + + Supposing :code:`x` is a Tensor with shape [d_1, d_2, ..., d_n], the + :code:`y` is a mask with shape [d_1, d_2, ..., d_n, maxlen], where: + + .. math:: + + y(i_1, i_2,..., i_n, j) = (j < x(i_1, i_2,..., i_n)) + + .. code-block:: text + + Case: + + Consider input: + x = [3, 1, 1, 0] max_len = 4 + + then we get out: + mask = [[1, 1, 1, 0], + [1, 0, 0, 0], + [1, 0, 0, 0], + [0, 0, 0, 0]] + + Args: + x (Variable): Input tensor of sequence_mask layer, \ + whose elements are integers less than :code:`maxlen`. \ + Tensor or LodTensor with shape [d_1, d_2, ..., d_n]. + maxlen (int, optional): Maximum length of the sequence. If :code:`maxlen` \ + is None, it would be replace with :math:`max(x)`. + dtype (np.dtype|paddle.dtype|str, optional): Data type of the output, \ + ``int64`` by default. + name(str, optional): For detailed information, please refer \ + to :ref:`api_guide_Name`. Usually name is no need to set and \ + None by default. + + Returns: The output sequence mask. Tensor with shape [d_1, d_2, ..., d_n, maxlen] \ + and data type of :code:`dtype`. The data type should be bool, float32, float64, int8, \ + int32 or int64. + + Return Type: Tensor + + Examples: + .. code-block:: python + + import paddle + + lengths = paddle.to_tensor([10, 9, 8]) + mask = paddle.nn.functional.sequence_mask(lengths) + + print(mask.numpy()) + # [[1 1 1 1 1 1 1 1 1 1] + # [1 1 1 1 1 1 1 1 1 0] + # [1 1 1 1 1 1 1 1 0 0]] + + """ + + if in_dygraph_mode(): + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + if maxlen is not None: + if isinstance(maxlen, core.eager.Tensor): + attrs = ('out_dtype', dtype) + out = _C_ops.sequence_mask(x, maxlen, *attrs) + else: + attrs = ('out_dtype', dtype, 'maxlen', maxlen) + out = _C_ops.sequence_mask(x, None, *attrs) + out.stop_gradient = True + return out + + helper = LayerHelper('sequence_mask', **locals()) + out = helper.create_variable_for_type_inference(dtype=dtype) + + inputs = {'X': [x]} + attrs = {'out_dtype': out.dtype} + if maxlen is not None: + if isinstance(maxlen, Variable): + inputs['MaxLenTensor'] = maxlen + else: + attrs['maxlen'] = maxlen + + helper.append_op( + type='sequence_mask', inputs=inputs, outputs={'Y': out}, attrs=attrs) + + out.stop_gradient = True + return out + + +def gather_tree(ids, parents): + r""" + To be used after beam search. After beam search, we get selected ids at + each time step and the corresponding parents in the search tree. Both ids + and parents have the layout :attr:`[max_time, batch_size, beam_size]`. Then + :attr:`gather_tree` is used to backtrace from the last time step and + generate the full sequences by collecting selected ids. + + Here is an example: + + .. code-block:: text + + Given: + ids = [[[2 2] + [6 1]] + [[3 9] + [6 1]] + [[0 1] + [9 0]]] + parents = [[[0 0] + [1 1]] + [[1 0] + [1 0]] + [[0 0] + [0 1]]] + + Then: + gather_tree(ids, parents) + = [[[2 2] + [1 6]] + [[3 3] + [6 1]] + [[0 1] + [9 0]]] + + Args: + ids(Tensor): A Tensor with shape :attr:`[length, batch_size, beam_size]` + and data type :attr:`int32` or :attr:`int64`. It contains the selected + ids of all time steps. + parents(Tensor): A Tensor with the same shape and data type as :attr:`ids`, + It contains the parents corresponding to selected ids when searching + among beams. + + Returns: + A Tensor with the same shape and data type as :attr:`ids`. 
\ + It contains the full sequences. The sequences are collected from \ + :attr:`ids` by backtracing according to :attr:`parents`. + + Examples: + .. code-block:: python + + import paddle + + ids = paddle.to_tensor([[[2, 2], [6, 1]], [[3, 9], [6, 1]], [[0, 1], [9, 0]]]) + + parents = paddle.to_tensor([[[0, 0], [1, 1]], [[1, 0], [1, 0]], [[0, 0], [0, 1]]]) + + final_sequences = paddle.nn.functional.gather_tree(ids, parents) + # [[[2, 2], [1, 6]], [[3, 3], [6, 1]], [[0, 1], [9, 0]]] + + """ + if in_dygraph_mode(): + return _C_ops.final_state_gather_tree(ids, parents) + else: + if _in_legacy_dygraph(): + return _C_ops.gather_tree(ids, parents) + else: + helper = LayerHelper('gather_tree', **locals()) + check_variable_and_dtype(ids, 'ids', ['int32', 'int64'], + 'gather_tree') + check_variable_and_dtype(parents, 'parents', ['int32', 'int64'], + 'gather_tree') + out = helper.create_variable_for_type_inference(dtype=ids.dtype) + + helper.append_op( + type="gather_tree", + inputs={"Ids": ids, + "Parents": parents}, + outputs={"Out": out}) + + return out + + +@templatedoc() +def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"): + """ + + **Temporal Shift Operator** + + ${comment} + + Args: + x(Tensor): ${x_comment} + seg_num(int): ${seg_num_comment} + shift_ratio(float): ${shift_ratio_comment} + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + data_format(str, optional): Data format that specifies the layout of input. + It can be "NCHW" or "NHWC". Default: "NCHW". + + Returns: + out(Tensor): The temporal shifting result is a tensor with the + same shape and same data type as the input. + + Raises: + TypeError: seg_num must be int type. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + input = paddle.randn([6, 4, 2, 2]) + out = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2) + """ + if data_format not in ["NCHW", "NHWC"]: + raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'. 
" + "Received Attr(data_format): {}.".format(data_format)) + if _non_static_mode(): + return _C_ops.temporal_shift(x, 'seg_num', seg_num, 'shift_ratio', + shift_ratio, 'data_format', data_format) + + helper = LayerHelper("temporal_shift", **locals()) + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'temporal_shift') + check_type(seg_num, 'seg_num', int, 'temporal_shift') + check_type(shift_ratio, 'shift_ratio', float, 'temporal_shift') + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + if not isinstance(seg_num, int): + raise TypeError("seg_num must be int type.") + + helper.append_op( + type="temporal_shift", + inputs={"X": x}, + outputs={"Out": out}, + attrs={ + "seg_num": seg_num, + "shift_ratio": shift_ratio, + "data_format": data_format + }) + return out diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index d08821e510c2b..c0527a7a65201 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -21,15 +21,7 @@ import paddle import paddle.fluid as fluid from ...fluid.layers.nn import _elementwise_op_in_dygraph -from ...fluid.layers import dice_loss # noqa: F401 -from ...fluid.layers import log_loss # noqa: F401 -from ...fluid.layers import npair_loss # noqa: F401 from ...tensor.manipulation import reshape -from ...fluid.layers import softmax_with_cross_entropy as fluid_softmax_with_cross_entropy -from ...fluid.layers import square_error_cost # noqa: F401 - -from ...fluid.layers import edit_distance # noqa: F401 -from ...fluid.layers import huber_loss from ...fluid.layer_helper import LayerHelper from ...fluid.framework import _varbase_creator from ...static import Variable @@ -41,6 +33,518 @@ __all__ = [] +def dice_loss(input, label, epsilon=0.00001, name=None): + r""" + + Dice loss for comparing the similarity between the input predictions and the label. + This implementation is for binary classification, where the input is sigmoid + predictions of each pixel, usually used for segmentation task. The dice loss can + be defined as the following equation: + + .. math:: + + dice\_loss &= 1 - \frac{2 * intersection\_area}{total\_area} \\ + &= \frac{(total\_area - intersection\_area) - intersection\_area}{total\_area} \\ + &= \frac{(union\_area - intersection\_area)}{total\_area} + + + Parameters: + input (Tensor): Tensor, rank>=2, shape is :math:`[N_1, N_2, ..., N_k, D]`, where :math:`N_1` is + the batch_size, :math:`D` is the number of categories. It is usually the output + predictions of sigmoid activation. The data type can be float32 or float64. + label (Tensor): Tensor, the groud truth with the same rank as input, shape is :math:`[N_1, N_2, ..., N_k, 1]`. + where :math:`N_1` is the batch_size. The data type can be int32 or int64. + epsilon (float): The epsilon will be added to the numerator and denominator. + If both input and label are empty, it makes sure dice is 1. + Default: 0.00001 + name(str, optional): The default value is None. + Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` + + Returns: + Tensor, which shape is [1], data type is the same as `input` . + + Example: + .. 
code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.randn((3,224,224,2)) + label = paddle.randint(high=2, shape=(3,224,224,1)) + predictions = F.softmax(x) + loss = F.dice_loss(input=predictions, label=label) + """ + assert input.dtype in (paddle.float32, paddle.float64) + assert label.dtype in (paddle.int32, paddle.int64) + assert len(input.shape) >= 2, \ + "The rank of input should be greater than or equal to 2." + assert len(input.shape) == len(label.shape), ( + "The rank of input and label should be equal, " + "but received input: %d, label: %d." % + (len(input.shape), len(label.shape))) + assert label.shape[-1] == 1, ("The last dimension of label should be 1, " + "but received %d." % label.shape[-1]) + assert input.shape[:-1] == label.shape[:-1], ( + "All dimensions should be equal except the last one.") + assert input.numel() > 0 and label.numel() > 0, \ + "Any dimension of input and label cannot be equal to 0." + + label = paddle.squeeze(label, [-1]) + label = paddle.nn.functional.one_hot(label, input.shape[-1]) + reduce_dim = list(range(1, len(input.shape))) + inse = paddle.sum(input * label, axis=reduce_dim) + dice_denominator = paddle.sum(input, axis=reduce_dim) + paddle.sum( + label, axis=reduce_dim) + dice_score = 1 - inse * 2 / (dice_denominator + epsilon) + return paddle.mean(dice_score) + + +def log_loss(input, label, epsilon=1e-4, name=None): + r""" + + **Negative Log Loss Layer** + + This layer accepts input predictions and target label and returns the + negative log loss. + + .. math:: + + Out = -label * \log{(input + \epsilon)} + - (1 - label) * \log{(1 - input + \epsilon)} + + Args: + input (Tensor|list): A 2-D tensor with shape [N x 1], where N is the + batch size. This input is a probability computed + by the previous operator. Data type float32. + label (Tensor|list): The ground truth which is a 2-D tensor with + shape [N x 1], where N is the batch size. + Data type float32. + epsilon (float, optional): A small number for numerical stability. Default 1e-4. + name(str|None): For detailed information, please refer to + :ref:`api_guide_Name` . Usually name is no need to set and None by default. + + Returns: + Tensor, which shape is [N x 1], data type is float32. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + label = paddle.randn((10,1)) + prob = paddle.randn((10,1)) + cost = F.log_loss(input=prob, label=label) + """ + if in_dygraph_mode(): + return _C_ops.final_state_log_loss(input, label, epsilon) + + helper = LayerHelper('log_loss', **locals()) + check_variable_and_dtype(input, 'input', ['float32'], 'log_loss') + check_variable_and_dtype(label, 'label', ['float32'], 'log_loss') + + loss = helper.create_variable_for_type_inference(dtype=input.dtype) + + helper.append_op( + type='log_loss', + inputs={'Predicted': [input], + 'Labels': [label]}, + outputs={'Loss': [loss]}, + attrs={'epsilon': epsilon}) + return loss + + +def fluid_softmax_with_cross_entropy(logits, + label, + soft_label=False, + ignore_index=-100, + numeric_stable_mode=True, + return_softmax=False, + axis=-1): + r""" + + This operator implements the cross entropy loss function with softmax. This function + combines the calculation of the softmax operation and the cross entropy loss function + to provide a more numerically stable gradient. + + Because this operator performs a softmax on logits internally, it expects + unscaled logits. 
This operator should not be used with the output of + softmax operator since that would produce incorrect results. + + When the attribute :attr:`soft_label` is set :attr:`False`, this operators + expects mutually exclusive hard labels, each sample in a batch is in exactly + one class with a probability of 1.0. Each sample in the batch will have a + single label. + + The equation is as follows: + + 1) Hard label (one-hot label, so every sample has exactly one class) + + .. math:: + + loss_j = -\\text{logits}_{label_j} + + \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{logits}_i)\\right), j = 1,..., K + + 2) Soft label (each sample can have a distribution over all classes) + + .. math:: + + loss_j = -\\sum_{i=0}^{K}\\text{label}_i + \\left(\\text{logits}_i - \\log\\left(\\sum_{i=0}^{K} + \\exp(\\text{logits}_i)\\right)\\right), j = 1,...,K + + 3) If :attr:`numeric_stable_mode` is :attr:`True`, softmax is calculated first by: + + .. math:: + + max_j &= \\max_{i=0}^{K}{\\text{logits}_i} + + log\\_max\\_sum_j &= \\log\\sum_{i=0}^{K}\\exp(logits_i - max_j) + + softmax_j &= \\exp(logits_j - max_j - {log\\_max\\_sum}_j) + + and then cross entropy loss is calculated by softmax and label. + + Args: + logits (Tensor): A multi-dimension ``Tensor`` , and the data type is float32 or float64. The input tensor of unscaled log probabilities. + label (Tensor): The ground truth ``Tensor`` , data type is the same + as the ``logits`` . If :attr:`soft_label` is set to :attr:`True`, + Label is a ``Tensor`` in the same shape with :attr:`logits`. + If :attr:`soft_label` is set to :attr:`True`, Label is a ``Tensor`` + in the same shape with :attr:`logits` expect shape in dimension :attr:`axis` as 1. + soft_label (bool, optional): A flag to indicate whether to interpretant the given + labels as soft labels. Default False. + ignore_index (int, optional): Specifies a target value that is ignored and does + not contribute to the input gradient. Only valid + if :attr:`soft_label` is set to :attr:`False`. + Default: kIgnoreIndex(-100). + numeric_stable_mode (bool, optional): A flag to indicate whether to use a more + numerically stable algorithm. Only valid + when :attr:`soft_label` is :attr:`False` + and GPU is used. When :attr:`soft_label` + is :attr:`True` or CPU is used, the + algorithm is always numerically stable. + Note that the speed may be slower when use + stable algorithm. Default: True. + return_softmax (bool, optional): A flag indicating whether to return the softmax + along with the cross entropy loss. Default: False. + axis (int, optional): The index of dimension to perform softmax calculations. It + should be in range :math:`[-1, rank - 1]`, while :math:`rank` + is the rank of input :attr:`logits`. Default: -1. + + Returns: + ``Tensor`` or Tuple of two ``Tensor`` : Return the cross entropy loss if \ + `return_softmax` is False, otherwise the tuple \ + (loss, softmax), softmax is in the same shape \ + with input logits and cross entropy loss is in \ + the same shape with input logits except shape \ + in dimension :attr:`axis` as 1. + + Examples: + .. 
code-block:: python + + import paddle + import numpy as np + + data = np.random.rand(128).astype("float32") + label = np.random.rand(1).astype("int64") + data = paddle.to_tensor(data) + label = paddle.to_tensor(label) + linear = paddle.nn.Linear(128, 100) + x = linear(data) + out = paddle.nn.functional.softmax_with_cross_entropy(logits=x, label=label) + print(out) + """ + if _non_static_mode(): + if core.is_compiled_with_npu(): + softmax, backprop, loss = _C_ops.softmax_with_cross_entropy( + logits, label, 'soft_label', soft_label, 'ignore_index', + ignore_index, 'numeric_stable_mode', numeric_stable_mode, + 'axis', axis) + else: + if in_dygraph_mode(): + softmax, loss = _C_ops.final_state_cross_entropy_with_softmax( + logits, label, soft_label, True, numeric_stable_mode, + ignore_index, axis) + if _in_legacy_dygraph(): + softmax, loss = _C_ops.softmax_with_cross_entropy( + logits, label, 'soft_label', soft_label, 'ignore_index', + ignore_index, 'numeric_stable_mode', numeric_stable_mode, + 'axis', axis) + if not return_softmax: + return loss + else: + return loss, softmax + + attrs = { + 'soft_label': soft_label, + 'ignore_index': ignore_index, + 'numeric_stable_mode': numeric_stable_mode, + 'axis': axis + } + helper = LayerHelper('softmax_with_cross_entropy', **locals()) + softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) + loss = helper.create_variable_for_type_inference(dtype=logits.dtype) + + outputs = {'Softmax': softmax, 'Loss': loss} + if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): + backprop = helper.create_variable_for_type_inference(dtype=logits.dtype) + outputs['Backprop'] = backprop + helper.append_op( + type='softmax_with_cross_entropy', + inputs={'Logits': logits, + 'Label': label}, + outputs=outputs, + attrs=attrs) + + if return_softmax: + return loss, softmax + + return loss + + +def npair_loss(anchor, positive, labels, l2_reg=0.002): + """ + + Npair loss requires paired data. Npair loss has two parts: the first part is L2 + regularizer on the embedding vector; the second part is cross entropy loss which + takes the similarity matrix of anchor and positive as logits. + + For more information, please refer to: + `Improved Deep Metric Learning with Multi class N pair Loss Objective `_ + + Args: + anchor(Tensor): embedding vector for the anchor image. shape=[batch_size, embedding_dims], + the data type is float32 or float64. + positive(Tensor): embedding vector for the positive image. shape=[batch_size, embedding_dims], + the data type is float32 or float64. + labels(Tensor): 1-D tensor. shape=[batch_size], the data type is float32 or float64 or int64. + l2_reg(float32): L2 regularization term on embedding vector, default: 0.002. + + + Returns: + A Tensor representing the npair loss, the data type is the same as anchor, the shape is [1]. + + Examples: + + .. 
code-block:: python + + import paddle + + DATATYPE = "float32" + + anchor = paddle.rand(shape=(18, 6), dtype=DATATYPE) + positive = paddle.rand(shape=(18, 6), dtype=DATATYPE) + labels = paddle.rand(shape=(18,), dtype=DATATYPE) + + npair_loss = paddle.nn.functional.npair_loss(anchor, positive, labels, l2_reg = 0.002) + print(npair_loss) + + """ + check_variable_and_dtype(anchor, 'anchor', ['float32', 'float64'], + 'npair_loss') + check_variable_and_dtype(positive, 'positive', ['float32', 'float64'], + 'positive') + check_variable_and_dtype(labels, 'labels', ['float32', 'float64', 'int64'], + 'labels') + Beta = 0.25 + batch_size = labels.shape[0] + + labels = paddle.reshape(labels, shape=[batch_size, 1]) + labels = paddle.tile(labels, repeat_times=[1, batch_size]) + + labels = paddle.equal( + labels, paddle.transpose( + labels, perm=[1, 0])).astype('float32') + labels = labels / paddle.sum(labels, axis=1, keepdim=True) + + l2loss = paddle.mean(paddle.sum(paddle.square(anchor), 1)) \ + + paddle.mean(paddle.sum(paddle.square(positive), 1)) + l2loss = l2loss * Beta * l2_reg + + similarity_matrix = paddle.matmul( + anchor, positive, transpose_x=False, transpose_y=True) + softmax_ce = fluid_softmax_with_cross_entropy( + logits=similarity_matrix, label=labels, soft_label=True) + cross_entropy = paddle.sum(labels * softmax_ce, 0) + celoss = paddle.mean(cross_entropy) + + return l2loss + celoss + + +def square_error_cost(input, label): + r""" + + This op accepts input predictions and target label and returns the + squared error cost. + + For predictions label, and target label, the equation is: + + .. math:: + + Out = (input - label)^2 + + Parameters: + input (Tensor): Input tensor, the data type should be float32. + label (Tensor): Label tensor, the data type should be float32. + + Returns: + The tensor storing the element-wise squared error \ + difference between input and label. + + Return type: Tensor. + + Examples: + + .. code-block:: python + + import paddle + input = paddle.to_tensor([1.1, 1.9]) + label = paddle.to_tensor([1.0, 2.0]) + output = paddle.nn.functional.square_error_cost(input, label) + print(output) + # [0.01, 0.01] + + """ + if _non_static_mode(): + minus_out = _C_ops.elementwise_sub(input, label) + square_out = _C_ops.square(minus_out) + return square_out + + check_variable_and_dtype(input, "input", ['float32', 'float64'], + 'square_error_cost') + check_variable_and_dtype(label, "label", ['float32', 'float64'], + 'square_error_cost') + helper = LayerHelper('square_error_cost', **locals()) + minus_out = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type='elementwise_sub', + inputs={'X': [input], + 'Y': [label]}, + outputs={'Out': [minus_out]}) + + square_out = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type='square', inputs={'X': [minus_out]}, + outputs={'Out': [square_out]}) + return square_out + + +def edit_distance(input, + label, + normalized=True, + ignored_tokens=None, + input_length=None, + label_length=None): + """ + This op computes the edit distances, also called Levenshtein distance, between a batch of + hypothesis strings and their references. It measures how dissimilar two strings are by counting + the minimum number of operations to transform one string into another. + The operations include insertion, deletion, and substitution. 
+ + For example, given hypothesis string A = "kitten" and reference + B = "sitting", A will be transformed into B + at least after two substitutions and one insertion: + + "kitten" -> "sitten" -> "sittin" -> "sitting" + + So the edit distance between A and B is 3. + + The input is a Tensor, the input_length and label_length should be supported. + + The `batch_size` of labels should be same as `input`. + + The output include the edit distance value between every pair of input and related label, and the number of sequence. + If Attr(normalized) is true, + the edit distance value will be divided by the length of label. + + Parameters: + input(Tensor): The input tensor, its rank should be equal to 2 and its data type should be int64. + label(Tensor): The label tensor, its rank should be equal to 2 and its data type should be int64. + normalized(bool, default True): Indicated whether to normalize the edit distance. + ignored_tokens(list, default None): Tokens that will be removed before + calculating edit distance. + input_length(Tensor): The length for each sequence in `input` if it's of Tensor type, it should have shape `(batch_size, )` and its data type should be int64. + label_length(Tensor): The length for each sequence in `label` if it's of Tensor type, it should have shape `(batch_size, )` and its data type should be int64. + NOTE: To be avoid unexpected result, the value of every elements in input_length and label_length should be equal to the value of the second dimension of input and label. For example, The input: [[1,2,3,4],[5,6,7,8],[9,10,11,12]], the shape of input is [3,4] and the input_length should be [4,4,4] + NOTE: This Api is different from fluid.metrics.EditDistance + + Returns: + Tuple: + + distance(Tensor): edit distance result, its data type is float32, and its shape is (batch_size, 1). + sequence_num(Tensor): sequence number, its data type is float32, and its shape is (1,). + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + input = paddle.to_tensor([[1,2,3],[4,5,6],[4,4,4],[1,1,1]], dtype='int64') + label = paddle.to_tensor([[1,3,4,1],[4,5,8,1],[7,7,7,1],[1,1,1,1]], dtype='int64') + input_len = paddle.to_tensor([3,3,3,3], dtype='int64') + label_len = paddle.to_tensor([4,4,4,4], dtype='int64') + + distance, sequence_num = F.loss.edit_distance(input=input, label=label, input_length=input_len, label_length=label_len, normalized=False) + + # print(distance) + # [[3.] + # [2.] + # [4.] + # [1.]] + # if set normalized to True + # [[0.75] + # [0.5 ] + # [1. 
] + # [0.25] + # + # print(sequence_num) + # [4] + + """ + check_variable_and_dtype(input, 'input', ['int64'], 'edit_distance') + check_variable_and_dtype(label, 'label', ['int64'], 'edit_distance') + helper = LayerHelper("edit_distance", **locals()) + + # remove some tokens from input and labels + if ignored_tokens is not None and len(ignored_tokens) > 0: + erased_input = helper.create_variable_for_type_inference(dtype="int64") + erased_label = helper.create_variable_for_type_inference(dtype="int64") + + helper.append_op( + type="sequence_erase", + inputs={"X": [input]}, + outputs={"Out": [erased_input]}, + attrs={"tokens": ignored_tokens}) + input = erased_input + + helper.append_op( + type="sequence_erase", + inputs={"X": [label]}, + outputs={"Out": [erased_label]}, + attrs={"tokens": ignored_tokens}) + label = erased_label + + this_inputs = {"Hyps": [input], "Refs": [label]} + if input_length is not None and label_length is not None: + this_inputs['HypsLength'] = [input_length] + this_inputs['RefsLength'] = [label_length] + + # edit distance op + edit_distance_out = helper.create_variable_for_type_inference(dtype="int64") + sequence_num = helper.create_variable_for_type_inference(dtype="int64") + helper.append_op( + type="edit_distance", + inputs=this_inputs, + outputs={"Out": [edit_distance_out], + "SequenceNum": [sequence_num]}, + attrs={"normalized": normalized}) + + return edit_distance_out, sequence_num + + def binary_cross_entropy(input, label, weight=None, reduction='mean', name=None): """ @@ -138,10 +642,10 @@ def binary_cross_entropy(input, label, weight=None, reduction='mean', else: return out else: - fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'binary_cross_entropy') - fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['float32', 'float64'], 'binary_cross_entropy') + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'binary_cross_entropy') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'binary_cross_entropy') sub_name = name if weight is None and reduction == 'none' else None helper = LayerHelper("binary_cross_entropy", name=sub_name) @@ -288,12 +792,10 @@ def binary_cross_entropy_with_logits(logit, else: return out - fluid.data_feeder.check_variable_and_dtype( - logit, 'logit', ['float32', 'float64'], - 'binary_cross_entropy_with_logits') - fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['float32', 'float64'], - 'binary_cross_entropy_with_logits') + check_variable_and_dtype(logit, 'logit', ['float32', 'float64'], + 'binary_cross_entropy_with_logits') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'binary_cross_entropy_with_logits') sigmoid_name = None if reduction == 'none' and pos_weight is None and weight is None: sigmoid_name = name @@ -303,18 +805,17 @@ def binary_cross_entropy_with_logits(logit, one = paddle.full(shape=[1], fill_value=1.0, dtype=logit.dtype) if pos_weight is not None: - fluid.data_feeder.check_variable_and_dtype( - pos_weight, 'pos_weight', ['float32', 'float64'], - 'binary_cross_entropy_with_logits') + check_variable_and_dtype(pos_weight, 'pos_weight', + ['float32', 'float64'], + 'binary_cross_entropy_with_logits') log_weight = paddle.add( paddle.multiply(label, paddle.subtract(pos_weight, one)), one) pos_weight_name = name if reduction == 'none' and weight is None else None out = paddle.multiply(out, log_weight, name=pos_weight_name) if weight is not None: - fluid.data_feeder.check_variable_and_dtype( - weight, 'weight', 
['float32', 'float64'], - 'binary_cross_entropy_with_logits') + check_variable_and_dtype(weight, 'weight', ['float32', 'float64'], + 'binary_cross_entropy_with_logits') weight_name = name if reduction == 'none' else None out = paddle.multiply(out, weight, name=weight_name) @@ -519,12 +1020,26 @@ def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None): output = paddle.nn.functional.smooth_l1_loss(input, label) print(output) """ - fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'smooth_l1_loss') - fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['float32', 'float64'], 'smooth_l1_loss') + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'smooth_l1_loss') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'smooth_l1_loss') - out = huber_loss(input=input, label=label, delta=delta) + if in_dygraph_mode(): + out, residual = _C_ops.final_state_huber_loss(input, label, delta) + else: + helper = LayerHelper('huber_loss', **locals()) + residual = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + out = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + helper.append_op( + type='huber_loss', + inputs={'X': input, + 'Y': label}, + outputs={'Out': out, + 'Residual': residual}, + attrs={'delta': delta}) if reduction not in ['sum', 'mean', 'none']: raise ValueError( @@ -615,12 +1130,12 @@ def margin_ranking_loss(input, return out helper = LayerHelper("margin_ranking_loss", **locals()) - fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'margin_rank_loss') - fluid.data_feeder.check_variable_and_dtype( - other, 'other', ['float32', 'float64'], 'margin_rank_loss') - fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['float32', 'float64'], 'margin_rank_loss') + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'margin_rank_loss') + check_variable_and_dtype(other, 'other', ['float32', 'float64'], + 'margin_rank_loss') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'margin_rank_loss') out = paddle.subtract(other, input) out = paddle.multiply(out, label) @@ -738,9 +1253,9 @@ def l1_loss(input, label, reduction='mean', name=None): else: return unreduced - fluid.data_feeder.check_variable_and_dtype( + check_variable_and_dtype( input, 'input', ['float32', 'float64', 'int32', 'int64'], 'l1_loss') - fluid.data_feeder.check_variable_and_dtype( + check_variable_and_dtype( label, 'label', ['float32', 'float64', 'int32', 'int64'], 'l1_loss') if reduction == 'sum': @@ -847,10 +1362,8 @@ def nll_loss(input, label = reshape(label, shape=[n, 1, -1]) out_shape = [n] + input_shape[2:] - fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'nll_loss') - fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'], - 'nll_loss') + check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'nll_loss') + check_variable_and_dtype(label, 'label', ['int64'], 'nll_loss') inputs = {'X': input, 'Label': label} attrs = {'reduction': reduction, 'ignore_index': ignore_index} if weight is not None: @@ -971,10 +1484,8 @@ def kl_div(input, label, reduction='mean', name=None): helper = LayerHelper('kl_div', **locals()) - fluid.data_feeder.check_variable_and_dtype(input, 'input', - ['float32', 'float64'], 'kl_div') - fluid.data_feeder.check_variable_and_dtype(label, 'label', - ['float32', 'float64'], 'kl_div') + check_variable_and_dtype(input, 'input', 
['float32', 'float64'], 'kl_div') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], 'kl_div') fluid.data_feeder.check_type(reduction, 'reduction', str, 'kl_div') loss = helper.create_variable_for_type_inference(dtype=input.dtype) @@ -1051,10 +1562,10 @@ def mse_loss(input, label, reduction='mean', name=None): "but received {}.".format(reduction)) if not in_dynamic_mode(): - paddle.fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'mse_loss') - paddle.fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['float32', 'float64'], 'mse_loss') + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'mse_loss') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'mse_loss') if reduction == 'none': return paddle.square(paddle.subtract(input, label), name=name) @@ -1858,9 +2369,9 @@ def cross_entropy(input, out = paddle.squeeze(out, axis=axis) return out - fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'softmax_cross_entropy') - fluid.data_feeder.check_variable_and_dtype( + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'softmax_cross_entropy') + check_variable_and_dtype( label, 'label', ['uint8', 'int8', 'int16', 'int32', 'int64', 'float32', 'float64'], 'softmax_cross_entropy') @@ -1887,8 +2398,8 @@ def cross_entropy(input, attrs=attrs) if weight is not None: - fluid.data_feeder.check_variable_and_dtype( - weight, 'weight', ['float32', 'float64'], 'softmax_cross_entropy') + check_variable_and_dtype(weight, 'weight', ['float32', 'float64'], + 'softmax_cross_entropy') weight_name = name if reduction == 'none' else None if soft_label == True: # chajchaj: @@ -2050,9 +2561,8 @@ def sigmoid_focal_loss(logit, % reduction) if normalizer is not None: - fluid.data_feeder.check_variable_and_dtype(normalizer, 'normalizer', - ['float32', 'float64'], - 'sigmoid_focal_loss') + check_variable_and_dtype(normalizer, 'normalizer', + ['float32', 'float64'], 'sigmoid_focal_loss') normalizer_shape = list(normalizer.shape) normalizer_dims = len(normalizer_shape) if normalizer_dims > 1: @@ -2102,10 +2612,10 @@ def sigmoid_focal_loss(logit, return loss - fluid.data_feeder.check_variable_and_dtype( - logit, 'logit', ['float32', 'float64'], 'sigmoid_focal_loss') - fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['float32', 'float64'], 'sigmoid_focal_loss') + check_variable_and_dtype(logit, 'logit', ['float32', 'float64'], + 'sigmoid_focal_loss') + check_variable_and_dtype(label, 'label', ['float32', 'float64'], + 'sigmoid_focal_loss') bce_name = None if reduction == 'none' and normalizer is None: diff --git a/python/paddle/tensor/layer_function_generator.py b/python/paddle/tensor/layer_function_generator.py index 7f95dd60eda8a..72e5eb640125d 100644 --- a/python/paddle/tensor/layer_function_generator.py +++ b/python/paddle/tensor/layer_function_generator.py @@ -21,7 +21,7 @@ from six.moves import cStringIO from ..static import Variable from ..fluid.proto import framework_pb2 -from ..framework import OpProtoHolder, core, convert_np_dtype_to_dtype_, _non_static_mode, in_dygraph_mode +from ..framework import OpProtoHolder, core, convert_np_dtype_to_dtype_, _non_static_mode, in_dygraph_mode, _in_legacy_dygraph from ..framework import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype import paddle @@ -271,9 +271,10 @@ def func(x, name=None): op_type) else: # abs exp square ops support dtype(int32, int64, float16, float32, float64) - 
check_variable_and_dtype(
-            x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'],
-            op_type)
+        check_variable_and_dtype(x, 'x', [
+            'int32', 'int64', 'float16', 'float32', 'float64', 'complex64',
+            'complex128'
+        ], op_type)

     helper = LayerHelper(op_type, **locals())

@@ -302,7 +303,7 @@ def generate_inplace_fn(inplace_op_type):
     origin_op_type = inplace_op_type[:-1]

     def func(x, name=None):
-        if paddle.in_dynamic_mode():
+        if _non_static_mode():
             op = getattr(_C_ops, inplace_op_type)
             return op(x)
         warnings.warn(

From c4b7c4852e85673b2ced5f1d5ba24ae575aa1c75 Mon Sep 17 00:00:00 2001
From: Ruibiao Chen
Date: Wed, 1 Jun 2022 12:22:35 +0800
Subject: [PATCH 109/109] Add pinned memory to host memory stats (#43096)

* Add pinned memory to HostMemoryStats

* Add macro for WrapStatAllocator

* Fix CI errors
---
 paddle/fluid/memory/allocation/allocator_facade.cc | 8 +++++++-
 paddle/fluid/memory/allocation/pinned_allocator.cc | 4 +++-
 paddle/fluid/memory/allocation/stat_allocator.h    | 8 +++++---
 paddle/fluid/memory/detail/system_allocator.cc     | 2 ++
 4 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 99152607158eb..46e1a500e4870 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -931,7 +931,13 @@ class AllocatorFacadePrivate {

   void WrapStatAllocator() {
     for (auto& pair : allocators_) {
-      pair.second = std::make_shared<StatAllocator>(pair.second);
+      // Now memory stats is only supported for CPU and GPU
+      const platform::Place& place = pair.first;
+      if (platform::is_cpu_place(place) ||
+          platform::is_cuda_pinned_place(place) ||
+          platform::is_gpu_place(place)) {
+        pair.second = std::make_shared<StatAllocator>(pair.second);
+      }
     }
   }

diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc
index 276c6bb0e69b8..5e5aea6dab2cc 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ b/paddle/fluid/memory/allocation/pinned_allocator.cc
@@ -13,7 +13,7 @@
 // limitations under the License.
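// The allocator_facade.cc hunk above narrows StatAllocator wrapping to the
// places whose statistics are actually tracked. A minimal standalone sketch
// of that predicate, not part of the patch itself; it reuses the
// platform::is_*_place helpers the hunk already calls (header path assumed):
#include "paddle/fluid/platform/place.h"

namespace {
// True when allocations for this place are counted: CPU and CUDA-pinned
// memory go to the host stats, GPU memory to the per-device stats.
bool SupportsMemoryStats(const paddle::platform::Place& place) {
  return paddle::platform::is_cpu_place(place) ||
         paddle::platform::is_cuda_pinned_place(place) ||
         paddle::platform::is_gpu_place(place);
}
}  // namespace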
#include "paddle/fluid/memory/allocation/pinned_allocator.h" - +#include "paddle/fluid/memory/stats.h" namespace paddle { namespace memory { namespace allocation { @@ -24,6 +24,7 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { #else PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr())); #endif + HOST_MEMORY_STAT_UPDATE(Reserved, 0, -allocation->size()); delete allocation; } phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { @@ -33,6 +34,7 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { #else PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); #endif + HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); return new Allocation(ptr, size, platform::CUDAPinnedPlace()); } } // namespace allocation diff --git a/paddle/fluid/memory/allocation/stat_allocator.h b/paddle/fluid/memory/allocation/stat_allocator.h index 68209bbaabeca..8b54b961596c2 100644 --- a/paddle/fluid/memory/allocation/stat_allocator.h +++ b/paddle/fluid/memory/allocation/stat_allocator.h @@ -45,11 +45,13 @@ class StatAllocator : public Allocator { phi::Allocator::AllocationPtr allocation = underlying_allocator_->Allocate(size); - if (platform::is_cpu_place(allocation->place())) { - HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + const platform::Place& place = allocation->place(); + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + HOST_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(), allocation->size()); } else { - DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), + DEVICE_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(), allocation->size()); } return allocation.release(); diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 06038804e6efe..e1077d66c54ec 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -211,6 +211,7 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { if (result == gpuSuccess) { *index = 1; // PINNED memory cuda_pinnd_alloc_size_ += size; + HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); return p; } else { LOG(WARNING) << "cudaHostAlloc failed."; @@ -255,6 +256,7 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { err)); } #endif + HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size); } bool CUDAPinnedAllocator::UseGpu() const { return false; }
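// Taken together, the pinned_allocator.cc and system_allocator.cc hunks above
// follow one pairing rule: every successful pinned allocation adds its size to
// the host "Reserved" stat and every free subtracts the same amount, always on
// slot 0 because host memory is not tracked per device. A minimal standalone
// sketch of that pattern, not part of the patch; it assumes only the CUDA
// runtime API and the HOST_MEMORY_STAT_UPDATE macro added by this change:
#include <cuda_runtime.h>
#include "paddle/fluid/memory/stats.h"  // provides HOST_MEMORY_STAT_UPDATE

void* AllocPinned(size_t size) {
  void* ptr = nullptr;
  if (cudaHostAlloc(&ptr, size, cudaHostAllocPortable) != cudaSuccess) {
    return nullptr;  // allocation failed, nothing to record
  }
  HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);   // stat grows with the block
  return ptr;
}

void FreePinned(void* ptr, size_t size) {
  cudaFreeHost(ptr);
  HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size);  // and shrinks symmetrically
}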