diff --git a/AUTHORS.md b/AUTHORS.md index e5481d83de190..a8ea5c46e94d2 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -57,6 +57,7 @@ | reyoung | Yang Yu | | [Sand3r-](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg)| Michal Gallus | | [sfraczek](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg)| Sylwester Fraczek | +| Silv3S | Slawomir Siwek | | sneaxiy | Jin-Le Zeng | | Superjom | Chun-Wei Yan | | tensor-tang | Jian Tang | diff --git a/CMakeLists.txt b/CMakeLists.txt index 9002cb287e855..ff49ba164dd7f 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,7 +100,11 @@ if(APPLE AND WITH_ARM) endif() if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") + if(WITH_ARM_BRPC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1") + else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") + endif() endif() if(WIN32) @@ -386,7 +390,7 @@ if(WITH_DISTRIBUTE) if(LINUX) set(WITH_GLOO ON CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE) endif() - if(WITH_ASCEND_CL) + if(WITH_ASCEND_CL AND NOT WITH_ARM_BRPC) # disable WITH_PSCORE for NPU before include third_party MESSAGE(WARNING "Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.") set(WITH_PSCORE OFF CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE) diff --git a/README.md b/README.md index c4c5decec5430..21e0aba8b48bf 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ Now our developers can acquire Tesla V100 online computing resources for free. I [Click here to learn more](https://github.com/PaddlePaddle/Fleet) -- **High-Performance Inference Engines for Comprehensive Deployment Enviroments** +- **High-Performance Inference Engines for Comprehensive Deployment Environments** PaddlePaddle is not only compatible with models trained in 3rd party open-source frameworks , but also offers complete inference products for various production scenarios. Our inference product line includes [Paddle Inference](https://paddle-inference.readthedocs.io/en/latest/product_introduction/summary.html): Native inference library for high-performance server and cloud inference; [Paddle Serving](https://github.com/PaddlePaddle/Serving): A service-oriented framework suitable for distributed and pipeline productions; [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite): Ultra-Lightweight inference engine for mobile and IoT environments; [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs): A frontend inference engine for browser and mini-apps. Furthermore, by great amounts of optimization with leading hardware in each scenario, Paddle inference engines outperform most of the other mainstream frameworks. diff --git a/cmake/coverallsGcovJsons.cmake b/cmake/coverallsGcovJsons.cmake index 4641184fcf527..4d813a0726dc0 100644 --- a/cmake/coverallsGcovJsons.cmake +++ b/cmake/coverallsGcovJsons.cmake @@ -238,7 +238,7 @@ foreach (GCOV_FILE ${GCOV_FILES}) message("MD5: ${GCOV_SRC_PATH} = ${GCOV_CONTENTS_MD5}") # Loads the gcov file as a list of lines. - # (We first open the file and replace all occurences of [] with _ + # (We first open the file and replace all occurrences of [] with _ # because CMake will fail to parse a line containing unmatched brackets... # also the \ to escaped \n in macros screws up things.) 
# https://public.kitware.com/Bug/view.php?id=15369 diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index be911eb7eaced..d5ccf1297922f 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -9,7 +9,7 @@ SET(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220411") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220425") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/cmake/flags.cmake b/cmake/flags.cmake index c1a7ba6d909e1..f9cac0579fec4 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -158,6 +158,10 @@ if(WITH_IPU) ) endif() +if(WITH_ASCEND_CL AND WITH_ARM_BRPC) + set(COMMON_FLAGS ${COMMON_FLAGS} -faligned-new) +endif() + if(NOT APPLE) if((${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) OR (WITH_ROCM)) set(COMMON_FLAGS diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc old mode 100755 new mode 100644 index ba5734208123e..31f9b26e732d1 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/collective/ProcessGroupHeter.h" +#include #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/api/include/api.h" @@ -24,6 +25,8 @@ namespace paddle { namespace distributed { using Place = paddle::platform::Place; +int ProcessGroupHeter::send_count = 0; +int ProcessGroupHeter::recv_count = 0; std::shared_ptr ProcessGroupHeter::CreateTask( int rank, CommType comm_type, const std::vector& inputs) { @@ -47,7 +50,8 @@ bool ProcessGroupHeter::HeterTask::Wait(std::chrono::milliseconds timeout) { ProcessGroupHeter::ProcessGroupHeter( const std::shared_ptr& store, int rank, int size, const platform::Place& place, int gid, int local_rank, int local_size, - int gloo_rank, int gloo_size, bool with_switch, std::string switch_endpoint) + int gloo_rank, int gloo_size, bool with_switch, std::string switch_endpoint, + int src_rank, int dst_rank) : ProcessGroup(rank, size, place, gid), store_(store), local_rank_(local_rank), @@ -55,7 +59,10 @@ ProcessGroupHeter::ProcessGroupHeter( gloo_rank_(gloo_rank), gloo_size_(gloo_size), with_switch_(with_switch), - switch_endpoint_(switch_endpoint) { + switch_endpoint_(switch_endpoint), + src_rank_(src_rank), + dst_rank_(dst_rank) { + return; #if defined(PADDLE_WITH_NCCL) inner_pg_ = std::make_shared(store, local_rank, local_size, place_, IGNORE_ID); @@ -246,5 +253,100 @@ std::shared_ptr ProcessGroupHeter::Broadcast( return CreateTask(rank_, CommType::BROADCAST, in_tensors); } +std::shared_ptr ProcessGroupHeter::Send( + std::vector& in_tensors, int peer) { +#if defined(PADDLE_WITH_NCCL) + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); +#endif + + PADDLE_ENFORCE_EQ( + in_tensors.size(), 1, + platform::errors::PreconditionNotMet( + "For each send operation, there can only be one tensor to send.")); + // Copy Tensor to cpu + auto start = std::chrono::high_resolution_clock::now(); + phi::DenseTensor cpu_tensor; + auto& gpu_tensor = in_tensors[0]; + framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor); + PADDLE_ENFORCE_EQ(with_switch_, true, + 
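/* Note (illustrative sketch, not part of this patch): the switch path below
   stages the GPU tensor on the host and ships it to the heter switch under a
   key that both peers can derive independently. A hypothetical helper
   mirroring the naming logic used in Send()/Recv():

     // key = "<group id>_id_<src*10000+dst>_<per-pair sequence number>"
     std::string MakeTensorKey(int gid, int src_rank, int dst_rank, int seq) {
       int id = src_rank * 10000 + dst_rank;
       return std::to_string(gid) + "_id_" + std::to_string(id) + "_" +
              std::to_string(seq);
     }

   Because the static send_count on the sender and recv_count on the receiver
   advance in lockstep, the n-th Send() of a rank pair is matched with its
   n-th Recv(); the src*10000+dst encoding assumes fewer than 10000 ranks. */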
platform::errors::PreconditionNotMet( + "Gloo does not support the send operation.")); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration<double> diff = end - start; + VLOG(2) << "Time to copy tensor of dims(" << cpu_tensor.dims() + << ") from gpu to cpu for send " << std::setw(9) + << " is: " << diff.count() << " s" << std::endl; + + // Send to switch + HeterClient* client_ = + HeterClient::GetInstance({switch_endpoint_}, {}, 0).get(); + int64_t tensor_size = + cpu_tensor.numel() * framework::DataTypeSize(cpu_tensor.dtype()); + std::vector<int64_t> send_size; + send_size.push_back(tensor_size); + auto id = src_rank_ * 10000 + dst_rank_; + std::string tensor_name = std::to_string(gid_) + "_id_" + std::to_string(id) + + std::string("_") + std::to_string(send_count++); + VLOG(2) << "tensor_name:" << tensor_name; + int ret = client_->Send(gid_, {tensor_name}, send_size, cpu_tensor.data(), + tensor_size); + PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( + "Send to the switch module error.")); + return CreateTask(rank_, CommType::SEND, in_tensors); +} + +std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Recv( + std::vector<phi::DenseTensor>& out_tensors, int peer) { +#if defined(PADDLE_WITH_NCCL) + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); +#endif + + PADDLE_ENFORCE_EQ( + out_tensors.size(), 1, + platform::errors::PreconditionNotMet( + "For each recv operation, there can only be one tensor to receive.")); + + // Copy Tensor to cpu + phi::DenseTensor cpu_tensor; + auto& gpu_tensor = out_tensors[0]; + cpu_tensor.Resize(gpu_tensor.dims()); + cpu_tensor.set_layout(gpu_tensor.layout()); + cpu_tensor.mutable_data(platform::CPUPlace(), gpu_tensor.dtype()); + + PADDLE_ENFORCE_EQ(with_switch_, true, + platform::errors::PreconditionNotMet( + "Gloo does not support the recv operation.")); + // recv from switch + HeterClient* client_ = + HeterClient::GetInstance({switch_endpoint_}, {}, 0).get(); + auto id = src_rank_ * 10000 + dst_rank_; + std::string tensor_name = std::to_string(gid_) + "_id_" + std::to_string(id) + + std::string("_") + std::to_string(recv_count++); + VLOG(2) << "tensor_name: " << tensor_name; + auto start = std::chrono::high_resolution_clock::now(); + int ret = client_->Recv( + gid_, {tensor_name}, cpu_tensor.data(), + cpu_tensor.numel() * framework::DataTypeSize(cpu_tensor.dtype())); + PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( + "Receive from the switch module error.")); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration<double> diff = end - start; + double goodput = cpu_tensor.numel() * + framework::DataTypeSize(cpu_tensor.dtype()) / diff.count(); + VLOG(2) << "Goodput: " << goodput << "B/s" << std::endl; + start = std::chrono::high_resolution_clock::now(); + framework::TensorCopySync(cpu_tensor, gpu_tensor.place(), &gpu_tensor); + end = std::chrono::high_resolution_clock::now(); + diff = end - start; + VLOG(2) << "Time to copy tensor of dims(" << cpu_tensor.dims() + << ") from cpu to gpu for recv " << std::setw(9) + << " is: " << diff.count() << " s" << std::endl; + return CreateTask(rank_, CommType::RECV, out_tensors); +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.h b/paddle/fluid/distributed/collective/ProcessGroupHeter.h index 640acdfb6a23b..89b0f078b4af5 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.h +++ 
b/paddle/fluid/distributed/collective/ProcessGroupHeter.h @@ -83,7 +83,8 @@ class ProcessGroupHeter : public ProcessGroup { ProcessGroupHeter(const std::shared_ptr& store, int rank, int size, const platform::Place& place, int gid, int local_rank, int local_size, int gloo_rank, int gloo_size, - bool with_switch, std::string switch_endpoints); + bool with_switch, std::string switch_endpoints, + int src_rank, int dst_rank); const std::string GetBackendName() const override { return std::string(HETER_BACKEND_NAME); @@ -97,6 +98,12 @@ class ProcessGroupHeter : public ProcessGroup { std::vector&, std::vector&, const BroadcastOptions& = BroadcastOptions()) override; + std::shared_ptr Send( + std::vector& in_tensors, int peer) override; + + std::shared_ptr Recv( + std::vector& out_tensors, int peer) override; + protected: virtual std::shared_ptr CreateTask( int rank, CommType opType, const std::vector& inputs); @@ -112,6 +119,10 @@ class ProcessGroupHeter : public ProcessGroup { int gloo_size_; bool with_switch_; std::string switch_endpoint_; + int src_rank_; + int dst_rank_; + static int send_count; + static int recv_count; }; } // namespace distributed diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 75153df936b1c..a7c3e2208ab74 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -447,10 +447,12 @@ void EagerReducer::TraverseBackwardGraph(const std::vector &outputs) { while (!queue.empty()) { egr::GradNodeBase *node = queue.front(); queue.pop(); - const std::vector> &edges = node->GetEdges(); - for (size_t i = 0; i < edges.size(); i++) { - for (size_t j = 0; j < edges[i].size(); j++) { - const egr::Edge &edge = edges[i][j]; + const paddle::small_vector, + egr::kSlotSmallVectorSize> &metas = + node->OutputMeta(); + for (size_t i = 0; i < metas.size(); i++) { + for (size_t j = 0; j < metas[i].size(); j++) { + const egr::Edge &edge = metas[i][j].GetEdge(); auto next_node_shared = edge.GetMutableGradNode(); if (!next_node_shared || !next_node_shared.get()) { continue; diff --git a/paddle/fluid/distributed/ps/service/heter_client.cc b/paddle/fluid/distributed/ps/service/heter_client.cc index 8085ef68e1cad..fd0962caaaead 100755 --- a/paddle/fluid/distributed/ps/service/heter_client.cc +++ b/paddle/fluid/distributed/ps/service/heter_client.cc @@ -23,6 +23,8 @@ DEFINE_int32(switch_send_recv_timeout_s, 600, "switch_send_recv_timeout_s"); namespace paddle { namespace distributed { std::shared_ptr HeterClient::s_instance_ = nullptr; +std::mutex HeterClient::mtx_; +std::shared_ptr HeterClient::switch_s_instance_ = nullptr; int GetMicroId(const platform::DeviceContext& ctx, const framework::Scope* scope) { diff --git a/paddle/fluid/distributed/ps/service/heter_client.h b/paddle/fluid/distributed/ps/service/heter_client.h index b9d65613399b2..36bafc943701f 100644 --- a/paddle/fluid/distributed/ps/service/heter_client.h +++ b/paddle/fluid/distributed/ps/service/heter_client.h @@ -169,16 +169,22 @@ class HeterClient { } // switch client singleton - static HeterClient& GetSwitchInstance( + static std::shared_ptr GetSwitchInstance( const std::vector& peer_endpoints, int32_t peer_role) { - static HeterClient switch_s_instance_; - if (peer_endpoints.empty()) { - VLOG(4) << "init switch client failed, null peer_endpoints"; + if (switch_s_instance_ == nullptr) { + std::unique_lock lock(mtx_); + if (peer_endpoints.empty()) { + VLOG(4) << "init switch client failed, null peer_endpoints"; + } 
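/* Note: the rewrite above replaces the function-local static with a lazily
   created, mutex-guarded singleton -- the double-checked locking shape. A
   minimal standalone sketch of the same pattern (names are illustrative):

     #include <memory>
     #include <mutex>

     struct Client {};
     std::mutex g_mtx;
     std::shared_ptr<Client> g_instance;

     std::shared_ptr<Client> GetInstance() {
       if (g_instance == nullptr) {      // fast path, no lock taken
         std::unique_lock<std::mutex> lock(g_mtx);
         if (g_instance == nullptr) {    // re-check under the lock
           g_instance.reset(new Client());
         }
       }
       return g_instance;
     }

   Strictly, the unlocked outer read races with the locked reset() on a
   non-atomic shared_ptr; std::call_once or an atomic pointer would make the
   fast path airtight. */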
+ VLOG(4) << "peer role is: " << peer_role + << ", addr is: " << peer_endpoints[0]; + if (switch_s_instance_ == nullptr) { + switch_s_instance_.reset(new HeterClient()); + switch_s_instance_->SetPeerSwitchList(peer_endpoints); + switch_s_instance_->InitClientChannels(false, peer_endpoints, + peer_role); + } } - VLOG(4) << "peer role is: " << peer_role - << ", addr is: " << peer_endpoints[0]; - switch_s_instance_.SetPeerSwitchList(peer_endpoints); - switch_s_instance_.InitClientChannels(false, peer_endpoints, peer_role); return switch_s_instance_; } @@ -230,6 +236,8 @@ class HeterClient { HeterClient(const HeterClient&); static std::shared_ptr<HeterClient> s_instance_; + static std::mutex mtx_; + static std::shared_ptr<HeterClient> switch_s_instance_; std::vector<std::shared_ptr<brpc::Channel>> xpu_channels_; std::vector<std::shared_ptr<brpc::Channel>> previous_xpu_channels_; diff --git a/paddle/fluid/distributed/ps/service/heter_server.h b/paddle/fluid/distributed/ps/service/heter_server.h old mode 100644 new mode 100755 index a65470cdbad5c..ddcf36bf68d7b --- a/paddle/fluid/distributed/ps/service/heter_server.h +++ b/paddle/fluid/distributed/ps/service/heter_server.h @@ -144,31 +144,41 @@ class SendAndRecvVariableHandler final : public ServiceHandlerBase { brpc::Controller* cntl); void WaitForVarsConsumed(int32_t group_id, const std::string& var_name) { - timeline_.Start(); + // timeline_.Start(); while (true) { - if (vars_ready_flag[group_id][var_name] == 0) { - break; + { + std::lock_guard<std::mutex> lock(scope_mutex_); + if (vars_ready_flag[group_id][var_name] == 0) { + break; + } } + /* timeline_.Pause(); if (timeline_.ElapsedSec() > FLAGS_switch_send_recv_timeout_s) { VLOG(0) << "vars not consumed exceed 10 minutes"; break; } + */ } return; } void WaitForVarsProduced(int32_t group_id, const std::string& var_name) { - timeline_.Start(); + // timeline_.Start(); while (true) { - if (vars_ready_flag[group_id][var_name] == 1) { - break; + { + std::lock_guard<std::mutex> lock(scope_mutex_); + if (vars_ready_flag[group_id][var_name] == 1) { + break; + } } + /* timeline_.Pause(); if (timeline_.ElapsedSec() > FLAGS_switch_send_recv_timeout_s) { VLOG(0) << "vars not produced exceed 10 minutes"; break; } + */ } return; } @@ -379,12 +389,12 @@ class HeterService : public PsService { ::google::protobuf::Closure* done) { VLOG(4) << "entering SendToSwitch"; brpc::ClosureGuard done_guard(done); - auto& switch_client_ptr_ = + std::shared_ptr<HeterClient> switch_client_ptr_ = HeterClient::GetSwitchInstance(peer_endpoints_, PEER_ROLE_IS_SWITCH); - if (switch_client_ptr_.peer_switch_channels_.empty()) { - LOG(ERROR) << "switch_client_ptr_.peer_switch_channels_ null"; + if (switch_client_ptr_->peer_switch_channels_.empty()) { + LOG(ERROR) << "switch_client_ptr_->peer_switch_channels_ null"; } - brpc::Channel* channel = switch_client_ptr_.peer_switch_channels_[0].get(); + brpc::Channel* channel = switch_client_ptr_->peer_switch_channels_[0].get(); brpc::Controller* cntl = static_cast<brpc::Controller*>(controller); // proxy: create a new OnHeterRpcDone object (or reset one inside the OnHeterRpcDone class) OnHeterRpcDone* closure2 = new OnHeterRpcDone([](void* done) { @@ -414,6 +424,7 @@ class HeterService : public PsService { std_cntl.response_attachment().movable()); fut.wait(); VLOG(4) << "SendToSwitch done"; + delete closure2; } void SendS2S(::google::protobuf::RpcController* controller, @@ -446,11 +457,11 @@ class HeterService : public PsService { brpc::ClosureGuard done_guard(done); brpc::Controller* cntl = static_cast<brpc::Controller*>(controller); VLOG(4) << "SendToWorker(client addr) =" << cntl->remote_side(); - auto& switch_client_ptr_ = 
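/* Note: WaitForVarsConsumed/WaitForVarsProduced above now poll
   vars_ready_flag, re-taking scope_mutex_ on every spin, and with the
   timeline_ check commented out a flag that is never flipped blocks forever
   rather than giving up after FLAGS_switch_send_recv_timeout_s. A sketch of
   a blocking alternative with a condition variable (hypothetical names, not
   what this patch does -- shown only to contrast with polling):

     #include <condition_variable>
     #include <mutex>

     std::mutex m;
     std::condition_variable cv;
     int ready_flag = 1;  // 1 == produced, 0 == consumed

     void WaitForConsumed() {
       std::unique_lock<std::mutex> lk(m);
       cv.wait(lk, [] { return ready_flag == 0; });
     }

     void MarkConsumed() {
       { std::lock_guard<std::mutex> lk(m); ready_flag = 0; }
       cv.notify_all();
     } */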
HeterClient::GetSwitchInstance(peer_endpoints_, PEER_ROLE_IS_WORKER); VLOG(4) << "in switch client, peer worker 0: " - << switch_client_ptr_.peer_worker_list_[0]; - brpc::Channel* channel = switch_client_ptr_.peer_worker_channels_[0].get(); + << switch_client_ptr_->peer_worker_list_[0]; + brpc::Channel* channel = switch_client_ptr_->peer_worker_channels_[0].get(); auto* closure = reinterpret_cast(done); PsService_Stub stub(channel); diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index a9cd0021c8578..a3fa80b3865e4 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -28,6 +28,22 @@ namespace paddle { namespace distributed { #ifdef PADDLE_WITH_HETERPS +int32_t GraphTable::Load_to_ssd(const std::string &path, + const std::string ¶m) { + bool load_edge = (param[0] == 'e'); + bool load_node = (param[0] == 'n'); + if (load_edge) { + bool reverse_edge = (param[1] == '<'); + std::string edge_type = param.substr(2); + return this->load_edges_to_ssd(path, reverse_edge, edge_type); + } + if (load_node) { + std::string node_type = param.substr(1); + return this->load_nodes(path, node_type); + } + return 0; +} + paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( int idx, std::vector ids) { std::vector> bags(task_pool_size_); @@ -38,11 +54,11 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( std::vector> tasks; std::vector edge_array[task_pool_size_]; std::vector node_array[task_pool_size_]; - for (int i = 0; i < (int)bags.size(); i++) { + for (size_t i = 0; i < bags.size(); i++) { if (bags[i].size() > 0) { tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int { paddle::framework::GpuPsGraphNode x; - for (int j = 0; j < (int)bags[i].size(); j++) { + for (size_t j = 0; j < bags[i].size(); j++) { Node *v = find_node(0, idx, bags[i][j]); x.node_id = bags[i][j]; if (v == NULL) { @@ -53,7 +69,7 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( x.neighbor_size = v->get_neighbor_size(); x.neighbor_offset = edge_array[i].size(); node_array[i].push_back(x); - for (int k = 0; k < x.neighbor_size; k++) { + for (size_t k = 0; k < x.neighbor_size; k++) { edge_array[i].push_back(v->get_neighbor_id(k)); } } @@ -64,27 +80,29 @@ paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( } for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get(); paddle::framework::GpuPsCommGraph res; - int tot_len = 0; + unsigned int tot_len = 0; for (int i = 0; i < task_pool_size_; i++) { - tot_len += (int)edge_array[i].size(); - } - res.neighbor_size = tot_len; - res.node_size = ids.size(); - res.neighbor_list = new int64_t[tot_len]; - res.node_list = new paddle::framework::GpuPsGraphNode[ids.size()]; - int offset = 0, ind = 0; + tot_len += edge_array[i].size(); + } + // res.neighbor_size = tot_len; + // res.node_size = ids.size(); + // res.neighbor_list = new int64_t[tot_len]; + // res.node_list = new paddle::framework::GpuPsGraphNode[ids.size()]; + res.init_on_cpu(tot_len, (unsigned int)ids.size()); + unsigned int offset = 0, ind = 0; for (int i = 0; i < task_pool_size_; i++) { for (int j = 0; j < (int)node_array[i].size(); j++) { res.node_list[ind] = node_array[i][j]; res.node_list[ind++].neighbor_offset += offset; } - for (int j = 0; j < (int)edge_array[i].size(); j++) { + for (size_t j = 0; j < edge_array[i].size(); j++) { res.neighbor_list[offset + j] = edge_array[i][j]; } offset += 
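/* Note: make_gpu_ps_graph above flattens the per-shard adjacency lists into
   one CSR-style pair of arrays: node_list[i].neighbor_offset indexes into
   neighbor_list and neighbor_size gives the run length. A hypothetical
   two-node instance of the invariant being maintained:

     // node 7 has neighbors {1, 2}; node 9 has neighbor {3}
     // node_list:     [{id:7, size:2, offset:0}, {id:9, size:1, offset:2}]
     // neighbor_list: [1, 2, 3]

   The "neighbor_offset += offset" fix-up a few lines up shifts each shard's
   local offsets by the number of neighbors already emitted for earlier
   shards, which is exactly what the running offset advanced here tracks. */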
edge_array[i].size(); } return res; } + int32_t GraphTable::add_node_to_ssd(int type_id, int idx, int64_t src_id, char *data, int len) { if (_db != NULL) { @@ -92,8 +110,31 @@ int32_t GraphTable::add_node_to_ssd(int type_id, int idx, int64_t src_id, memcpy(ch, &type_id, sizeof(int)); memcpy(ch + sizeof(int), &idx, sizeof(int)); memcpy(ch + sizeof(int) * 2, &src_id, sizeof(int64_t)); - _db->put(src_id % shard_num % task_pool_size_, ch, - sizeof(int) * 2 + sizeof(int64_t), (char *)data, len); + std::string str; + if (_db->get(src_id % shard_num % task_pool_size_, ch, + sizeof(int) * 2 + sizeof(int64_t), str) == 0) { + int64_t *stored_data = ((int64_t *)str.c_str()); + int n = str.size() / sizeof(int64_t); + char *new_data = new char[n * sizeof(int64_t) + len]; + memcpy(new_data, stored_data, n * sizeof(int64_t)); + memcpy(new_data + n * sizeof(int64_t), data, len); + _db->put(src_id % shard_num % task_pool_size_, ch, + sizeof(int) * 2 + sizeof(int64_t), (char *)new_data, + n * sizeof(int64_t) + len); + delete[] new_data; + } else { + _db->put(src_id % shard_num % task_pool_size_, ch, + sizeof(int) * 2 + sizeof(int64_t), (char *)data, len); + } + _db->flush(src_id % shard_num % task_pool_size_); + std::string x; + // if (_db->get(src_id % shard_num % task_pool_size_, ch, sizeof(int64_t) + + // 2 * sizeof(int), x) ==0){ + // VLOG(0)<<"put result"; + // for(int i = 0;i < x.size();i+=8){ + // VLOG(0)<<"get an id "<<*((int64_t *)(x.c_str() + i)); + // } + //} } return 0; } @@ -109,8 +150,8 @@ char *GraphTable::random_sample_neighbor_from_ssd( memset(ch, 0, sizeof(int)); memcpy(ch + sizeof(int), &idx, sizeof(int)); memcpy(ch + sizeof(int) * 2, &id, sizeof(int64_t)); - if (_db->get(id % shard_num % task_pool_size_, ch, sizeof(uint64_t), str) == - 0) { + if (_db->get(id % shard_num % task_pool_size_, ch, + sizeof(int) * 2 + sizeof(int64_t), str) == 0) { int64_t *data = ((int64_t *)str.c_str()); int n = str.size() / sizeof(int64_t); std::unordered_map m; @@ -142,7 +183,298 @@ char *GraphTable::random_sample_neighbor_from_ssd( actual_size = 0; return NULL; } + +int64_t GraphTable::load_graph_to_memory_from_ssd(int idx, + std::vector &ids) { + std::vector> bags(task_pool_size_); + for (auto x : ids) { + int location = x % shard_num % task_pool_size_; + bags[location].push_back(x); + } + std::vector> tasks; + std::vector count(task_pool_size_, 0); + for (size_t i = 0; i < bags.size(); i++) { + if (bags[i].size() > 0) { + tasks.push_back(_shards_task_pool[i]->enqueue([&, i, idx, this]() -> int { + + char ch[sizeof(int) * 2 + sizeof(int64_t)]; + memset(ch, 0, sizeof(int)); + memcpy(ch + sizeof(int), &idx, sizeof(int)); + for (size_t k = 0; k < bags[i].size(); k++) { + auto v = bags[i][k]; + memcpy(ch + sizeof(int) * 2, &v, sizeof(int64_t)); + std::string str; + if (_db->get(i, ch, sizeof(int) * 2 + sizeof(int64_t), str) == 0) { + count[i] += (int64_t)str.size(); + for (int j = 0; j < str.size(); j += sizeof(int64_t)) { + int64_t id = *(int64_t *)(str.c_str() + j); + add_comm_edge(idx, v, id); + } + } + } + return 0; + })); + } + } + + for (int i = 0; i < (int)tasks.size(); i++) tasks[i].get(); + int64_t tot = 0; + for (auto x : count) tot += x; + return tot; +} + +void GraphTable::make_partitions(int idx, int64_t byte_size, int device_len) { + VLOG(2) << "start to make graph partitions , byte_size = " << byte_size + << " total memory cost = " << total_memory_cost; + if (total_memory_cost == 0) { + VLOG(0) << "no edges are detected,make partitions exits"; + return; + } + const float a = 2.0, y = 1.25; + 
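/* Note: the assignment loop below is a greedy streaming partitioner. Each
   node goes to the partition maximizing (neighbors already co-located)
   minus a*y*base^(y-1), where base is the byte load the partition already
   carries -- the marginal cost of the convex load term a*base^y. With
   a = 2.0 and y = 1.25 this resembles the Fennel family of one-pass graph
   partitioning heuristics. A worked instance of the penalty (my arithmetic,
   not from the patch):

     // base = 1 GiB = 2^30 bytes, a = 2.0, y = 1.25:
     // penalty = 2.0 * 1.25 * (2^30)^0.25 ~= 2.5 * 181 ~= 453
     // so a 1 GiB-full partition must offer ~453 more co-located
     // neighbors than an empty one before it wins the argmax.

   Partitions that cannot fit the node's neighbor list are knocked out with
   score[i] = -100000. */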
int64_t gb_size_by_discount = byte_size * 0.8 * device_len; + if (gb_size_by_discount <= 0) gb_size_by_discount = 1; + int part_len = total_memory_cost / gb_size_by_discount; + if (part_len == 0) part_len = 1; + + VLOG(2) << "part_len = " << part_len + << " byte size = " << gb_size_by_discount; + partitions[idx].clear(); + partitions[idx].resize(part_len); + std::vector memory_remaining(part_len, gb_size_by_discount); + std::vector score(part_len, 0); + std::unordered_map id_map; + std::vector iters; + for (int i = 0; i < task_pool_size_; i++) { + iters.push_back(_db->get_iterator(i)); + iters[i]->SeekToFirst(); + } + int next = 0; + while (iters.size()) { + if (next >= iters.size()) { + next = 0; + } + if (!iters[next]->Valid()) { + iters.erase(iters.begin() + next); + continue; + } + std::string key = iters[next]->key().ToString(); + int temp_idx = *(int *)(key.c_str() + sizeof(int)); + if (temp_idx != idx) { + iters[next]->Next(); + next++; + continue; + } + std::string value = iters[next]->value().ToString(); + std::int64_t i_key = *(int64_t *)(key.c_str() + 8); + for (int i = 0; i < part_len; i++) { + if (memory_remaining[i] < (int64_t)value.size()) { + score[i] = -100000.0; + } else { + score[i] = 0; + } + } + for (int j = 0; j < value.size(); j += sizeof(int64_t)) { + int64_t v = *((int64_t *)(value.c_str() + j)); + int index = -1; + if (id_map.find(v) != id_map.end()) { + index = id_map[v]; + score[index]++; + } + } + float base; + int index = 0; + for (int i = 0; i < part_len; i++) { + base = gb_size_by_discount - memory_remaining[i]; + score[i] -= a * y * std::pow(1.0 * base, y - 1); + if (score[i] > score[index]) index = i; + VLOG(2) << "score" << i << " = " << score[i] << " memory left " + << memory_remaining[i]; + } + id_map[i_key] = index; + partitions[idx][index].push_back(i_key); + memory_remaining[index] -= (int64_t)value.size(); + iters[next]->Next(); + next++; + } + for (int i = 0; i < part_len; i++) { + if (partitions[idx][i].size() == 0) { + partitions[idx].erase(partitions[idx].begin() + i); + i--; + part_len--; + continue; + } + VLOG(2) << " partition " << i << " size = " << partitions[idx][i].size(); + for (auto x : partitions[idx][i]) { + VLOG(2) << "find a id " << x; + } + } + next_partition = 0; +} + +void GraphTable::clear_graph(int idx) { + for (auto p : edge_shards[idx]) { + delete p; + } + + edge_shards[idx].clear(); + for (size_t i = 0; i < shard_num_per_server; i++) { + edge_shards[idx].push_back(new GraphShard()); + } +} +int32_t GraphTable::load_next_partition(int idx) { + if (next_partition >= partitions[idx].size()) { + VLOG(0) << "partition iteration is done"; + return -1; + } + clear_graph(idx); + load_graph_to_memory_from_ssd(idx, partitions[idx][next_partition]); + next_partition++; + return 0; +} +int32_t GraphTable::load_edges_to_ssd(const std::string &path, + bool reverse_edge, + const std::string &edge_type) { + int idx = 0; + if (edge_type == "") { + VLOG(0) << "edge_type not specified, loading edges to " << id_to_edge[0] + << " part"; + } else { + if (edge_to_id.find(edge_type) == edge_to_id.end()) { + VLOG(0) << "edge_type " << edge_type + << " is not defined, nothing will be loaded"; + return 0; + } + idx = edge_to_id[edge_type]; + } + total_memory_cost = 0; + auto paths = paddle::string::split_string(path, ";"); + int64_t count = 0; + std::string sample_type = "random"; + bool is_weighted = false; + int valid_count = 0; + for (auto path : paths) { + std::ifstream file(path); + std::string line; + while (std::getline(file, line)) { + VLOG(0) 
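/* Note: the keys parsed above are the fixed 16-byte records that
   add_node_to_ssd writes: [int type_id][int idx][int64_t src_id], so
   key.c_str() + 4 holds the edge-type slot and key.c_str() + 8 the node id,
   while the value is the packed int64_t neighbor array. The layout, as the
   memcpy code builds it (assuming 4-byte int):

     char ch[sizeof(int) * 2 + sizeof(int64_t)];               // 16 bytes
     memcpy(ch, &type_id, sizeof(int));                        // bytes 0..3
     memcpy(ch + sizeof(int), &idx, sizeof(int));              // bytes 4..7
     memcpy(ch + sizeof(int) * 2, &src_id, sizeof(int64_t));   // bytes 8..15

   This is also why random_sample_neighbor_from_ssd was fixed earlier in the
   patch to pass the full sizeof(int) * 2 + sizeof(int64_t) key length
   instead of sizeof(uint64_t). */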
<< "get a line from file " << line; + auto values = paddle::string::split_string(line, "\t"); + count++; + if (values.size() < 2) continue; + auto src_id = std::stoll(values[0]); + auto dist_ids = paddle::string::split_string(values[1], ";"); + std::vector dist_data; + for (auto x : dist_ids) { + dist_data.push_back(std::stoll(x)); + total_memory_cost += sizeof(int64_t); + } + add_node_to_ssd(0, idx, src_id, (char *)dist_data.data(), + (int)(dist_data.size() * sizeof(int64_t))); + } + } + VLOG(0) << "total memory cost = " << total_memory_cost << " bytes"; + return 0; +} + +int32_t GraphTable::dump_edges_to_ssd(int idx) { + VLOG(0) << "calling dump edges to ssd"; + const int64_t fixed_size = 10000; + // std::vector edge_array[task_pool_size_]; + std::vector> count(task_pool_size_); + std::vector> tasks; + auto &shards = edge_shards[idx]; + for (size_t i = 0; i < shards.size(); ++i) { + tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( + [&, i, this]() -> int64_t { + int64_t cost = 0; + std::vector &v = shards[i]->get_bucket(); + std::vector s; + size_t ind = i % this->task_pool_size_; + for (size_t j = 0; j < v.size(); j++) { + for (int k = 0; k < v[j]->get_neighbor_size(); k++) { + s.push_back(v[j]->get_neighbor_id(k)); + } + cost += v[j]->get_neighbor_size() * sizeof(int64_t); + add_node_to_ssd(0, idx, v[j]->get_id(), (char *)s.data(), + s.size() * sizeof(int64_t)); + } + return cost; + })); + } + for (size_t i = 0; i < tasks.size(); i++) total_memory_cost += tasks[i].get(); + return 0; +} +int32_t GraphTable::make_complementary_graph(int idx, int64_t byte_size) { + VLOG(0) << "make_complementary_graph"; + const int64_t fixed_size = 10000; + // std::vector edge_array[task_pool_size_]; + std::vector> count(task_pool_size_); + std::vector> tasks; + auto &shards = edge_shards[idx]; + for (size_t i = 0; i < shards.size(); ++i) { + tasks.push_back( + _shards_task_pool[i % task_pool_size_]->enqueue([&, i, this]() -> int { + std::vector &v = shards[i]->get_bucket(); + size_t ind = i % this->task_pool_size_; + for (size_t j = 0; j < v.size(); j++) { + size_t location = v[j]->get_id(); + for (int k = 0; k < v[j]->get_neighbor_size(); k++) { + count[ind][v[j]->get_neighbor_id(k)]++; + } + } + return 0; + })); + } + + std::unordered_map final_count; + std::map> count_to_id; + std::vector buffer; + for (auto p : edge_shards[idx]) { + delete p; + } + + edge_shards[idx].clear(); + for (size_t i = 0; i < shard_num_per_server; i++) { + edge_shards[idx].push_back(new GraphShard()); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + for (int i = 0; i < task_pool_size_; i++) { + for (auto &p : count[i]) { + final_count[p.first] = final_count[p.first] + p.second; + } + count[i].clear(); + } + for (auto &p : final_count) { + count_to_id[p.second].push_back(p.first); + VLOG(2) << p.first << " appear " << p.second << " times"; + } + // std::map>::iterator iter= count_to_id.rbegin(); + auto iter = count_to_id.rbegin(); + while (iter != count_to_id.rend() && byte_size > 0) { + for (auto x : iter->second) { + buffer.push_back(x); + if (buffer.size() >= fixed_size) { + int64_t res = load_graph_to_memory_from_ssd(idx, buffer); + byte_size -= res; + } + if (byte_size <= 0) break; + } + iter++; + } + if (byte_size > 0 && buffer.size() > 0) { + int64_t res = load_graph_to_memory_from_ssd(idx, buffer); + byte_size -= res; + } + std::string sample_type = "random"; + for (auto &shard : edge_shards[idx]) { + auto bucket = shard->get_bucket(); + for (size_t i = 0; i < bucket.size(); i++) { + 
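/* Note: make_complementary_graph above ranks candidate nodes by how often
   they appear as neighbors (count_to_id is an ordered std::map keyed by
   frequency, walked hottest-first via rbegin()), then reloads the hottest
   ids from SSD in fixed_size batches until byte_size is spent. The shape of
   that reverse walk (illustrative types):

     std::map<int, std::vector<int64_t>> count_to_id;  // freq -> node ids
     for (auto it = count_to_id.rbegin(); it != count_to_id.rend(); ++it) {
       for (int64_t id : it->second) {
         // consume the most-referenced ids first
       }
     }

   buffer is never cleared between flushes, so earlier ids are resubmitted
   with each batch -- worth double-checking whether that is intended.
   Afterwards every surviving shard rebuilds its neighbor sampler, which is
   what the surrounding loop here does. */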
bucket[i]->build_sampler(sample_type); + } + } + return 0; +} #endif + /* int CompleteGraphSampler::run_graph_sampling() { pthread_rwlock_t *rw_lock = graph_table->rw_lock.get(); @@ -700,9 +1032,11 @@ int32_t GraphTable::build_sampler(int idx, std::string sample_type) { } int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge, const std::string &edge_type) { - // #ifdef PADDLE_WITH_HETERPS - // if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); - // #endif +#ifdef PADDLE_WITH_HETERPS + // if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); + if (search_level == 2) total_memory_cost = 0; + const int64_t fixed_load_edges = 1000000; +#endif int idx = 0; if (edge_type == "") { VLOG(0) << "edge_type not specified, loading edges to " << id_to_edge[0] @@ -715,6 +1049,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge, } idx = edge_to_id[edge_type]; } + auto paths = paddle::string::split_string(path, ";"); int64_t count = 0; std::string sample_type = "random"; @@ -756,13 +1091,33 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge, edge_shards[idx][index]->add_graph_node(src_id)->build_edges(is_weighted); edge_shards[idx][index]->add_neighbor(src_id, dst_id, weight); valid_count++; +#ifdef PADDLE_WITH_HETERPS + // if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); + if (count > fixed_load_edges && search_level == 2) { + dump_edges_to_ssd(idx); + VLOG(0) << "dumping edges to ssd, edge count is reset to 0"; + clear_graph(idx); + count = 0; + } +#endif } } VLOG(0) << valid_count << "/" << count << " edges are loaded successfully in " << path; - // Build Sampler j - +// Build Sampler j +#ifdef PADDLE_WITH_HETERPS + // if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); + if (search_level == 2) { + if (count > 0) { + dump_edges_to_ssd(idx); + VLOG(0) << "dumping edges to ssd, edge count is reset to 0"; + clear_graph(idx); + count = 0; + } + return 0; + } +#endif for (auto &shard : edge_shards[idx]) { auto bucket = shard->get_bucket(); for (size_t i = 0; i < bucket.size(); i++) { @@ -892,7 +1247,6 @@ int32_t GraphTable::random_sample_neighbors( scaled_lru->query(i, id_list[i].data(), id_list[i].size(), r); } int index = 0; - uint32_t idx; std::vector sample_res; std::vector sample_keys; auto &rng = _shards_task_rng_pool[i]; @@ -911,6 +1265,7 @@ int32_t GraphTable::random_sample_neighbors( if (node == nullptr) { #ifdef PADDLE_WITH_HETERPS if (search_level == 2) { + VLOG(2) << "enter sample from ssd"; char *buffer_addr = random_sample_neighbor_from_ssd( idx, node_id, sample_size, rng, actual_size); if (actual_size != 0) { @@ -1060,6 +1415,26 @@ std::pair GraphTable::parse_feature( return std::make_pair(-1, ""); } +std::vector> GraphTable::get_all_id(int type_id, int idx, + int slice_num) { + std::vector> res(slice_num); + auto &search_shards = type_id == 0 ? 
edge_shards[idx] : feature_shards[idx]; + std::vector>> tasks; + for (int i = 0; i < search_shards.size(); i++) { + tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( + [&search_shards, i]() -> std::vector { + return search_shards[i]->get_all_id(); + })); + } + for (size_t i = 0; i < tasks.size(); ++i) { + tasks[i].wait(); + } + for (size_t i = 0; i < tasks.size(); i++) { + auto ids = tasks[i].get(); + for (auto &id : ids) res[id % slice_num].push_back(id); + } + return res; +} int32_t GraphTable::pull_graph_list(int type_id, int idx, int start, int total_size, std::unique_ptr &buffer, @@ -1218,6 +1593,9 @@ int32_t GraphTable::Initialize(const GraphParameter &graph) { VLOG(0) << "in init graph table shard idx = " << _shard_idx << " shard_start " << shard_start << " shard_end " << shard_end; edge_shards.resize(id_to_edge.size()); +#ifdef PADDLE_WITH_HETERPS + partitions.resize(id_to_edge.size()); +#endif for (int k = 0; k < (int)edge_shards.size(); k++) { for (size_t i = 0; i < shard_num_per_server; i++) { edge_shards[k].push_back(new GraphShard()); diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index 059bcb09a0a6e..2d869dc805a94 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -63,7 +63,13 @@ class GraphShard { } return res; } - + std::vector get_all_id() { + std::vector res; + for (int i = 0; i < (int)bucket.size(); i++) { + res.push_back(bucket[i]->get_id()); + } + return res; + } GraphNode *add_graph_node(int64_t id); GraphNode *add_graph_node(Node *node); FeatureNode *add_feature_node(int64_t id); @@ -420,6 +426,10 @@ class GraphTable : public Table { use_cache = false; shard_num = 0; rw_lock.reset(new pthread_rwlock_t()); +#ifdef PADDLE_WITH_HETERPS + next_partition = 0; + total_memory_cost = 0; +#endif } virtual ~GraphTable(); @@ -465,6 +475,8 @@ class GraphTable : public Table { int32_t load_edges(const std::string &path, bool reverse, const std::string &edge_type); + std::vector> get_all_id(int type, int idx, + int slice_num); int32_t load_nodes(const std::string &path, std::string node_type); int32_t add_graph_node(int idx, std::vector &id_list, @@ -513,7 +525,7 @@ class GraphTable : public Table { const std::vector> &res); size_t get_server_num() { return server_num; } - + void clear_graph(int idx); virtual int32_t make_neighbor_sample_cache(size_t size_limit, size_t ttl) { { std::unique_lock lock(mutex_); @@ -538,6 +550,7 @@ class GraphTable : public Table { // graph_sampler->set_graph_sample_callback(callback); // return 0; // } + virtual void make_partitions(int idx, int64_t gb_size, int device_len); virtual char *random_sample_neighbor_from_ssd( int idx, int64_t id, int sample_size, const std::shared_ptr rng, int &actual_size); @@ -545,8 +558,25 @@ class GraphTable : public Table { char *data, int len); virtual paddle::framework::GpuPsCommGraph make_gpu_ps_graph( int idx, std::vector ids); + int32_t Load_to_ssd(const std::string &path, const std::string ¶m); + int64_t load_graph_to_memory_from_ssd(int idx, std::vector &ids); + int32_t make_complementary_graph(int idx, int64_t byte_size); + int32_t dump_edges_to_ssd(int idx); + int32_t get_partition_num(int idx) { return partitions[idx].size(); } + std::vector get_partition(int idx, int index) { + if (idx >= partitions.size() || index >= partitions[idx].size()) + return std::vector(); + return partitions[idx][index]; + } + int32_t load_edges_to_ssd(const 
std::string &path, bool reverse_edge, + const std::string &edge_type); + int32_t load_next_partition(int idx); + void set_search_level(int search_level) { this->search_level = search_level; } // virtual GraphSampler *get_graph_sampler() { return graph_sampler.get(); } int search_level; + int64_t total_memory_cost; + std::vector>> partitions; + int next_partition; #endif virtual int32_t add_comm_edge(int idx, int64_t src_id, int64_t dst_id); virtual int32_t build_sampler(int idx, std::string sample_type = "random"); diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.h b/paddle/fluid/distributed/ps/table/memory_sparse_table.h index ec86239ffb161..6516c75a5d696 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.h @@ -94,9 +94,9 @@ class MemorySparseTable : public Table { protected: const int _task_pool_size = 24; - size_t _avg_local_shard_num; - size_t _real_local_shard_num; - size_t _sparse_table_shard_num; + int _avg_local_shard_num; + int _real_local_shard_num; + int _sparse_table_shard_num; std::vector> _shards_task_pool; std::unique_ptr _local_shards; }; diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 802c28d7d374e..08e8f2baef6a0 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -38,10 +38,13 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, } } -std::vector> GradNodeAccumulation:: -operator()( - std::vector>& grads, // NOLINT - bool create_graph, bool is_new_grad) { +paddle::small_vector, + kSlotSmallVectorSize> +GradNodeAccumulation::operator()( + paddle::small_vector, + kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph, + bool is_new_grad) { VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation"; PADDLE_ENFORCE(grads.size() == 1, paddle::platform::errors::Fatal( @@ -56,8 +59,9 @@ operator()( // Apply Gradient Hooks paddle::experimental::Tensor grad_out; if (GradientHooksRegistered()) { - std::vector> hooked_grads = - ApplyGradientHooks(grads); + paddle::small_vector, + kSlotSmallVectorSize> + hooked_grads = ApplyGradientHooks(grads); grad_out = hooked_grads[0][0]; } else { grad_out = grads[0][0]; diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index dbf518252e084..f37de9c8e88f1 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -37,9 +37,12 @@ class GradNodeAccumulation : public GradNodeBase { } // Functor: perform backward computations - virtual std::vector> operator()( - std::vector>& grads, // NOLINT - bool create_graph = false, bool is_new_grad = false) override; + virtual paddle::small_vector, + kSlotSmallVectorSize> + operator()(paddle::small_vector, + kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) override; void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } diff --git a/paddle/fluid/eager/amp_utils.h b/paddle/fluid/eager/amp_utils.h index 95313bde02a20..2145f4a11965c 100644 --- a/paddle/fluid/eager/amp_utils.h +++ b/paddle/fluid/eager/amp_utils.h @@ -21,8 +21,8 @@ namespace egr { static inline paddle::experimental::DataType GetPromoteType( const std::string& op_name, - const std::vector>& - amp_tensors_vector, + const paddle::small_vector, + kSlotSmallVectorSize>& 
amp_tensors_vector, const paddle::experimental::DataType& amp_dtype) { auto dst_type = amp_dtype; if (egr::Controller::Instance().GetCurrentTracer()->GetAmpDtype() == @@ -86,8 +86,8 @@ static inline paddle::experimental::DataType GetPromoteType( inline paddle::experimental::DataType GetAmpDestDtype( const std::string& op_name, - const std::vector<std::vector<paddle::experimental::Tensor>>& - amp_tensors_vector) { + const paddle::small_vector<std::vector<paddle::experimental::Tensor>, + kSlotSmallVectorSize>& amp_tensors_vector) { auto amp_dtype = egr::Controller::Instance().GetCurrentTracer()->GetAmpDtype(); auto amp_level = egr::Controller::Instance().GetAMPLevel(); diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 18678b774cbd2..8bd40140f53cc 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -144,11 +144,15 @@ void GradNodeScale::SetTensorWrappers_X( void GradNodeScale::SetAttributes_scale(float scale) { scale_ = scale; } -std::vector<std::vector<paddle::experimental::Tensor>> GradNodeScale:: -operator()( - std::vector<std::vector<paddle::experimental::Tensor>>& grads, // NOLINT - bool create_graph, bool is_new_grad) { +paddle::small_vector<std::vector<paddle::experimental::Tensor>, + kSlotSmallVectorSize> +GradNodeScale::operator()( + paddle::small_vector<std::vector<paddle::experimental::Tensor>, + kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph, + bool is_new_grad) { // 1. Check Output Size + VLOG(6) << "grad size is: " << grads.size(); PADDLE_ENFORCE( ((grads.size() == 1) && (grads[0].size() == 1)), paddle::platform::errors::Fatal( "ScaleGradNode takes exactly 1 grad tensor." "However received: %d", "This indicates an issue with Eager Dygraph Backward logic", grads.size())); - std::vector<std::vector<paddle::experimental::Tensor>> outs; + paddle::small_vector<std::vector<paddle::experimental::Tensor>, + kSlotSmallVectorSize> + outs; // 2. Create needed out pattern paddle::experimental::Tensor out; // Apply Gradient Hooks if (GradientHooksRegistered()) { // TODO(jiabin): Shall we apply hook slot by slot here or accept // vector<vector<paddle::experimental::Tensor>> to apply all hooks? 
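/* Note: this patch swaps std::vector<std::vector<Tensor>> for
   paddle::small_vector<std::vector<Tensor>, kSlotSmallVectorSize> across the
   grad-node interface. Assuming the llvm::SmallVector-style container that
   paddle/utils/small_vector.h provides, up to kSlotSmallVectorSize slots
   (15, per the global_utils.h hunk below) live in inline storage, so the
   per-op hop through operator() avoids a heap allocation in the common case
   of few input/output slots:

     paddle::small_vector<std::vector<int>, 4> slots;
     slots.push_back({1, 2});  // still in the inline buffer; no malloc yet

   Functionally it behaves as a drop-in vector replacement, which is why the
   bodies below change only their declared types. */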
- std::vector<std::vector<paddle::experimental::Tensor>> hooked_grads = - ApplyGradientHooks(grads); + paddle::small_vector<std::vector<paddle::experimental::Tensor>, + kSlotSmallVectorSize> + hooked_grads = ApplyGradientHooks(grads); ScaleAPI(/* slot by slot set */ hooked_grads[0][0], scale_, 0.0 /* bias */, true /* bias_after_scale */, &out); } else { diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h index cd4c0c5ac682d..04ff510944dd2 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h @@ -38,9 +38,12 @@ class GradNodeScale : public GradNodeBase { ~GradNodeScale() override = default; // Functor: perform backward computations - virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()( - std::vector<std::vector<paddle::experimental::Tensor>>& grads, // NOLINT - bool create_graph = false, bool is_new_grad = false) override; + virtual paddle::small_vector<std::vector<paddle::experimental::Tensor>, + kSlotSmallVectorSize> + operator()(paddle::small_vector<std::vector<paddle::experimental::Tensor>, + kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) override; void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } @@ -48,7 +51,7 @@ const std::vector<paddle::experimental::Tensor>& tensors); void SetAttributes_scale(float scale); - std::string name() override { return ""; } + std::string name() override { return "scale node"; } // Members: define fwd input tensors // For Scale there is no fwd input tensor needed diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc index 1be3b31de00a6..7a374d567d5d0 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc @@ -79,9 +79,6 @@ paddle::experimental::Tensor scale(const paddle::experimental::Tensor& x, // Pass Attributes to GradNode scale_node->SetAttributes_scale(scale); - // Set Next Edges - scale_node->AddEdges(p_autograd_in, /*slot id*/ 0); - // Set TensorWrappers scale_node->SetTensorWrappers_X({x}); diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h index adfcab961bfe5..44e78c3bbf193 100644 --- a/paddle/fluid/eager/api/utils/global_utils.h +++ b/paddle/fluid/eager/api/utils/global_utils.h @@ -19,8 +19,9 @@ #include #include "paddle/fluid/imperative/tracer.h" #include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/utils/small_vector.h" namespace egr { - +constexpr size_t kSlotSmallVectorSize = 15U; class UniqueNameGenerator { public: explicit UniqueNameGenerator(std::string prefix = "") : prefix_(prefix) {} diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 39559a2d901f6..44fa8461f2fe9 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -56,6 +56,13 @@ static std::unordered_set<std::string> black_ops_list = {"run_program"}; static std::string LegalizeVariableName(const std::string& var_name) { std::string ret = var_name; std::replace(ret.begin(), ret.end(), '-', '_'); // replace all '-' to '_' + std::replace(ret.begin(), ret.end(), '@', '_'); // replace all '@' to '_' + return ret; +} + +static std::string LegalizeVarName(const std::string& var_name) { + std::string ret = var_name; + std::replace(ret.begin(), ret.end(), '@', '_'); // replace all '@' to '_' return ret; } @@ -1024,7 
+1031,8 @@ static std::string GenerateGradNodeCreationContent( // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")" for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); - const std::string& output_autograd_name = "p_autograd_" + output_name; + const std::string& output_autograd_name = + "p_autograd_" + LegalizeVarName(output_name); // output autograd_meta should be got after running TraceOP. if (output.duplicable()) { @@ -1032,12 +1040,13 @@ static std::string GenerateGradNodeCreationContent( " std::vector %s = " "egr::EagerUtils::autograd_meta(&%s);\n"; get_output_autograd_meta_str += paddle::string::Sprintf( - GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); + GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, + LegalizeVarName(output_name)); } else { // In inplace op, the case where output is duplicable is not considered. // Replace output directly with input in inplace op. if (!inplace_map.empty() && inplace_map.count(output_name)) { - auto inplace_input_name = inplace_map[output_name]; + auto inplace_input_name = LegalizeVarName(inplace_map[output_name]); const std::string& inplace_input_autograd_name = "p_autograd_" + inplace_input_name; const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = @@ -1049,9 +1058,9 @@ static std::string GenerateGradNodeCreationContent( const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = " egr::AutogradMeta* %s = " "egr::EagerUtils::autograd_meta(&%s);\n"; - get_output_autograd_meta_str += - paddle::string::Sprintf(GET_SINGLE_AUTOGRAD_META_TEMPLATE, - output_autograd_name, output_name); + get_output_autograd_meta_str += paddle::string::Sprintf( + GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, + LegalizeVarName(output_name)); } } } @@ -1061,28 +1070,32 @@ static std::string GenerateGradNodeCreationContent( // inplace). 
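/* Note: the LegalizeVarName calls threaded through this generator exist
   because op proto variable names such as the conventional "Out@GRAD" are
   not legal C++ identifiers; without the mapping the generated source would
   contain tokens like p_autograd_Out@GRAD. The mapping is simply:

     LegalizeVarName("Out@GRAD")       // -> "Out_GRAD"
     LegalizeVariableName("X@GRAD-1")  // -> "X_GRAD_1" ('-' handled too)

   ("Out@GRAD" here is my example of the @GRAD suffix convention, not a name
   taken from this diff.) */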
for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); - const std::string& input_autograd_name = "p_autograd_" + input_name; + const std::string& input_autograd_name = + "p_autograd_" + LegalizeVarName(input_name); if (input.duplicable()) { const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = " std::vector %s = " "egr::EagerUtils::nullable_autograd_meta(%s);\n"; get_input_autograd_meta_str += paddle::string::Sprintf( - GET_MULTI_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name); + GET_MULTI_AUTOGRAD_META_TEMPLATE, input_autograd_name, + LegalizeVarName(input_name)); } else if (input.dispensable()) { const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = " egr::AutogradMeta* %s = " "egr::EagerUtils::nullable_autograd_meta(%s);\n"; get_input_autograd_meta_str += paddle::string::Sprintf( - GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name); + GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, + LegalizeVarName(input_name)); } else { const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = " egr::AutogradMeta* %s = " "egr::EagerUtils::nullable_autograd_meta(%s);\n"; get_input_autograd_meta_str += paddle::string::Sprintf( - GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name); + GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, + LegalizeVarName(input_name)); } } VLOG(6) << "Generated inputs autograd_meta"; @@ -1096,7 +1109,7 @@ static std::string GenerateGradNodeCreationContent( " egr::EagerUtils::CheckInplace(%s, p_autograd_%s, " "require_any_grad);\n"; for (auto& inplace_pair : inplace_map) { - std::string inplace_name = inplace_pair.second; + std::string inplace_name = LegalizeVarName(inplace_pair.second); check_inplace_str += paddle::string::Sprintf(CHECKING_INPLACE_TEMPLATE, inplace_name, inplace_name); } @@ -1159,12 +1172,12 @@ static std::string GenerateGradNodeCreationContent( if (!inplace_map.empty() && inplace_map.count(tensor_wrapper_name)) { auto inplace_input_name = inplace_map[tensor_wrapper_name]; grad_node_creation_str += paddle::string::Sprintf( - SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, - inplace_input_name, full_reserved); + SET_TENSOR_WRAPPER_TEMPLATE, LegalizeVarName(tensor_wrapper_name), + LegalizeVarName(inplace_input_name), full_reserved); } else { grad_node_creation_str += paddle::string::Sprintf( - SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, - tensor_wrapper_name, full_reserved); + SET_TENSOR_WRAPPER_TEMPLATE, LegalizeVarName(tensor_wrapper_name), + LegalizeVarName(tensor_wrapper_name), full_reserved); } } } @@ -1176,7 +1189,8 @@ static std::string GenerateGradNodeCreationContent( std::string compute_require_grad_args = "trace_backward"; for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); - const std::string& input_autograd_name = "p_autograd_" + input_name; + const std::string& input_autograd_name = + "p_autograd_" + LegalizeVarName(input_name); if (!input.duplicable()) { compute_require_grad_args += ", " + input_autograd_name; @@ -1184,26 +1198,19 @@ static std::string GenerateGradNodeCreationContent( const char* SET_GRAD_OUT_META_TEMPLATE = " grad_node->SetGradOutMeta(%s, %d);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_OUT_META_TEMPLATE, input_name, input_position); - - const char* ADD_EDGES_TEMPLATE = - " if(%s) grad_node->AddEdges(%s, %d);\n"; grad_node_creation_str += - paddle::string::Sprintf(ADD_EDGES_TEMPLATE, input_autograd_name, - input_autograd_name, input_position); + 
paddle::string::Sprintf(SET_GRAD_OUT_META_TEMPLATE, + LegalizeVarName(input_name), input_position); + } else { compute_require_grad_args += ", &" + input_autograd_name; size_t input_position = fwd_inputs_name_pos_map.at(input_name); const char* SET_GRAD_OUT_META_TEMPLATE = " grad_node->SetGradOutMeta(%s, %d);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_OUT_META_TEMPLATE, input_name, input_position); - - const char* ADD_EDGES_TEMPLATE = " grad_node->AddEdges(&%s, %d);\n"; - grad_node_creation_str += paddle::string::Sprintf( - ADD_EDGES_TEMPLATE, input_autograd_name, input_position); + grad_node_creation_str += + paddle::string::Sprintf(SET_GRAD_OUT_META_TEMPLATE, + LegalizeVarName(input_name), input_position); } } @@ -1217,7 +1224,7 @@ static std::string GenerateGradNodeCreationContent( if (!inplace_map.empty() && inplace_map.count(output_name)) { auto inplace_input_name = inplace_map[output_name]; const std::string& inplace_input_autograd_name = - "p_autograd_" + inplace_input_name; + "p_autograd_" + LegalizeVarName(inplace_input_name); size_t output_position = fwd_outputs_name_pos_map.at(output_name); // Intermediate Tensor does not require SetHistory, nor RetainGrad @@ -1237,18 +1244,20 @@ static std::string GenerateGradNodeCreationContent( const char* SET_GRAD_IN_META_TEMPLATE = " grad_node->SetGradInMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_IN_META_TEMPLATE, inplace_input_name, output_position); + SET_GRAD_IN_META_TEMPLATE, LegalizeVarName(inplace_input_name), + output_position); // Intermediate Tensor does not require CheckAndRetainGrad if (!output.intermediate()) { VLOG(6) << "Generated Call RetainGradForTensor"; const char* RETAIN_GRAD_TEMPLATE = " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; - grad_node_creation_str += - paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, inplace_input_name); + grad_node_creation_str += paddle::string::Sprintf( + RETAIN_GRAD_TEMPLATE, LegalizeVarName(inplace_input_name)); } } else { - const std::string& output_autograd_name = "p_autograd_" + output_name; + const std::string& output_autograd_name = + "p_autograd_" + LegalizeVarName(output_name); size_t output_position = fwd_outputs_name_pos_map.at(output_name); // Intermediate Tensor does not require SetHistory, nor RetainGrad @@ -1270,7 +1279,8 @@ static std::string GenerateGradNodeCreationContent( const char* SET_GRAD_IN_META_TEMPLATE = " grad_node->SetGradInMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_IN_META_TEMPLATE, output_name, output_position); + SET_GRAD_IN_META_TEMPLATE, LegalizeVarName(output_name), + output_position); } else { pass_stop_gradient_args += ", " + output_autograd_name; @@ -1289,7 +1299,8 @@ static std::string GenerateGradNodeCreationContent( const char* SET_GRAD_IN_META_TEMPLATE = " grad_node->SetGradInMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_IN_META_TEMPLATE, output_name, output_position); + SET_GRAD_IN_META_TEMPLATE, LegalizeVarName(output_name), + output_position); } // Intermediate Tensor does not require CheckAndRetainGrad @@ -1297,8 +1308,8 @@ static std::string GenerateGradNodeCreationContent( VLOG(6) << "Generated Call RetainGradForTensor"; const char* RETAIN_GRAD_TEMPLATE = " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; - grad_node_creation_str += - paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); + grad_node_creation_str += paddle::string::Sprintf( + RETAIN_GRAD_TEMPLATE, LegalizeVarName(output_name)); } } } @@ -1421,9 
+1432,10 @@ static std::pair GenerateForwardFunctionContents( if (input.duplicable()) { const char* FWD_INS_ARG_TEMPLATE = "const std::vector& %s"; - input_args_str_list[input_position] = - paddle::string::Sprintf(FWD_INS_ARG_TEMPLATE, input_name); - amp_function_call_args_str_list[input_position] = " NEW_" + input_name; + input_args_str_list[input_position] = paddle::string::Sprintf( + FWD_INS_ARG_TEMPLATE, LegalizeVarName(input_name)); + amp_function_call_args_str_list[input_position] = + " NEW_" + LegalizeVarName(input_name); core_ops_args_type_info[op_type][input_position] = "list"; } else { @@ -1442,9 +1454,10 @@ static std::pair GenerateForwardFunctionContents( if (!flag_find_input_name) { FWD_INS_ARG_TEMPLATE = "const paddle::experimental::Tensor& %s"; } - input_args_str_list[input_position] = - paddle::string::Sprintf(FWD_INS_ARG_TEMPLATE, input_name); - amp_function_call_args_str_list[input_position] = " NEW_" + input_name; + input_args_str_list[input_position] = paddle::string::Sprintf( + FWD_INS_ARG_TEMPLATE, LegalizeVarName(input_name)); + amp_function_call_args_str_list[input_position] = + " NEW_" + LegalizeVarName(input_name); core_ops_args_type_info[op_type][input_position] = "tensor"; } @@ -1454,8 +1467,8 @@ static std::pair GenerateForwardFunctionContents( const char* FWD_INS_CONTENT_TEMPLATE = "{ \"%s\", egr::EagerUtils::TrySyncToVars(%s) },"; - ins_contents_str += paddle::string::Sprintf(FWD_INS_CONTENT_TEMPLATE, - input_name, input_name); + ins_contents_str += paddle::string::Sprintf( + FWD_INS_CONTENT_TEMPLATE, input_name, LegalizeVarName(input_name)); if (input.duplicable()) { const char* AMP_TENSORS_VECTOR_TEMPLATE = "%s,"; amp_tensors_vector_str += @@ -1464,16 +1477,18 @@ static std::pair GenerateForwardFunctionContents( " auto NEW_%s = egr::AmpAutoCasts(\"%s\", %s, amp_dst_dtype, " "\"%s\");\n"; amp_auto_cast_str += paddle::string::Sprintf( - AMP_AUTO_CAST_TEMPLATE, input_name, input_name, input_name, op_type); + AMP_AUTO_CAST_TEMPLATE, LegalizeVarName(input_name), input_name, + LegalizeVarName(input_name), op_type); } else { const char* AMP_TENSORS_VECTOR_TEMPLATE = "{%s},"; - amp_tensors_vector_str += - paddle::string::Sprintf(AMP_TENSORS_VECTOR_TEMPLATE, input_name); + amp_tensors_vector_str += paddle::string::Sprintf( + AMP_TENSORS_VECTOR_TEMPLATE, LegalizeVarName(input_name)); const char* AMP_AUTO_CAST_TEMPLATE = " auto NEW_%s = egr::AmpAutoCast(\"%s\", %s, amp_dst_dtype, " "\"%s\");\n"; amp_auto_cast_str += paddle::string::Sprintf( - AMP_AUTO_CAST_TEMPLATE, input_name, input_name, input_name, op_type); + AMP_AUTO_CAST_TEMPLATE, LegalizeVarName(input_name), input_name, + LegalizeVarName(input_name), op_type); } } if (ins_contents_str.size() > 0) @@ -1509,35 +1524,41 @@ static std::pair GenerateForwardFunctionContents( " if(%s.size() > 0) " "ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s);\n"; dispensable_ins_contents_str += paddle::string::Sprintf( - FWD_INS_CONTENT_TEMPLATE, input_name, input_name, input_name); + FWD_INS_CONTENT_TEMPLATE, LegalizeVarName(input_name), input_name, + LegalizeVarName(input_name)); const char* FWD_AMP_TENSORS_VECTOR_TEMPLATE = " if(%s.size() > 0) " "amp_tensors_vector.push_back(%s);\n"; dispensable_amp_tensors_vector_str += paddle::string::Sprintf( - FWD_AMP_TENSORS_VECTOR_TEMPLATE, input_name, input_name); + FWD_AMP_TENSORS_VECTOR_TEMPLATE, LegalizeVarName(input_name), + LegalizeVarName(input_name)); const char* DISPENSABLE_AMP_AUTO_CAST_TEMPLATE = " auto NEW_%s = ((%s.size() > 0) ? 
egr::AmpAutoCasts(\"%s\", " "%s, amp_dst_dtype, \"%s\") : %s);\n"; dispensable_amp_auto_cast_str += paddle::string::Sprintf( - DISPENSABLE_AMP_AUTO_CAST_TEMPLATE, input_name, input_name, - input_name, input_name, op_type, input_name); + DISPENSABLE_AMP_AUTO_CAST_TEMPLATE, LegalizeVarName(input_name), + LegalizeVarName(input_name), input_name, + LegalizeVarName(input_name), op_type, LegalizeVarName(input_name)); } else { const char* FWD_INS_CONTENT_TEMPLATE = " if(%s.initialized()) " "ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s);\n"; dispensable_ins_contents_str += paddle::string::Sprintf( - FWD_INS_CONTENT_TEMPLATE, input_name, input_name, input_name); + FWD_INS_CONTENT_TEMPLATE, LegalizeVarName(input_name), input_name, + LegalizeVarName(input_name)); const char* FWD_AMP_TENSORS_VECTOR_TEMPLATE = " if(%s.initialized()) " "amp_tensors_vector.push_back({ %s });\n"; dispensable_amp_tensors_vector_str += paddle::string::Sprintf( - FWD_AMP_TENSORS_VECTOR_TEMPLATE, input_name, input_name); + FWD_AMP_TENSORS_VECTOR_TEMPLATE, LegalizeVarName(input_name), + LegalizeVarName(input_name)); const char* DISPENSABLE_AMP_AUTO_CAST_TEMPLATE = " auto NEW_%s = ((%s.initialized()) ? egr::AmpAutoCast(\"%s\", " "%s, amp_dst_dtype, \"%s\") : %s);\n"; dispensable_amp_auto_cast_str += paddle::string::Sprintf( - DISPENSABLE_AMP_AUTO_CAST_TEMPLATE, input_name, input_name, - input_name, input_name, op_type, input_name); + DISPENSABLE_AMP_AUTO_CAST_TEMPLATE, LegalizeVarName(input_name), + LegalizeVarName(input_name), input_name, + LegalizeVarName(input_name), op_type, LegalizeVarName(input_name)); } } } @@ -1559,18 +1580,18 @@ static std::pair GenerateForwardFunctionContents( if (output.duplicable()) { const char* FWD_NUM_ARG_TEMPLATE = ", std::vector& %s"; - std::string arg_str = - paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, output_var_name); + std::string arg_str = paddle::string::Sprintf( + FWD_NUM_ARG_TEMPLATE, LegalizeVarName(output_var_name)); dygraph_function_args_str += arg_str; - amp_function_call_args_str += (", " + output_var_name); + amp_function_call_args_str += (", " + LegalizeVarName(output_var_name)); core_ops_args_type_info[op_type].push_back("list"); } else { const char* FWD_NUM_ARG_TEMPLATE = ", paddle::experimental::Tensor* %s"; - std::string arg_str = - paddle::string::Sprintf(FWD_NUM_ARG_TEMPLATE, output_var_name); + std::string arg_str = paddle::string::Sprintf( + FWD_NUM_ARG_TEMPLATE, LegalizeVarName(output_var_name)); dygraph_function_args_str += arg_str; - amp_function_call_args_str += (", " + output_var_name); + amp_function_call_args_str += (", " + LegalizeVarName(output_var_name)); core_ops_args_type_info[op_type].push_back("tensor"); } @@ -1586,8 +1607,9 @@ static std::pair GenerateForwardFunctionContents( } else { const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", egr::EagerUtils::TrySyncToVars(%s) },"; - outs_contents_str += paddle::string::Sprintf( - FWD_OUTS_CONTENT_TEMPLATE, output_name, output_var_name); + outs_contents_str += + paddle::string::Sprintf(FWD_OUTS_CONTENT_TEMPLATE, output_name, + LegalizeVarName(output_var_name)); } core_ops_args_info[op_type].push_back(output_name); @@ -1649,7 +1671,8 @@ static std::pair GenerateForwardFunctionContents( std::string amp_logic_str = ""; if (in_vars.size() != 0) { const char* AMP_TENSORS_VECTOR_TEMPLATE = - " std::vector> " + " paddle::small_vector, " + "egr::kSlotSmallVectorSize> " "amp_tensors_vector = { " "%s };\n"; std::string amp_tensors_vector = paddle::string::Sprintf( @@ -1781,7 +1804,8 @@ static std::pair 
GenerateForwardFunctionContents( std::vector return_types(output_size); for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); - const std::string output_var_args_name = output_name + "Var"; + const std::string output_var_args_name = + LegalizeVariableName(output_name + "Var"); std::string out_tensor_str; size_t return_position = fwd_outputs_name_pos_map.at(output_name); std::string output_varname = LegalizeVariableName(output_name); @@ -1845,9 +1869,11 @@ static std::pair GenerateForwardFunctionContents( " %s.bump_inplace_version();\n" " VLOG(3) << \"Tensor(\" << %s.name() << \") uses Inplace " "Strategy.\";\n"; - out_tensor_str = paddle::string::Sprintf( - FWD_OUT_TENSOR_TEMPLATE, output_name, inplace_input_name, - inplace_input_name, inplace_input_name); + out_tensor_str = + paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_name, + LegalizeVarName(inplace_input_name), + LegalizeVarName(inplace_input_name), + LegalizeVarName(inplace_input_name)); } else { const char* FWD_OUT_TENSOR_TEMPLATE = " paddle::experimental::Tensor %s;\n" @@ -1862,7 +1888,8 @@ static std::pair GenerateForwardFunctionContents( if (!inplace_map.empty() && inplace_map.count(output_name)) { // Replace output directly with input in inplace op. - return_contents[return_position] = inplace_map[output_name]; + return_contents[return_position] = + LegalizeVarName(inplace_map[output_name]); } else { return_contents[return_position] = output_varname; } @@ -2428,9 +2455,11 @@ static std::string GenerateGradNodeCCContents( } const char* BWD_RETURN_TEMPLATE = - " std::vector> hooked_grads = " + " paddle::small_vector, " + "egr::kSlotSmallVectorSize> hooked_grads = " "GradNode%s::ApplyGradientHooks(grads);\n" - " std::vector> outputs(%d);\n" + " paddle::small_vector, " + "egr::kSlotSmallVectorSize> outputs(%d);\n" " %s\n" " if(NeedComplexToRealConversion()) " "HandleComplexGradToRealGrad(&outputs);\n" @@ -2441,9 +2470,11 @@ static std::string GenerateGradNodeCCContents( // [Generation] Get Full Grad Function const char* GRAD_FUNCTION_TEMPLATE = - "std::vector> " + "paddle::small_vector, " + "egr::kSlotSmallVectorSize> " "GradNode%s::operator()(" - "std::vector>& grads, bool " + "paddle::small_vector, " + "egr::kSlotSmallVectorSize>& grads, bool " "create_graph, bool is_new_grad) {\n" "%s" "%s" @@ -2487,9 +2518,12 @@ static std::string GenerateGradNodeHeaderContents( "Construct GradNode%s \"; }\n" " ~GradNode%s() override { VLOG(6) << \" Destruct GradNode%s \"; }\n" "\n" - " virtual std::vector> " + " virtual " + "paddle::small_vector, " + "egr::kSlotSmallVectorSize> " "operator()(" - "std::vector>& grads, bool " + "paddle::small_vector, " + "egr::kSlotSmallVectorSize>& grads, bool " "create_graph = false, bool is_new_grad = false) " "override;\n" "\n" diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 7769c5371baba..8c98d9fa275dc 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -22,17 +22,12 @@ ### Global Variables ### ######################## ops_to_fill_zero_for_empty_grads = set([ - "split_grad", - "rnn_grad", - "matmul_double_grad", - "matmul_triple_grad", - "sigmoid_double_grad", - "sigmoid_triple_grad", - "add_double_grad", - "add_triple_grad", - "multiply_double_grad", - "multiply_triple_grad", - "conv2d_grad_grad", + 
"split_grad", "rnn_grad", "matmul_double_grad", "matmul_triple_grad", + "sigmoid_double_grad", "sigmoid_triple_grad", "add_double_grad", + "add_triple_grad", "multiply_double_grad", "multiply_triple_grad", + "conv2d_grad_grad", "batch_norm_double_grad", "tanh_double_grad", + "tanh_triple_grad", "subtract_double_grad", "divide_double_grad", + "log_double_grad", "elu_double_grad", "leaky_relu_double_grad" ]) # For API dispatch used at python-level diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 078f1b30398ed..00b9aa7a231a3 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -118,8 +118,8 @@ class {} : public egr::GradNodeBase {{ egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {{}} ~{}() override = default; - virtual std::vector> operator()( - std::vector>& grads, bool create_graph = false, bool is_new_grad = false) override; + virtual paddle::small_vector, egr::kSlotSmallVectorSize> operator()( + paddle::small_vector, egr::kSlotSmallVectorSize>& grads, bool create_graph = false, bool is_new_grad = false) override; std::string name() override {{ return \"{}\"; }} void ClearTensorWrappers() override {{ @@ -149,7 +149,7 @@ class {} : public egr::GradNodeBase {{ GRAD_FUNCTION_TEMPLATE = \ """ -std::vector> {}::operator()(std::vector>& grads, bool create_graph, bool is_new_grad) {{ +paddle::small_vector, egr::kSlotSmallVectorSize> {}::operator()(paddle::small_vector, egr::kSlotSmallVectorSize>& grads, bool create_graph, bool is_new_grad) {{ // Fill Zero For GradIn Tensors {} @@ -239,7 +239,6 @@ class {} : public egr::GradNodeBase {{ // Set TensorWrappers for Forward Inputs {} // SetGradOutMeta & SetEdges -{} {} // SetOutRank & SetHistory & SetGradInMeta & RetainGrad {} @@ -356,7 +355,7 @@ class {} : public egr::GradNodeBase {{ if (egr::Controller::Instance().GetAMPLevel() != paddle::imperative::AmpLevel::O0) {{ VLOG(5) << "Check and Prepare For AMP"; {} - std::vector> amp_tensors_vector = {}; + paddle::small_vector, egr::kSlotSmallVectorSize> amp_tensors_vector = {}; {} {} {} @@ -769,15 +768,11 @@ def GenerateNodeCreationCodes(self): is_optional = (name in self.optional_inputs) if is_optional: set_grad_out_meta = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetGradOutMeta(*({name}.get_ptr()), {pos});" - set_edges = f"{indent}if({name}.get_ptr() != nullptr) grad_node->AddEdges({input_autograd_meta_name}, {pos});" else: set_grad_out_meta = f"{indent}grad_node->SetGradOutMeta({name}, {pos});" - set_edges = f"{indent}grad_node->AddEdges({input_autograd_meta_name}, {pos});" set_grad_out_meta_list.append(set_grad_out_meta) - set_edges_list.append(set_edges) set_grad_out_meta_str = "\n".join(set_grad_out_meta_list) - set_edges_str = "\n".join(set_edges_list) # SetOutRank & SetHistory & SetGradInMeta set_out_rank_list = [] @@ -808,7 +803,7 @@ def GenerateNodeCreationCodes(self): self.node_creation_str = FORWARD_BODY_TEMPLATE.format( node_creation_event_str, pass_stop_gradient_args_str, node_construction_str, set_attributes_str, - set_input_tensor_wrappers_str, set_grad_out_meta_str, set_edges_str, + set_input_tensor_wrappers_str, set_grad_out_meta_str, set_out_rank_str, set_history_str, set_grad_in_meta_str, set_retain_grad_str, set_output_tensor_wrappers_str) @@ -1454,7 +1449,7 @@ def GenerateNodeDefinition(self, grad_node_creation_str): # Construct 
grad_api returns slot_num_bwd_outputs = len(self.forward_inputs_position_map.keys()) - returns_str = f"{indent}std::vector> returns({slot_num_bwd_outputs});\n" + returns_str = f"{indent}paddle::small_vector, egr::kSlotSmallVectorSize> returns({slot_num_bwd_outputs});\n" for name, (ttype, fwd_position, grad_api_position) in backward_grad_outputs_map.items(): transformed_tensor_name = self.TransformToNextGradName(name) diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 7ca1b49bcbc8b..7a4e7f81611d1 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -169,9 +169,12 @@ class GeneralGrad { input_target_nodes_inputmeta_map.count(node); // Find and append next nodes - const std::vector>& edges = node->GetEdges(); - for (const auto& edge_list : edges) { - for (const Edge& edge : edge_list) { + const paddle::small_vector, + kSlotSmallVectorSize>& metas = + node->OutputMeta(); + for (const auto& meta_list : metas) { + for (const GradSlotMeta& meta : meta_list) { + const auto& edge = meta.GetEdge(); GradNodeBase* next_node = edge.GetMutableGradNode().get(); // Next node could be nullptr if it is leaf tensor with no @@ -381,13 +384,15 @@ class GeneralGrad { "unable to find copied target for certain grad node.")); GradNodeBase* copied_node = orig_to_copied_node_mapping_[orig_node].get(); - const std::vector>& orig_edges = orig_node->GetEdges(); - std::vector>& copied_edges = - copied_node->GetMutableEdges(); - for (size_t i = 0; i < orig_edges.size(); i++) { - for (size_t j = 0; j < orig_edges[i].size(); j++) { - const Edge& orig_edge = orig_edges[i][j]; - Edge& copied_edge = copied_edges[i][j]; + const paddle::small_vector, + kSlotSmallVectorSize>& orig_meta = + orig_node->OutputMeta(); + paddle::small_vector, kSlotSmallVectorSize>& + copied_edges = copied_node->MutableOutputMeta(); + for (size_t i = 0; i < orig_meta.size(); i++) { + for (size_t j = 0; j < orig_meta[i].size(); j++) { + const Edge& orig_edge = orig_meta[i][j].GetEdge(); + Edge& copied_edge = copied_edges[i][j].GetMutableEdge(); std::shared_ptr orig_next_node = orig_edge.GetMutableGradNode(); @@ -468,9 +473,11 @@ std::unordered_map getInDegreeMap( "We got null node when we traverse the backward graph, and this " "should not happened please check your code and contact us.")); // Find and append next nodes - const std::vector>& edges = node->GetEdges(); - for (const auto& edge_list : edges) { - for (const Edge& edge : edge_list) { + const paddle::small_vector, kSlotSmallVectorSize>& + metas = node->OutputMeta(); + for (const auto& meta_list : metas) { + for (const GradSlotMeta& meta : meta_list) { + const auto& edge = meta.GetEdge(); GradNodeBase* next_node = edge.GetMutableGradNode().get(); // Next node could be nullptr if it is leaf tensor with no // AccumulationNode attached @@ -546,7 +553,13 @@ std::vector RunBackward( for (size_t i = 0; i < tensors.size(); i++) { const paddle::experimental::Tensor& tensor = tensors[i]; - AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(tensor); + AutogradMeta* auto_grad_meta = EagerUtils::nullable_autograd_meta(tensor); + if (auto_grad_meta == nullptr) { + VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " + "stop_gradient=True: " + << tensor.name(); + continue; + } // Get grad input info from target tensors auto input_info = auto_grad_meta->OutRankInfo(); @@ -689,8 +702,10 @@ std::vector RunBackward( VLOG(6) << "Run Backward Kernel with GradTensorHolder."; // Run Pre Backward Node and get outputs - 
std::vector> grad_output_tensors = - (*node)(node_input_buffer->Buffers(), create_graph, is_general_grad); + paddle::small_vector, + kSlotSmallVectorSize> + grad_output_tensors = (*node)(node_input_buffer->Buffers(), + create_graph, is_general_grad); // retain_grad or not if (!retain_graph) { @@ -704,17 +719,18 @@ std::vector RunBackward( node_input_buffers_dict.erase(node); // Prepare GradTensorHolder for next node - const std::vector>& edges = node->GetEdges(); - PADDLE_ENFORCE(edges.size() == grad_output_tensors.size() || edges.empty(), + const paddle::small_vector, kSlotSmallVectorSize>& + metas = node->OutputMeta(); + PADDLE_ENFORCE(metas.size() == grad_output_tensors.size() || metas.empty(), paddle::platform::errors::Fatal( "Number of edges should be either empty ( for leaf node " ") or the same as number of output grad tensors, but we " "got edges size is: %d, grad_output size is: %d", - edges.size(), grad_output_tensors.size())); + metas.size(), grad_output_tensors.size())); - for (size_t i = 0; i < edges.size(); i++) { - for (size_t j = 0; j < edges[i].size(); j++) { - const Edge& edge = edges[i][j]; + for (size_t i = 0; i < metas.size(); i++) { + for (size_t j = 0; j < metas[i].size(); j++) { + const Edge& edge = metas[i][j].GetEdge(); if (!edge.IsInitialized()) { continue; } diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc index a9a41c106d090..2bb86a86e8348 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -19,10 +19,12 @@ #include "paddle/phi/core/dense_tensor.h" namespace egr { -std::vector> RunCustomOpNode:: -operator()( - std::vector>& grads, // NOLINT - bool create_graph, bool is_new_grad) { +paddle::small_vector, + kSlotSmallVectorSize> +RunCustomOpNode::operator()( + paddle::small_vector, + kSlotSmallVectorSize>& grads, + bool create_graph, bool is_new_grad) { // NOLINT paddle::CustomOpKernelContext ctx; auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs( egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]); @@ -31,8 +33,9 @@ operator()( auto map = egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_); auto kernel_map = egr::Controller::Instance().GetOpMetaInfoMap(); - std::vector> tmp_ins( - grad_inputs_name.size()); + paddle::small_vector, + kSlotSmallVectorSize> + tmp_ins(grad_inputs_name.size()); VLOG(7) << " Prepare Backward inputs of grads with size: " << grads.size() << ", whose grad_inputs_name size is: " << grad_inputs_name.size(); for (size_t i = 0; i < grads.size(); i++) { @@ -58,17 +61,19 @@ } VLOG(6) << "Prepare Grad attrs"; ctx.EmplaceBackAttrs(attrs_); - std::vector> outs( - GetEdges().size()); - std::vector> tmp_outs( - grad_outputs_names.size()); + paddle::small_vector, + kSlotSmallVectorSize> + outs(OutputMeta().size()); + paddle::small_vector, + kSlotSmallVectorSize> + tmp_outs(grad_outputs_names.size()); VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size(); - for (size_t i = 0; i < GetEdges().size(); i++) { + for (size_t i = 0; i < OutputMeta().size(); i++) { if (map[0].find(i) != map[0].end()) { VLOG(7) << "Insert grad outputs: " << i - << " with size: " << GetEdges()[i].size() + << " with size: " << OutputMeta()[i].size() << " to tmp_outputs: " << map[0][i]; - for (size_t j = 0; j < GetEdges()[i].size(); j++) { + for (size_t j = 0; j < OutputMeta()[i].size(); j++) { outs[i].emplace_back(/* init it in case 
of copy nullptr of shared_ptr */ std::make_shared( phi::DataType::UNDEFINED), diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h index 2e7885001c385..4801088e51ba5 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.h +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h @@ -36,10 +36,13 @@ class RunCustomOpNode : public GradNodeBase { } // Functor: perform backward computations - virtual std::vector> - operator()( // NOLINT - std::vector>& grads, // NOLINT - bool create_graph = false, bool is_new_grad = false) // NOLINT + virtual paddle::small_vector, + kSlotSmallVectorSize> + operator()( // NOLINT + paddle::small_vector, + kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) // NOLINT override; std::string name() { diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 5b4921320f6b0..610b177829e2f 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -40,70 +40,20 @@ GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) { VLOG(6) << "Construct GradNodeBase"; bwd_in_meta_.resize(bwd_in_slot_num); bwd_out_meta_.resize(bwd_out_slot_num); - adj_edges_.resize(bwd_out_slot_num); } -void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { - PADDLE_ENFORCE_LT( - slot_id, adj_edges_.size(), - paddle::platform::errors::InvalidArgument( - "Given slot id is out of range of adj_edges outter size, " - "adj_edges is designed to has the same size of grad " - "inputs's slot num.")); - - for (size_t i = 0; i < metas->size(); i++) { - const auto& meta = (*metas)[i]; - // adj_edges has as same rank as fwd inputs, and record it's output rank - // from - // its pre-ops - if (meta && !meta->StopGradient()) { - auto node = meta->GetMutableGradNode(); - if (!node || !node.get()) { - meta->SetGradNode(std::make_shared(meta)); - } - VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " - << this->name() << " (addr: " << this << ") " - << " to " << meta->GetMutableGradNode()->name() - << " (addr: " << meta->GetMutableGradNode().get() << ")"; - - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); - } else { - adj_edges_[slot_id].emplace_back(); - } - } -} - -void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { - PADDLE_ENFORCE_LT( - slot_id, adj_edges_.size(), - paddle::platform::errors::InvalidArgument( - "Given slot id is out of range of adj_edges outter size, " - "adj_edges is designed to has the same size of grad " - "inputs's slot num.")); - - if (meta && !meta->StopGradient()) { - auto node = meta->GetMutableGradNode(); - if (!node || !node.get()) { - meta->SetGradNode(std::make_shared(meta)); - } - VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " - << this->name() << " (addr: " << this << ") " - << " to " << meta->GetMutableGradNode()->name() - << " (addr: " << meta->GetMutableGradNode().get() << ")"; - - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); - } else { - adj_edges_[slot_id].emplace_back(); - } +const paddle::small_vector, kSlotSmallVectorSize>& +GradNodeBase::InputMeta() const { + return bwd_in_meta_; } -const std::vector>& GradNodeBase::InputMeta() const { - return bwd_in_meta_; +const paddle::small_vector, kSlotSmallVectorSize>& +GradNodeBase::OutputMeta() const { + return bwd_out_meta_; } -const std::vector>& 
GradNodeBase::OutputMeta() const { +paddle::small_vector, kSlotSmallVectorSize>& +GradNodeBase::MutableOutputMeta() { return bwd_out_meta_; } @@ -123,7 +73,9 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, } auto& meta = metas[0]; - meta.SetStopGradient(fwd_out_meta->StopGradient()); + if (fwd_out_meta && fwd_out_meta->StopGradient()) { + meta.SetStopGradient(fwd_out_meta->StopGradient()); + } if (!fwd_out.initialized()) { VLOG(6) @@ -153,8 +105,8 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, meta.SetTensorMeta(dense_tensor->meta()); meta.SetPlace(fwd_out.place()); - if (paddle::framework::IsComplexType( - paddle::framework::TransToProtoVarType(dense_tensor->type()))) { + if (dense_tensor->type() == paddle::experimental::DataType::COMPLEX64 || + dense_tensor->type() == paddle::experimental::DataType::COMPLEX128) { need_complex_to_real_ = true; } } @@ -186,7 +138,7 @@ void GradNodeBase::SetGradInMeta( "Bwd_in_meta should only be called while " "autograd_meta is not null. If you got this " "error, it indicates bugs in framework.")); - if (fwd_out_meta->StopGradient()) { + if (fwd_out_meta && fwd_out_meta->StopGradient()) { // Set Stop Gradient only when its true or non-initialized autograd_meta, // since all default value is false. meta.SetStopGradient(fwd_out_meta->StopGradient()); @@ -212,8 +164,8 @@ void GradNodeBase::SetGradInMeta( meta.SetTensorMeta(dense_tensor->meta()); meta.SetPlace(fwd_out_tensor.place()); - if (paddle::framework::IsComplexType( - paddle::framework::TransToProtoVarType(dense_tensor->type()))) { + if (dense_tensor->type() == paddle::experimental::DataType::COMPLEX64 || + dense_tensor->type() == paddle::experimental::DataType::COMPLEX128) { need_complex_to_real_ = true; } } else { @@ -238,12 +190,24 @@ void GradNodeBase::SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, metas.resize(1); } auto& meta = metas[0]; + // Set Stop_gradient if (fwd_in_meta) { meta.SetStopGradient(fwd_in_meta->StopGradient()); - } else { - meta.SetStopGradient(true); } + // Set Adj Edges + if (fwd_in_meta && !fwd_in_meta->StopGradient()) { + auto node = fwd_in_meta->GetMutableGradNode(); + if (!node || !node.get()) { + fwd_in_meta->SetGradNode( + std::make_shared(fwd_in_meta)); + } + VLOG(6) << "Add Edges for slot: " << slot_rank << ", the Edge is from " + << this->name() << " (addr: " << this << ") " + << " to " << fwd_in_meta->GetMutableGradNode()->name() + << " (addr: " << fwd_in_meta->GetMutableGradNode().get() << ")"; + meta.SetEdge(fwd_in_meta->GetMutableGradNode(), fwd_in_meta->OutRankInfo()); + } // Record TensorMeta if (fwd_in.impl() && fwd_in.impl().get()) { if (phi::DenseTensor::classof(fwd_in.impl().get())) { @@ -282,30 +246,43 @@ void GradNodeBase::SetGradOutMeta( const auto& fwd_in_tensor = fwd_in[i]; auto& meta = metas[i]; auto* fwd_in_meta = egr::EagerUtils::nullable_autograd_meta(fwd_in_tensor); + // Set Stop_gradient if (fwd_in_meta) { - // Set Stop Gradient only when its true or non-initialized autograd_meta, - // since all default value is false. 
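// Illustrative sketch (not part of the patch): after this change a forward op
// wires gradient metadata and the backward edge through SetGradOutMeta() /
// SetGradInMeta() alone; the separate AddEdges() step that used to follow is
// removed throughout this patch. GradNodeScale is the node type the eager
// tests use elsewhere in this diff; the tensor names are placeholders.
#include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h"

static void WireNode(const paddle::experimental::Tensor& x,
                     const paddle::experimental::Tensor& out,
                     const std::shared_ptr<egr::GradNodeScale>& node) {
  node->SetGradOutMeta(x, /*slot_rank=*/0);   // records meta AND the edge to x
  node->SetGradInMeta(out, /*slot_rank=*/0);  // no AddEdges() call needed
}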
meta.SetStopGradient(fwd_in_meta->StopGradient()); } + // Set Adj Edges + if (fwd_in_meta && !fwd_in_meta->StopGradient()) { + auto node = fwd_in_meta->GetMutableGradNode(); + if (!node || !node.get()) { + fwd_in_meta->SetGradNode( + std::make_shared(fwd_in_meta)); + } + VLOG(6) << "Add Edges for slot: " << slot_rank << ", the Edge is from " + << this->name() << " (addr: " << this << ") " + << " to " << fwd_in_meta->GetMutableGradNode()->name() + << " (addr: " << fwd_in_meta->GetMutableGradNode().get() << ")"; + meta.SetEdge(fwd_in_meta->GetMutableGradNode(), + fwd_in_meta->OutRankInfo()); + } // Record TensorMeta if (fwd_in_tensor.impl() && fwd_in_tensor.impl().get()) { if (phi::DenseTensor::classof(fwd_in_tensor.impl().get())) { // Only Copy Meta phi::DenseTensor* dense_tensor = static_cast(fwd_in_tensor.impl().get()); - PADDLE_ENFORCE_NE(dense_tensor->meta().dtype, phi::DataType::UNDEFINED, paddle::platform::errors::Fatal( - "Attempting to copy DenseTensorMeta with " - "phi::DataType::UNDEFINED," + "Attempting to copy DenseTensorMeta " + "with phi::DataType::UNDEFINED," "which is illegal.")); meta.SetTensorMeta(dense_tensor->meta()); meta.SetPlace(fwd_in_tensor.place()); } } else { - VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta " - "with non-DenseTensor argument."; + VLOG(6) + << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " + "non-DenseTensor argument."; } } } @@ -328,18 +305,14 @@ int64_t GradNodeBase::RegisterGradientHook( return next_hook_id_++; } -const std::vector>& GradNodeBase::GetEdges() const { - return adj_edges_; -} - -std::vector>& GradNodeBase::GetMutableEdges() { - return adj_edges_; -} - -std::vector> +paddle::small_vector, + kSlotSmallVectorSize> GradNodeBase::ApplyGradientHooks( - const std::vector>& tensors) { - std::vector> outs(tensors.size()); + const paddle::small_vector, + kSlotSmallVectorSize>& tensors) { + paddle::small_vector, + kSlotSmallVectorSize> + outs(tensors.size()); for (auto& hook_pair : gradient_hooks_) { size_t slot_id = std::get<0>(hook_pair.second); size_t rank = std::get<1>(hook_pair.second); @@ -386,7 +359,8 @@ GradNodeBase::ApplyGradientHooks( } void GradNodeBase::HandleComplexGradToRealGrad( - std::vector>* out_grads) { + paddle::small_vector, + kSlotSmallVectorSize>* out_grads) { for (size_t slot_id = 0; slot_id < out_grads->size(); slot_id++) { const std::vector& slot_out_grads = (*out_grads)[slot_id]; diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 07b62082f55ec..6fdee203c196c 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -16,6 +16,7 @@ #include +#include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/hooks.h" #include "paddle/phi/api/all.h" @@ -46,9 +47,8 @@ namespace egr { * indicate which * input of grad this edge belong). * */ -class Edge; class AutogradMeta; - +class GradNodeBase; /** * GradSlotMeta is used to Record Forward Tensor info to backward, since paddle * has lots of operators @@ -56,6 +56,70 @@ class AutogradMeta; * So, we need a meta info * to record it's needs. * **/ +class Edge { + public: + // Default constructor for Edges in order to construct it for AutogradMeta + Edge() : in_slot_id_(0), in_rank_(0), grad_node_(nullptr) {} + + // In real use cases we should create Edge from grad node and input rank which + // indicate which edge it is. 
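// Illustrative sketch (not part of the patch): an Edge names the backward
// input position it feeds, built either from an explicit (slot, rank) pair or
// from an AutogradMeta's recorded OutRankInfo(), as the constructors below
// show. The helper function is ours.
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/grad_node_info.h"

static egr::Edge EdgeFor(egr::AutogradMeta* meta) {
  egr::Edge by_pair(meta->GetMutableGradNode(), meta->OutRankInfo());
  egr::Edge by_parts(meta->GetMutableGradNode(),
                     meta->OutRankInfo().first,    // in_slot_id_
                     meta->OutRankInfo().second);  // in_rank_
  // Both forms address the same position:
  // by_pair.GetEdgeRankInfo() == by_parts.GetEdgeRankInfo().
  return by_pair;
}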
+ // Since we have slot design in operators we will have to locate an edge with + // slot + // and rank. + Edge(const std::shared_ptr& grad_node, size_t in_slot_id, + size_t in_rank) + : in_slot_id_(in_slot_id), in_rank_(in_rank), grad_node_(grad_node) {} + + Edge(const std::shared_ptr& grad_node, + const std::pair& rank_info) + : in_slot_id_(rank_info.first), + in_rank_(rank_info.second), + grad_node_(grad_node) {} + + GradNodeBase* GetGradNode() const { return grad_node_.get(); } + + std::shared_ptr GetMutableGradNode() const { + return grad_node_; + } + + void SetGradNode(const std::shared_ptr& node) { + VLOG(6) << "Resetting Edge's Grad Node"; + grad_node_ = node; + } + + std::pair GetEdgeRankInfo() const { + return std::make_pair(in_slot_id_, in_rank_); + } + + void SetEdgeRankInfo(size_t slot_id, size_t in_rank) { + in_slot_id_ = slot_id; + in_rank_ = in_rank; + } + + void SetEdgeRankInfo( + const std::pair& edge_rank) { + in_slot_id_ = edge_rank.first; + in_rank_ = edge_rank.second; + } + + // Currently we use grad_node_ to identify if an edge is initialized. + bool IsInitialized() const { + if (!grad_node_) { + return false; + } else { + if (!(grad_node_.get())) { + return false; + } else { + return true; + } + } + } + + private: + size_t in_slot_id_; + size_t in_rank_; + std::shared_ptr grad_node_{nullptr}; +}; class GradSlotMeta { public: GradSlotMeta() = default; @@ -81,10 +145,21 @@ class GradSlotMeta { void SetPlace(const phi::Place& place) { place_ = place; } const phi::Place& GetPlace() const { return place_; } + void SetEdge(const Edge& edge) { adj_edge_ = edge; } + void SetEdge( + const std::shared_ptr& grad_node, + const std::pair& rank_info) { + adj_edge_.SetGradNode(grad_node); + adj_edge_.SetEdgeRankInfo(rank_info); + } + Edge& GetMutableEdge() { return adj_edge_; } + const Edge& GetEdge() const { return adj_edge_; } + private: bool stop_gradient_{false}; phi::Place place_; std::shared_ptr meta_ = nullptr; + Edge adj_edge_; }; class GradNodeBase { @@ -107,9 +182,12 @@ class GradNodeBase { * so, vector of vector * is better choice to fit this format. * **/ - virtual std::vector> operator()( - std::vector>& grads, // NOLINT - bool create_graph = false, bool is_new_grad = false) = 0; + virtual paddle::small_vector, + kSlotSmallVectorSize> + operator()(paddle::small_vector, + kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) = 0; virtual void ClearTensorWrappers() = 0; @@ -118,17 +196,6 @@ class GradNodeBase { * **/ virtual std::shared_ptr Copy() const = 0; - /** - * AddEdges is designed to set input tensors' backward Node as current - * node's Edges. - * This method should be call in forward code and for double backward depends - * computation. - * - * This one is called slot by slot - * **/ - void AddEdges(std::vector* metas, size_t slot_id); - void AddEdges(AutogradMeta* meta, size_t slot_id); - - // adj_edges were moved inside OutputMeta(), so there is no direct access + // from GradNodeBase. 
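// Illustrative sketch (not part of the patch): with adj_edges_ folded into
// the output metas, backward traversal reads edges through OutputMeta()
// instead of the removed GetEdges(), mirroring the loops this patch adds in
// paddle/fluid/eager/backward.cc. The helper function is ours.
#include "paddle/fluid/eager/grad_node_info.h"

static void VisitNextNodes(egr::GradNodeBase* node) {
  const auto& metas = node->OutputMeta();
  for (const auto& slot_metas : metas) {
    for (const egr::GradSlotMeta& meta : slot_metas) {
      const egr::Edge& edge = meta.GetEdge();
      if (!edge.IsInitialized()) continue;  // leaf tensor, no node attached
      egr::GradNodeBase* next = edge.GetMutableGradNode().get();
      VLOG(6) << "Next node: " << next->name();
    }
  }
}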
// To access Edges, get GradSlotMeta by calling OutputMeta(), then use @@ -136,10 +203,15 @@ class GradNodeBase { /** * Get Input Meta of current Grad node**/ - const std::vector>& InputMeta() const; + const paddle::small_vector, kSlotSmallVectorSize>& + InputMeta() const; /** * Get Output Meta of current Grad node**/ - const std::vector>& OutputMeta() const; + const paddle::small_vector, kSlotSmallVectorSize>& + OutputMeta() const; + + paddle::small_vector, kSlotSmallVectorSize>& + MutableOutputMeta(); /** * Set bwd ins and outs info with forward vars * **/ @@ -180,23 +252,22 @@ class GradNodeBase { * **/ inline bool GradientHooksRegistered() { return !gradient_hooks_.empty(); } - std::vector> ApplyGradientHooks( - const std::vector>& tensors); + paddle::small_vector, + kSlotSmallVectorSize> + ApplyGradientHooks( + const paddle::small_vector, + kSlotSmallVectorSize>& tensors); /** * Handle Complex - Real Type Promotion * **/ void HandleComplexGradToRealGrad( - std::vector>* out_grads); + paddle::small_vector, + kSlotSmallVectorSize>* out_grads); bool NeedComplexToRealConversion() { return need_complex_to_real_; } virtual std::string name() { return "GradNodeBase"; } - /** - * GetEdges is designed to get all edges of current node**/ - const std::vector>& GetEdges() const; - std::vector>& GetMutableEdges(); - /** * The following interfaces are designed for no_need_buffer * **/ @@ -207,18 +278,13 @@ class GradNodeBase { } private: - // TODO(zhanlve): Merge adj_edges_ into GradOutMeta - // Edges recorded the backward related node info, which indicate all edges - // linked - // by this Grad Node. - // Why we need vector>: Edges is as same rank as bwd output. - std::vector> adj_edges_; - // bwd_out_meta_ is used to record Grad output info for backward - std::vector> bwd_out_meta_; + paddle::small_vector, kSlotSmallVectorSize> + bwd_out_meta_; // bwd_in_meta_ used to record Grad input info for backward - std::vector> bwd_in_meta_; + paddle::small_vector, kSlotSmallVectorSize> + bwd_in_meta_; // Gradient Hooks // Customer may register a list of hooks which will be called in order during // backward @@ -235,71 +301,6 @@ class GradNodeBase { bool is_tensor_wrappers_cleared_ = false; }; -class Edge { - public: - // Default constructor for Edges in order to construct it for AutogradMeta - Edge() : in_slot_id_(0), in_rank_(0), grad_node_(nullptr) {} - - // In real use cases we should create Edge from grad node and input rank which - // indicate which edge it is. - // Since we have slot design in operators we will have to locate an edge with - // slot - // and rank. 
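// Illustrative sketch (not part of the patch): paddle::small_vector<T, N>
// (assumed to come from paddle/utils/small_vector.h) keeps up to N elements
// in inline storage and only falls back to the heap past that, so the slot
// containers above avoid an allocation for typical slot counts;
// egr::kSlotSmallVectorSize comes from the global_utils.h include this patch
// adds. The alias and helper below are ours.
using SlotMetaVector =
    paddle::small_vector<std::vector<egr::GradSlotMeta>,
                         egr::kSlotSmallVectorSize>;

static size_t TotalEdgeCount(const SlotMetaVector& metas) {
  size_t n = 0;
  for (const auto& slot_metas : metas) n += slot_metas.size();
  return n;  // one GradSlotMeta (and thus one Edge) per backward output
}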
- Edge(const std::shared_ptr& grad_node, size_t in_slot_id, - size_t in_rank) - : in_slot_id_(in_slot_id), in_rank_(in_rank), grad_node_(grad_node) {} - - Edge(const std::shared_ptr& grad_node, - const std::pair& rank_info) - : in_slot_id_(rank_info.first), - in_rank_(rank_info.second), - grad_node_(grad_node) {} - - GradNodeBase* GetGradNode() const { return grad_node_.get(); } - - std::shared_ptr GetMutableGradNode() const { - return grad_node_; - } - - void SetGradNode(const std::shared_ptr& node) { - VLOG(6) << "Reseting Edge's Grad Node"; - grad_node_ = node; - } - - std::pair GetEdgeRankInfo() const { - return std::make_pair(in_slot_id_, in_rank_); - } - - void SetEdgeRankInfo(size_t slot_id, size_t in_rank) { - in_slot_id_ = slot_id; - in_rank_ = in_rank; - } - - void SetEdgeRankInfo( - const std::pair& edge_rank) { - in_slot_id_ = edge_rank.first; - in_rank_ = edge_rank.second; - } - - // Currently we use grad_node_ to identify if a edge is initialized. - bool IsInitialized() const { - if (!grad_node_) { - return false; - } else { - if (!(grad_node_.get())) { - return false; - } else { - return true; - } - } - } - - private: - size_t in_slot_id_; - size_t in_rank_; - std::shared_ptr grad_node_{nullptr}; -}; - inline void CheckTensor(const paddle::experimental::Tensor& pre, const paddle::experimental::Tensor& post) { if (!pre.initialized() && post.initialized()) { diff --git a/paddle/fluid/eager/grad_tensor_holder.h b/paddle/fluid/eager/grad_tensor_holder.h index 80b7c59df8fa0..a9800afc626c9 100644 --- a/paddle/fluid/eager/grad_tensor_holder.h +++ b/paddle/fluid/eager/grad_tensor_holder.h @@ -27,7 +27,8 @@ namespace egr { class GradTensorHolder { public: explicit GradTensorHolder( - const std::vector>& metas) { + const paddle::small_vector, + kSlotSmallVectorSize>& metas) { VLOG(7) << "Init GradTensorHolder with meta size: " << metas.size(); buffer_.resize(metas.size()); for (size_t i = 0; i < buffer_.size(); i++) { @@ -39,7 +40,8 @@ class GradTensorHolder { GradTensorHolder(const GradTensorHolder& other) = default; explicit GradTensorHolder( - std::vector>&& inputs) + paddle::small_vector, + kSlotSmallVectorSize>&& inputs) : buffer_(std::move(inputs)) {} GradTensorHolder& operator=(const GradTensorHolder& other) = default; @@ -56,14 +58,18 @@ class GradTensorHolder { return buffer_[pos]; } - std::vector>& Buffers() { + paddle::small_vector, + kSlotSmallVectorSize>& + Buffers() { return buffer_; } void SetBufferSlotRankZeros(size_t slot_id, size_t rank); private: - std::vector> buffer_; + paddle::small_vector, + kSlotSmallVectorSize> + buffer_; }; } // namespace egr diff --git a/paddle/fluid/eager/pylayer/py_layer_node.cc b/paddle/fluid/eager/pylayer/py_layer_node.cc index 29e98483ed6cf..fad4fd50a5e3e 100644 --- a/paddle/fluid/eager/pylayer/py_layer_node.cc +++ b/paddle/fluid/eager/pylayer/py_layer_node.cc @@ -29,14 +29,18 @@ #include "pybind11/pytypes.h" namespace egr { -std::vector> GradNodePyLayer:: -operator()( - std::vector>& grads, // NOLINT - bool create_graph, bool is_new_grad) { +paddle::small_vector, + kSlotSmallVectorSize> +GradNodePyLayer::operator()( + paddle::small_vector, + kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph, + bool is_new_grad) { VLOG(3) << "Running Eager Backward Node: " << name(); - std::vector> hooked_grads = - GradNodePyLayer::ApplyGradientHooks(grads); + paddle::small_vector, + kSlotSmallVectorSize> + hooked_grads = GradNodePyLayer::ApplyGradientHooks(grads); paddle::pybind::PyLayerObject* ctx = reinterpret_cast(ctx_); @@ -124,7 +128,9 
@@ operator()( ctx->forward_input_tensor_is_duplicable.size(), outputs_size)); } - std::vector> grad_out; + paddle::small_vector, + kSlotSmallVectorSize> + grad_out; grad_out.reserve(ctx->forward_input_tensor_is_duplicable.size()); for (size_t i = 0; i < ctx->forward_input_tensor_is_duplicable.size(); i++) { if (i < outputs_size) { diff --git a/paddle/fluid/eager/pylayer/py_layer_node.h b/paddle/fluid/eager/pylayer/py_layer_node.h index 40291afaba421..b477d7a9ad996 100644 --- a/paddle/fluid/eager/pylayer/py_layer_node.h +++ b/paddle/fluid/eager/pylayer/py_layer_node.h @@ -32,11 +32,17 @@ class GradNodePyLayer : public GradNodeBase { ctx_ = ctx; } - ~GradNodePyLayer() override { Py_DECREF(ctx_); }; + ~GradNodePyLayer() override { + Py_DECREF(ctx_); + Py_XDECREF(outputs_); + }; - virtual std::vector> operator()( - std::vector>& grads, // NOLINT - bool create_graph = false, bool is_new_grad = false) override; + virtual paddle::small_vector, + kSlotSmallVectorSize> + operator()(paddle::small_vector, + kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) override; void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 3ee1603a53ab4..f13fcfa990057 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -88,7 +88,7 @@ class TensorWrapper { } else { intermidiate_tensor_.set_impl(tensor.impl()); } - + // TODO(jiabin): This may has server performance issue intermidiate_tensor_.set_name(tensor.name() + "@Saved"); auto* tensor_autograd_meta = EagerUtils::nullable_autograd_meta(tensor); diff --git a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc index 6c6c7fd25e5e5..f9f00749dc87b 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc @@ -80,14 +80,18 @@ TEST(AccumulationNode, Tensor) { grad_meta->SetStopGradient(false); // operator() - std::vector> et0_vec = {{et0}}; + paddle::small_vector, + kSlotSmallVectorSize> + et0_vec = {{et0}}; paddle::experimental::Tensor ret_et0 = node->operator()(et0_vec)[0][0]; auto* ret_et0_ptr = std::dynamic_pointer_cast(ret_et0.impl()) ->data(); CHECK_EQ(ret_et0_ptr[0], paddle::platform::float16(10.0f)); - std::vector> et1_vec = {{et1}}; + paddle::small_vector, + kSlotSmallVectorSize> + et1_vec = {{et1}}; paddle::experimental::Tensor ret_et1 = node->operator()(et1_vec)[0][0]; auto* ret_et1_ptr = diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc index d592b5ccf66ff..6687b6621ad54 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc @@ -34,7 +34,9 @@ void TestGradNodeBase(bool is_remove_gradient_hook) { auto grad_test_node0 = std::make_shared( /* val */ 5.0, /* in_num */ 2, /* out_num */ 2); auto grad_test_node1 = std::make_shared(); - std::vector> grads; + paddle::small_vector, + egr::kSlotSmallVectorSize> + grads; phi::DenseTensorMeta meta = phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); std::shared_ptr dt = std::make_shared( @@ -51,28 +53,9 @@ void TestGradNodeBase(bool is_remove_gradient_hook) { CHECK_EQ(std::dynamic_pointer_cast(res[0][0].impl()) 
->data()[0], 6.0f); - VLOG(6) << "Test Add Edges"; - egr::Edge tmp_edge0(grad_test_node1, 1, 2); - auto auto_grad0 = std::make_shared(tmp_edge0); - auto_grad0->SetStopGradient(false); - egr::Edge tmp_edge1(grad_test_node1, 3, 4); auto auto_grad1 = std::make_shared(tmp_edge1); et1.set_autograd_meta(auto_grad1); - auto_grad1->SetStopGradient(false); - grad_test_node0->AddEdges(auto_grad0.get(), 0); - - CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().first, - size_t(1)); - CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().second, - size_t(2)); - std::vector metas = {auto_grad1.get()}; - - grad_test_node0->AddEdges(&metas, 1); - CHECK_EQ(grad_test_node0->GetEdges()[1][0].GetEdgeRankInfo().first, - size_t(3)); - CHECK_EQ(grad_test_node0->GetEdges()[1][0].GetEdgeRankInfo().second, - size_t(4)); VLOG(6) << "Test Set Meta and Get Meta"; auto_grad1->SetStopGradient(true); diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h index 6237944aa44f3..a00e629d1029a 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h @@ -31,9 +31,12 @@ class GradTestNode : public egr::GradNodeBase { : GradNodeBase(in_num, out_num), val_(val) {} GradTestNode() : GradNodeBase() { val_ = 1.0; } std::string name() override { return "GradTestNode"; } - std::vector> operator()( - std::vector>& grads, // NOLINT - bool create_graph = false, bool is_new_grad = false) override { + paddle::small_vector, + egr::kSlotSmallVectorSize> + operator()(paddle::small_vector, + egr::kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) override { val_ = std::dynamic_pointer_cast(grads[0][0].impl()) ->data()[0]; phi::DenseTensorMeta meta = @@ -46,7 +49,9 @@ class GradTestNode : public egr::GradNodeBase { auto* dt_ptr = dt->mutable_data(paddle::platform::CPUPlace()); dt_ptr[0] = 6.0f; paddle::experimental::Tensor et1(dt); - std::vector> res = {{et1}}; + paddle::small_vector, + egr::kSlotSmallVectorSize> + res = {{et1}}; return res; } void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc index 7d2aafc63628e..0fe349294b438 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc @@ -45,7 +45,9 @@ TEST(GradTensorHolder, Constructor) { meta); paddle::experimental::Tensor et = paddle::experimental::Tensor(dt); - std::vector> inputs; + paddle::small_vector, + kSlotSmallVectorSize> + inputs; inputs.push_back({et}); GradTensorHolder grad_tensor_holder4 = GradTensorHolder(std::move(inputs)); diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index 8c127efa4f7f3..7552ad83fa20f 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -76,8 +76,7 @@ TEST(Backward, SingleNodeEmptyGrad) { auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); auto_grad_meta1->SetStopGradient(false); - std::vector res = {auto_grad_meta1}; - node0_ptr->AddEdges(&res, 0); + node0_ptr->SetGradOutMeta({leaf_tensor}, 0); } std::vector outs = {target_tensor}; // Run Backward @@ -135,8 +134,7 @@ TEST(Backward, 
SingleNodeCustomGrad) { std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); auto_grad_meta1->SetStopGradient(false); - std::vector res = {auto_grad_meta1}; - node0_ptr->AddEdges(&res, 0); + node0_ptr->SetGradOutMeta({leaf_tensor}, 0); } // Run Backward @@ -191,12 +189,12 @@ TEST(Backward, LinearNodes) { auto_grad_meta->SetSingleOutRankWithSlot(0, 0); auto_grad_meta->SetStopGradient(false); // Connect Node0 -> Node1 via Edge - auto meta0 = egr::AutogradMeta(); - meta0.SetStopGradient(false); - meta0.SetSingleOutRankWithSlot(0, 0); - meta0.SetGradNode(node1_ptr); - std::vector res0 = {&meta0}; - node0_ptr->AddEdges(&res0, 0); + auto tmp_tensor = paddle::experimental::Tensor(); + auto* meta0 = EagerUtils::autograd_meta(&tmp_tensor); + meta0->SetStopGradient(false); + meta0->SetSingleOutRankWithSlot(0, 0); + meta0->SetGradNode(node1_ptr); + node0_ptr->SetGradOutMeta(tmp_tensor, 0); AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); // Connect Tensor and AccumulationNode via AutoGradMeta @@ -208,8 +206,7 @@ TEST(Backward, LinearNodes) { auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); auto_grad_meta1->SetStopGradient(false); - std::vector res1 = {auto_grad_meta1}; - node1_ptr->AddEdges(&res1, 0); + node1_ptr->SetGradOutMeta(leaf_tensor, 0); } // Use Empty Grad Tensor @@ -288,20 +285,20 @@ TEST(Backward, WithAccumulation) { auto_grad_meta1->SetStopGradient(false); // Connect Node0 -> Node2 via Edge - auto meta0 = egr::AutogradMeta(); - meta0.SetStopGradient(false); - meta0.SetSingleOutRankWithSlot(0, 0); - meta0.SetGradNode(node2_ptr); - std::vector res0 = {&meta0}; - node0_ptr->AddEdges(&res0, 0); + auto tmp_tensor0 = paddle::experimental::Tensor(); + auto* meta0 = EagerUtils::autograd_meta(&tmp_tensor0); + meta0->SetStopGradient(false); + meta0->SetSingleOutRankWithSlot(0, 0); + meta0->SetGradNode(node2_ptr); + node0_ptr->SetGradOutMeta(tmp_tensor0, 0); // Connect Node1 -> Node2 via Edge - auto meta1 = egr::AutogradMeta(); - meta1.SetStopGradient(false); - meta1.SetSingleOutRankWithSlot(0, 0); - meta1.SetGradNode(node2_ptr); - std::vector res1 = {&meta1}; - node1_ptr->AddEdges(&res1, 0); + auto tmp_tensor1 = paddle::experimental::Tensor(); + auto* meta1 = EagerUtils::autograd_meta(&tmp_tensor1); + meta1->SetStopGradient(false); + meta1->SetSingleOutRankWithSlot(0, 0); + meta1->SetGradNode(node2_ptr); + node1_ptr->SetGradOutMeta(tmp_tensor1, 0); AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor); // Connect Tensor and AccumulationNode via AutoGradMeta @@ -314,7 +311,7 @@ TEST(Backward, WithAccumulation) { auto_grad_meta2->SetStopGradient(false); std::vector res2 = {auto_grad_meta2}; - node2_ptr->AddEdges(&res2, 0); + node2_ptr->SetGradOutMeta(leaf_tensor, 0); } Backward(target_tensors, grad_tensors); diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index 8b0759c17ed37..4337c0d092ca0 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -69,7 +69,7 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { meta->SetSingleOutRankWithSlot(0, 0); meta->SetGradNode(acc_node_ptr); std::vector res = {meta}; - scale_node_ptr->AddEdges(&res, 0); + scale_node_ptr->SetGradOutMeta(leaf_tensor, 0); Backward(target_tensors, {}); diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc 
b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc index 0bd1f3bdb36aa..bcb9820419d0f 100644 --- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc @@ -251,10 +251,11 @@ TEST(EagerUtils, GetGradAccumulationNode) { } TEST(EagerUtils, FillZeroForEmptyGradInputs) { - std::vector> grads = { - std::vector(1)}; - std::vector> slot_metas = { - std::vector(1)}; + paddle::small_vector, + egr::kSlotSmallVectorSize> + grads = {std::vector(1)}; + paddle::small_vector, egr::kSlotSmallVectorSize> + slot_metas = {std::vector(1)}; phi::DenseTensorMeta tensor_meta; tensor_meta.dtype = paddle::experimental::DataType::FLOAT32; diff --git a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc index dc44d95daac1d..4cb316380aade 100644 --- a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc +++ b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc @@ -137,12 +137,16 @@ TEST(Forward, LinearNodes) { // 2. TensorWrapper: No TensorWrapper for ScaleNode // 3. NextEdges: Node 1 -> Node 0 - const std::vector>& node1_edges = grad_node1->GetEdges(); - const auto& node1_edge = node1_edges[0]; - - CHECK_EQ(static_cast(node1_edge[0].GetEdgeRankInfo().first), 0); - CHECK_EQ(static_cast(node1_edge[0].GetEdgeRankInfo().second), 0); - CHECK_EQ(node1_edge[0].GetGradNode(), grad_node0); + const paddle::small_vector, + egr::kSlotSmallVectorSize>& node1_metas = + grad_node1->OutputMeta(); + const auto& node1_meta = node1_metas[0]; + + CHECK_EQ(static_cast(node1_meta[0].GetEdge().GetEdgeRankInfo().first), + 0); + CHECK_EQ(static_cast(node1_meta[0].GetEdge().GetEdgeRankInfo().second), + 0); + CHECK_EQ(node1_meta[0].GetEdge().GetGradNode(), grad_node0); } } @@ -232,16 +236,19 @@ TEST(Forward, BranchedNodes) { // 2. TensorWrapper: No TensorWrapper for ScaleNode // 3. 
NextEdges // Node 1 -> Node 0 - const std::vector>& node1_edges = grad_node1->GetEdges(); - const Edge& node1_edge = node1_edges[0][0]; + const paddle::small_vector, kSlotSmallVectorSize>& + node1_metas = grad_node1->OutputMeta(); + const Edge& node1_edge = node1_metas[0][0].GetEdge(); CHECK_EQ(static_cast(node1_edge.GetEdgeRankInfo().first), 0); CHECK_EQ(static_cast(node1_edge.GetEdgeRankInfo().second), 0); CHECK_EQ(node1_edge.GetGradNode(), grad_node0); // Node 2 -> Node 0 - const std::vector>& node2_edges = grad_node2->GetEdges(); - const Edge& node2_edge = node2_edges[0][0]; + const paddle::small_vector, + egr::kSlotSmallVectorSize>& node2_metas = + grad_node2->OutputMeta(); + const Edge& node2_edge = node2_metas[0][0].GetEdge(); CHECK_EQ(static_cast(node2_edge.GetEdgeRankInfo().first), 0); CHECK_EQ(static_cast(node2_edge.GetEdgeRankInfo().second), 0); diff --git a/paddle/fluid/eager/tests/task_tests/grad_test.cc b/paddle/fluid/eager/tests/task_tests/grad_test.cc index 7e64c65d8205e..72a94b40ed753 100644 --- a/paddle/fluid/eager/tests/task_tests/grad_test.cc +++ b/paddle/fluid/eager/tests/task_tests/grad_test.cc @@ -87,7 +87,7 @@ TEST(Grad, SingleNodeEmptyGrad) { // grad_node Add Edges std::vector res = {auto_grad_meta1}; - node0_ptr->AddEdges(&res, 0); + node0_ptr->SetGradOutMeta(leaf_tensor, 0); } std::vector outs = {output_tensor}; @@ -150,7 +150,7 @@ TEST(Grad, SingleNodeCustomGrad) { auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); auto_grad_meta1->SetStopGradient(false); std::vector res = {auto_grad_meta1}; - node0_ptr->AddEdges(&res, 0); + node0_ptr->SetGradOutMeta(leaf_tensor, 0); } auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors); @@ -207,12 +207,12 @@ TEST(Grad, LinearNodes) { auto_grad_meta->SetSingleOutRankWithSlot(0, 0); auto_grad_meta->SetStopGradient(false); // Connect Node0 -> Node1 via Edge - auto meta0 = egr::AutogradMeta(); - meta0.SetStopGradient(false); - meta0.SetSingleOutRankWithSlot(0, 0); - meta0.SetGradNode(node1_ptr); - std::vector res0 = {&meta0}; - node0_ptr->AddEdges(&res0, 0); + auto tmp_tensor = paddle::experimental::Tensor(); + auto* meta0 = EagerUtils::autograd_meta(&tmp_tensor); + meta0->SetStopGradient(false); + meta0->SetSingleOutRankWithSlot(0, 0); + meta0->SetGradNode(node1_ptr); + node0_ptr->SetGradOutMeta(tmp_tensor, 0); AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); // Connect Tensor and AccumulationNode via AutoGradMeta @@ -224,8 +224,7 @@ TEST(Grad, LinearNodes) { auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); auto_grad_meta1->SetStopGradient(false); - std::vector res1 = {auto_grad_meta1}; - node1_ptr->AddEdges(&res1, 0); + node1_ptr->SetGradOutMeta(leaf_tensor, 0); } // Use Empty Grad Tensor @@ -304,20 +303,20 @@ TEST(Grad, WithAccumulation) { auto_grad_meta1->SetStopGradient(false); // Connect Node0 -> Node2 via Edge - auto meta0 = egr::AutogradMeta(); - meta0.SetStopGradient(false); - meta0.SetSingleOutRankWithSlot(0, 0); - meta0.SetGradNode(node2_ptr); - std::vector res0 = {&meta0}; - node0_ptr->AddEdges(&res0, 0); + auto tmp_tensor0 = paddle::experimental::Tensor(); + auto* meta0 = EagerUtils::autograd_meta(&tmp_tensor0); + meta0->SetStopGradient(false); + meta0->SetSingleOutRankWithSlot(0, 0); + meta0->SetGradNode(node2_ptr); + node0_ptr->SetGradOutMeta(tmp_tensor0, 0); // Connect Node1 -> Node2 via Edge - auto meta1 = egr::AutogradMeta(); - meta1.SetStopGradient(false); - meta1.SetSingleOutRankWithSlot(0, 0); - meta1.SetGradNode(node2_ptr); - std::vector res1 = {&meta1}; - 
node1_ptr->AddEdges(&res1, 0); + auto tmp_tensor1 = paddle::experimental::Tensor(); + auto meta1 = EagerUtils::autograd_meta(&tmp_tensor1); + meta1->SetStopGradient(false); + meta1->SetSingleOutRankWithSlot(0, 0); + meta1->SetGradNode(node2_ptr); + node1_ptr->SetGradOutMeta(tmp_tensor1, 0); AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor); // Connect Tensor and AccumulationNode via AutoGradMeta @@ -329,8 +328,7 @@ TEST(Grad, WithAccumulation) { auto_grad_meta2->SetSingleOutRankWithSlot(0, 0); auto_grad_meta2->SetStopGradient(false); - std::vector res2 = {auto_grad_meta2}; - node2_ptr->AddEdges(&res2, 0); + node2_ptr->SetGradOutMeta(leaf_tensor, 0); } auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index 2c53fc89f650e..855fe526c10c8 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -110,21 +110,20 @@ TEST(RetainGrad, HookBeforeRetainGrad) { paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor(); { // AccumulationNode Hook: +3 + auto tmp_tensor0 = paddle::experimental::Tensor(); + auto auto_grad_meta = EagerUtils::autograd_meta(&tmp_tensor0); - auto auto_grad_meta = std::make_shared(); - - auto acc_node_ptr = - std::make_shared(auto_grad_meta.get()); + auto acc_node_ptr = std::make_shared(auto_grad_meta); auto_grad_meta->SetStopGradient(false); auto_grad_meta->SetGradNode(acc_node_ptr); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); - std::vector res = {auto_grad_meta.get()}; - scale_node_ptr->AddEdges(&res, 0); + std::vector res = {auto_grad_meta}; + scale_node_ptr->SetGradOutMeta(tmp_tensor0, 0); leaf_tensor.set_autograd_meta( std::dynamic_pointer_cast( - auto_grad_meta)); + tmp_tensor0.mutable_autograd_meta())); egr_utils_api::RegisterGradientHookForTensor( leaf_tensor, std::make_shared(hook_function)); @@ -181,19 +180,17 @@ TEST(RetainGrad, HookAfterRetainGrad) { paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor(); { // AccumulationNode Hook: +3 - - auto auto_grad_meta = std::make_shared(); - auto acc_node_ptr = - std::make_shared(auto_grad_meta.get()); + auto tmp_tensor0 = paddle::experimental::Tensor(); + auto auto_grad_meta = EagerUtils::autograd_meta(&tmp_tensor0); + auto acc_node_ptr = std::make_shared(auto_grad_meta); auto_grad_meta->SetGradNode(acc_node_ptr); auto_grad_meta->SetStopGradient(false); - std::vector res = {auto_grad_meta.get()}; - scale_node_ptr->AddEdges(&res, 0); + scale_node_ptr->SetGradOutMeta(tmp_tensor0, 0); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); leaf_tensor.set_autograd_meta( std::dynamic_pointer_cast( - auto_grad_meta)); + tmp_tensor0.mutable_autograd_meta())); egr_utils_api::RegisterGradientHookForTensor( leaf_tensor, std::make_shared(hook_function)); diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index 416739bbbb177..6b0a84835045c 100644 --- a/paddle/fluid/eager/to_static/run_program_op_func.h +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -69,9 +69,6 @@ inline void run_program_dygraph_function( grad_node->SetGradOutMeta(params, /*slot id*/ 1); grad_node->SetGradInMeta(deref_out, 0); - // Set Next Edges - grad_node->AddEdges(&p_autograd_x, /*slot id*/ 0); - grad_node->AddEdges(&p_autograd_params, /*slot id*/ 1); egr::EagerUtils::SetOutRankWithSlot(&p_autograd_outs, 0); diff --git 
a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 180e18f22ea2b..fe1cdefb7d572 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -364,12 +364,16 @@ class GradNodeRunProgram : public egr::GradNodeBase { ~GradNodeRunProgram() override = default; // Functor: perform backward computations - virtual std::vector> operator()( - std::vector> &grads, // NOLINT - bool create_graph, bool is_new_grad) override { + virtual paddle::small_vector, + egr::kSlotSmallVectorSize> + operator()(paddle::small_vector, + egr::kSlotSmallVectorSize> &grads, // NOLINT + bool create_graph, + bool is_new_grad) override { VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; - std::vector> hooked_grads = - GradNodeRunProgram::ApplyGradientHooks(grads); + paddle::small_vector, + egr::kSlotSmallVectorSize> + hooked_grads = GradNodeRunProgram::ApplyGradientHooks(grads); PADDLE_ENFORCE_EQ(hooked_grads.size(), 1, paddle::platform::errors::InvalidArgument( "The hooked_grads.size() of RunProgramGradOp should " diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 66d877f06e21d..033af5c496c98 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -441,8 +441,10 @@ std::shared_ptr EagerUtils::GetGradAccumulationNode( } void EagerUtils::FillZeroForEmptyGradInputs( - std::vector>* in_grads, - const std::vector>& grad_in_metas) { + paddle::small_vector, + kSlotSmallVectorSize>* in_grads, + const paddle::small_vector, kSlotSmallVectorSize>& + grad_in_metas) { for (size_t i = 0; i < in_grads->size(); i++) { for (size_t j = 0; j < (*in_grads)[i].size(); j++) { paddle::experimental::Tensor& grad = (*in_grads)[i][j]; diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index 51a322c8524ac..ef2b1baac661b 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -234,8 +234,10 @@ class EagerUtils { * Fill Zero * **/ static void FillZeroForEmptyGradInputs( - std::vector>* out_grads, - const std::vector>& grad_out_metas); + paddle::small_vector, + kSlotSmallVectorSize>* out_grads, + const paddle::small_vector, + kSlotSmallVectorSize>& grad_out_metas); }; } // namespace egr diff --git a/paddle/fluid/framework/attribute.cc b/paddle/fluid/framework/attribute.cc index cf7a7c3c9f43d..2599e3232cac7 100644 --- a/paddle/fluid/framework/attribute.cc +++ b/paddle/fluid/framework/attribute.cc @@ -18,35 +18,37 @@ namespace paddle { namespace framework { paddle::any GetAttrValue(const Attribute& attr) { - if (attr.type() == typeid(int)) { - return paddle::any(BOOST_GET_CONST(int, attr)); - } else if (attr.type() == typeid(float)) { - return paddle::any(BOOST_GET_CONST(float, attr)); - } else if (attr.type() == typeid(std::string)) { - return paddle::any(BOOST_GET_CONST(std::string, attr)); - } else if (attr.type() == typeid(std::vector)) { - return paddle::any(BOOST_GET_CONST(std::vector, attr)); - } else if (attr.type() == typeid(std::vector)) { - return paddle::any(BOOST_GET_CONST(std::vector, attr)); - } else if (attr.type() == typeid(std::vector)) { - return paddle::any(BOOST_GET_CONST(std::vector, attr)); - } else if (attr.type() == typeid(bool)) { - return paddle::any(BOOST_GET_CONST(bool, attr)); - } else if (attr.type() == typeid(std::vector)) { - return paddle::any(BOOST_GET_CONST(std::vector, attr)); - } else if (attr.type() == typeid(BlockDesc*)) { - return paddle::any(BOOST_GET_CONST(BlockDesc*, attr)); - } else if 
(attr.type() == typeid(int64_t)) { - return paddle::any(BOOST_GET_CONST(int64_t, attr)); - } else if (attr.type() == typeid(std::vector)) { - return paddle::any(BOOST_GET_CONST(std::vector, attr)); - } else if (attr.type() == typeid(std::vector)) { - return paddle::any(BOOST_GET_CONST(std::vector, attr)); - } else if (attr.type() == typeid(std::vector)) { - return paddle::any(BOOST_GET_CONST(std::vector, attr)); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("Unsupported Attribute value type.")); + switch (AttrTypeID(attr)) { + case proto::AttrType::INT: + return BOOST_GET_CONST(int, attr); + case proto::AttrType::FLOAT: + return BOOST_GET_CONST(float, attr); + case proto::AttrType::STRING: + return BOOST_GET_CONST(std::string, attr); + case proto::AttrType::INTS: + return BOOST_GET_CONST(std::vector, attr); + case proto::AttrType::FLOATS: + return BOOST_GET_CONST(std::vector, attr); + case proto::AttrType::STRINGS: + return BOOST_GET_CONST(std::vector, attr); + case proto::AttrType::BOOLEAN: + return BOOST_GET_CONST(bool, attr); + case proto::AttrType::BOOLEANS: + return BOOST_GET_CONST(std::vector, attr); + case proto::AttrType::LONG: + return BOOST_GET_CONST(int64_t, attr); + case proto::AttrType::LONGS: + return BOOST_GET_CONST(std::vector, attr); + case proto::AttrType::FLOAT64S: + return BOOST_GET_CONST(std::vector, attr); + case proto::AttrType::BLOCK: + return BOOST_GET_CONST(BlockDesc*, attr); + case proto::AttrType::BLOCKS: + return BOOST_GET_CONST(std::vector, attr); + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported Attribute value type `%s` for phi.", + platform::demangle(attr.type().name()))); } } diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h index 6c4171a5b896a..2164a21f3f892 100644 --- a/paddle/fluid/framework/attribute.h +++ b/paddle/fluid/framework/attribute.h @@ -242,7 +242,7 @@ class AttrReader { return *attr_value; } - inline const Attribute& GetAttr(const std::string& name) const { + const Attribute* GetAttr(const std::string& name) const { auto it = attrs_.find(name); bool found = it != attrs_.end(); if (!found) { @@ -251,11 +251,10 @@ class AttrReader { found = it != default_attrs_->end(); } } - PADDLE_ENFORCE_EQ(found, true, - platform::errors::NotFound( - "Attribute (%s) should be in AttributeMap.", name)); - - return it->second; + if (found) { + return &it->second; + } + return nullptr; } private: diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 63e289af45209..99e786d3b0201 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -125,7 +125,6 @@ void SetTensorToVariable(const Variable &in_var, const Tensor &tensor, #ifdef PADDLE_WITH_MKLDNN tran_lod_tensor->set_mem_desc(in_lod_tensor.mem_desc()); #endif - tran_lod_tensor->set_layout(in_lod_tensor.layout()); tran_lod_tensor->ShareDataWith(tensor); } else if (in_var.IsType()) { auto &in_selected_rows = in_var.Get(); diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index 6dc53c9649e9d..05215a9e5f14b 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -156,6 +156,9 @@ void DeleteUnusedTensors(const Scope &scope, for (auto &t : *lod_tensor_arr) { garbages.emplace_back(t.MoveMemoryHolder()); } + // NOTE(wangxi): need to clear the vector, otherwise lod_tensor_arr.size() is + // wrong; if size() decreases in the next step, an error may
occur. + lod_tensor_arr->clear(); } else if (var->IsType()) { } else { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h index a8fde3f36bc6d..e7601edb0ca07 100644 --- a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -24,7 +24,7 @@ namespace paddle { namespace framework { struct GpuPsGraphNode { int64_t node_id; - int neighbor_size, neighbor_offset; + unsigned int neighbor_size, neighbor_offset; // this node's neighbor is stored on [neighbor_offset,neighbor_offset + // neighbor_size) of int64_t *neighbor_list; }; @@ -32,28 +32,38 @@ struct GpuPsGraphNode { struct GpuPsCommGraph { int64_t *neighbor_list; GpuPsGraphNode *node_list; - int neighbor_size, node_size; + unsigned int neighbor_size, node_size; // the size of neighbor array and graph_node_list array GpuPsCommGraph() : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, - int neighbor_size_, int node_size_) + unsigned int neighbor_size_, unsigned int node_size_) : neighbor_list(neighbor_list_), node_list(node_list_), neighbor_size(neighbor_size_), node_size(node_size_) {} + void init_on_cpu(unsigned int neighbor_size, unsigned int node_size) { + this->neighbor_size = neighbor_size; + this->node_size = node_size; + this->neighbor_list = new int64_t[neighbor_size]; + this->node_list = new paddle::framework::GpuPsGraphNode[node_size]; + } + void release_on_cpu() { + delete[] neighbor_list; + delete[] node_list; + } void display_on_cpu() { VLOG(0) << "neighbor_size = " << neighbor_size; VLOG(0) << "node_size = " << node_size; - for (int i = 0; i < neighbor_size; i++) { + for (size_t i = 0; i < neighbor_size; i++) { VLOG(0) << "neighbor " << i << " " << neighbor_list[i]; } - for (int i = 0; i < node_size; i++) { + for (size_t i = 0; i < node_size; i++) { VLOG(0) << "node i " << node_list[i].node_id << " neighbor_size = " << node_list[i].neighbor_size; std::string str; int offset = node_list[i].neighbor_offset; - for (int j = 0; j < node_list[i].neighbor_size; j++) { + for (size_t j = 0; j < node_list[i].neighbor_size; j++) { if (j > 0) str += ","; str += std::to_string(neighbor_list[j + offset]); } @@ -139,12 +149,18 @@ struct NeighborSampleQuery { }; struct NeighborSampleResult { int64_t *val; + int64_t *actual_val; int *actual_sample_size, sample_size, key_size; + int total_sample_size; std::shared_ptr val_mem, actual_sample_size_mem; + std::shared_ptr actual_val_mem; int64_t *get_val() { return val; } + int64_t get_actual_val() { return (int64_t)actual_val; } int *get_actual_sample_size() { return actual_sample_size; } int get_sample_size() { return sample_size; } int get_key_size() { return key_size; } + void set_total_sample_size(int s) { total_sample_size = s; } + int get_len() { return total_sample_size; } void initialize(int _sample_size, int _key_size, int dev_id) { sample_size = _sample_size; key_size = _key_size; @@ -165,18 +181,30 @@ struct NeighborSampleResult { int *ac_size = new int[key_size]; cudaMemcpy(ac_size, actual_sample_size, key_size * sizeof(int), cudaMemcpyDeviceToHost); // 3, 1, 3 + int total_sample_size = 0; + for (int i = 0; i < key_size; i++) { + total_sample_size += ac_size[i]; + } + int64_t *res2 = new int64_t[total_sample_size]; // r + cudaMemcpy(res2, actual_val, total_sample_size * sizeof(int64_t), + cudaMemcpyDeviceToHost); // r 
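+ // Worked example of the compacted layout (hypothetical sizes, mirroring the
+ // "// 3, 1, 3" note above): with ac_size = {3, 1, 3}, the dense `res` buffer
+ // reserves sample_size slots per key, while `res2` holds only the 7 valid
+ // samples back to back; the loop below therefore indexes it as
+ // res2[start + j] and advances start by ac_size[i] after each key.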
+ int start = 0; for (int i = 0; i < key_size; i++) { VLOG(0) << "actual sample size for " << i << "th key is " << ac_size[i]; VLOG(0) << "sampled neighbors are "; - std::string neighbor; + std::string neighbor, neighbor2; for (int j = 0; j < ac_size[i]; j++) { - if (neighbor.size() > 0) neighbor += ";"; - neighbor += std::to_string(res[i * sample_size + j]); + // if (neighbor.size() > 0) neighbor += ";"; + if (neighbor2.size() > 0) neighbor2 += ";"; // r + // neighbor += std::to_string(res[i * sample_size + j]); + neighbor2 += std::to_string(res2[start + j]); // r } - VLOG(0) << neighbor; + VLOG(0) << neighbor << " " << neighbor2; + start += ac_size[i]; // r } delete[] res; + delete[] res2; // r delete[] ac_size; VLOG(0) << " ------------------"; } diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index 7e5aa40267767..8a0088114e2ec 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -23,13 +23,18 @@ #ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { -class GpuPsGraphTable : public HeterComm { +class GpuPsGraphTable : public HeterComm { public: GpuPsGraphTable(std::shared_ptr resource, int topo_aware) - : HeterComm(1, resource) { + : HeterComm(1, resource) { load_factor_ = 0.25; rw_lock.reset(new pthread_rwlock_t()); gpu_num = resource_->total_device(); + for (int i = 0; i < gpu_num; i++) { + gpu_graph_list.push_back(GpuPsCommGraph()); + sample_status.push_back(NULL); + tables_.push_back(NULL); + } cpu_table_status = -1; if (topo_aware) { int total_gpu = resource_->total_device(); @@ -82,6 +87,8 @@ class GpuPsGraphTable : public HeterComm { // end_graph_sampling(); // } } + void build_graph_on_single_gpu(GpuPsCommGraph &g, int gpu_id); + void clear_graph_info(int gpu_id); void build_graph_from_cpu(std::vector &cpu_node_list); NodeQueryResult graph_node_sample(int gpu_id, int sample_size); NeighborSampleResult graph_neighbor_sample_v3(NeighborSampleQuery q, diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h index 1c59f318517d0..605019cb607fc 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h @@ -13,6 +13,8 @@ // limitations under the License. 
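The hunks below migrate this file's hash-table values from int to unsigned int, so a missed lookup now comes back as the all-ones bit pattern rather than a signed -1, and the kernels spell the comparison as an explicit (unsigned int)(-1) cast. A standalone sketch (plain C++, no CUDA, hypothetical names) of why that cast is the right sentinel and why writing it out is preferable to a bare -1:

#include <cassert>
#include <cstdio>
#include <limits>

int main() {
  // Casting -1 to an unsigned type is defined to wrap to the maximum
  // representable value, i.e. the all-ones pattern the table stores on miss.
  const unsigned int kNotFound = (unsigned int)(-1);
  static_assert(
      (unsigned int)(-1) == std::numeric_limits<unsigned int>::max(),
      "all-ones sentinel equals UINT_MAX");
  unsigned int val = kNotFound;  // pretend the lookup missed
  // Spelling the cast out, as the kernels do, keeps the intent obvious and
  // avoids signed/unsigned comparison warnings.
  assert(val == kNotFound);
  std::printf("not-found sentinel = %u\n", kNotFound);
  return 0;
}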
#include +#include +#include #include #pragma once #ifdef PADDLE_WITH_HETERPS @@ -30,10 +32,11 @@ sample_result is to save the neighbor sampling result, its size is len * sample_size; */ -__global__ void get_cpu_id_index(int64_t* key, int* val, int64_t* cpu_key, - int* sum, int* index, int len) { +__global__ void get_cpu_id_index(int64_t* key, unsigned int* val, + int64_t* cpu_key, int* sum, int* index, + int len) { CUDA_KERNEL_LOOP(i, len) { - if (val[i] == -1) { + if (val[i] == ((unsigned int)-1)) { int old = atomicAdd(sum, 1); cpu_key[old] = key[i]; index[old] = i; @@ -43,9 +46,9 @@ __global__ void get_cpu_id_index(int64_t* key, int* val, int64_t* cpu_key, template __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, - int* node_index, int* actual_size, - int64_t* res, int sample_len, - int n) { + unsigned int* node_index, + int* actual_size, int64_t* res, + int sample_len, int n) { assert(blockDim.x == WARP_SIZE); assert(blockDim.y == BLOCK_WARPS); @@ -55,7 +58,7 @@ __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, curand_init(blockIdx.x, threadIdx.y * WARP_SIZE + threadIdx.x, 0, &rng); while (i < last_idx) { - if (node_index[i] == -1) { + if (node_index[i] == (unsigned int)(-1)) { actual_size[i] = 0; i += BLOCK_WARPS; continue; @@ -92,13 +95,14 @@ __global__ void neighbor_sample_example_v2(GpuPsCommGraph graph, } } -__global__ void neighbor_sample_example(GpuPsCommGraph graph, int* node_index, +__global__ void neighbor_sample_example(GpuPsCommGraph graph, + unsigned int* node_index, int* actual_size, int64_t* res, int sample_len, int* sample_status, int n, int from) { int id = blockIdx.x * blockDim.y + threadIdx.y; if (id < n) { - if (node_index[id] == -1) { + if (node_index[id] == (unsigned int)(-1)) { actual_size[id] = 0; return; } @@ -374,6 +378,18 @@ __global__ void fill_dvalues(int64_t* d_shard_vals, int64_t* d_vals, } } +__global__ void fill_actual_vals(int64_t* vals, int64_t* actual_vals, + int* actual_sample_size, + int* cumsum_actual_sample_size, + int sample_size, int len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + for (int j = 0; j < actual_sample_size[i]; j++) { + actual_vals[cumsum_actual_sample_size[i] + j] = vals[sample_size * i + j]; + } + } +} + __global__ void node_query_example(GpuPsCommGraph graph, int start, int size, int64_t* res) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; @@ -382,6 +398,18 @@ __global__ void node_query_example(GpuPsCommGraph graph, int start, int size, } } +void GpuPsGraphTable::clear_graph_info(int gpu_id) { + if (tables_.size() && tables_[gpu_id] != NULL) { + delete tables_[gpu_id]; + } + auto& graph = gpu_graph_list[gpu_id]; + if (graph.neighbor_list != NULL) { + cudaFree(graph.neighbor_list); + } + if (graph.node_list != NULL) { + cudaFree(graph.node_list); + } +} void GpuPsGraphTable::clear_graph_info() { if (tables_.size()) { for (auto table : tables_) delete table; @@ -406,6 +434,46 @@ In this function, memory is allocated on each gpu to save the graphs, gpu i saves the ith graph from cpu_graph_list */ +void GpuPsGraphTable::build_graph_on_single_gpu(GpuPsCommGraph& g, int i) { + clear_graph_info(i); + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + // platform::CUDADeviceGuard guard(i); + gpu_graph_list[i] = GpuPsCommGraph(); + sample_status[i] = NULL; + tables_[i] = new Table(std::max((unsigned int)1, g.node_size) / load_factor_); + if (g.node_size > 0) { + std::vector keys; + std::vector offset; + cudaMalloc((void**)&gpu_graph_list[i].node_list, + 
g.node_size * sizeof(GpuPsGraphNode)); + cudaMemcpy(gpu_graph_list[i].node_list, g.node_list, + g.node_size * sizeof(GpuPsGraphNode), cudaMemcpyHostToDevice); + for (unsigned int j = 0; j < g.node_size; j++) { + keys.push_back(g.node_list[j].node_id); + offset.push_back(j); + } + build_ps(i, keys.data(), offset.data(), keys.size(), 1024, 8); + gpu_graph_list[i].node_size = g.node_size; + } else { + build_ps(i, NULL, NULL, 0, 1024, 8); + gpu_graph_list[i].node_list = NULL; + gpu_graph_list[i].node_size = 0; + } + if (g.neighbor_size) { + int* addr; + cudaMalloc((void**)&addr, g.neighbor_size * sizeof(int)); + cudaMemset(addr, 0, g.neighbor_size * sizeof(int)); + sample_status[i] = addr; + cudaMalloc((void**)&gpu_graph_list[i].neighbor_list, + g.neighbor_size * sizeof(int64_t)); + cudaMemcpy(gpu_graph_list[i].neighbor_list, g.neighbor_list, + g.neighbor_size * sizeof(int64_t), cudaMemcpyHostToDevice); + gpu_graph_list[i].neighbor_size = g.neighbor_size; + } else { + gpu_graph_list[i].neighbor_list = NULL; + gpu_graph_list[i].neighbor_size = 0; + } +} void GpuPsGraphTable::build_graph_from_cpu( std::vector& cpu_graph_list) { VLOG(0) << "in build_graph_from_cpu cpu_graph_list size = " @@ -418,20 +486,21 @@ void GpuPsGraphTable::build_graph_from_cpu( for (int i = 0; i < cpu_graph_list.size(); i++) { platform::CUDADeviceGuard guard(resource_->dev_id(i)); // platform::CUDADeviceGuard guard(i); - gpu_graph_list.push_back(GpuPsCommGraph()); - sample_status.push_back(NULL); - auto table = - new Table(std::max(1, cpu_graph_list[i].node_size) / load_factor_); - tables_.push_back(table); + gpu_graph_list[i] = GpuPsCommGraph(); + sample_status[i] = NULL; + // auto table = + // new Table(std::max(1, cpu_graph_list[i].node_size) / load_factor_); + tables_[i] = new Table( + std::max((unsigned int)1, cpu_graph_list[i].node_size) / load_factor_); if (cpu_graph_list[i].node_size > 0) { std::vector keys; - std::vector offset; + std::vector offset; cudaMalloc((void**)&gpu_graph_list[i].node_list, cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode)); cudaMemcpy(gpu_graph_list[i].node_list, cpu_graph_list[i].node_list, cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode), cudaMemcpyHostToDevice); - for (int j = 0; j < cpu_graph_list[i].node_size; j++) { + for (unsigned int j = 0; j < cpu_graph_list[i].node_size; j++) { keys.push_back(cpu_graph_list[i].node_list[j].node_id); offset.push_back(j); } @@ -597,15 +666,15 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample(int gpu_id, // use the key-value map to update alloc_mem_i[0,shard_len) // tables_[i]->rwlock_->RDLock(); tables_[i]->get(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), + reinterpret_cast(node.val_storage), h_right[i] - h_left[i] + 1, resource_->remote_stream(i, gpu_id)); // node.in_stream); int shard_len = h_right[i] - h_left[i] + 1; auto graph = gpu_graph_list[i]; - int* id_array = reinterpret_cast(node.val_storage); - int* actual_size_array = id_array + shard_len; - int64_t* sample_array = (int64_t*)(id_array + shard_len * 2); + unsigned int* id_array = reinterpret_cast(node.val_storage); + int* actual_size_array = (int*)(id_array + shard_len); + int64_t* sample_array = (int64_t*)(actual_size_array + shard_len); int sample_grid_size = (shard_len - 1) / dim_y + 1; dim3 block(parallel_sample_size, dim_y); dim3 grid(sample_grid_size); @@ -738,6 +807,8 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( if (shard_len == 0) { continue; } + // create_storage(gpu_id, i, shard_len * sizeof(int64_t), + 
// shard_len * (1 + sample_size) * sizeof(int64_t)); create_storage(gpu_id, i, shard_len * sizeof(int64_t), shard_len * (1 + sample_size) * sizeof(int64_t)); } @@ -760,15 +831,18 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( platform::CUDADeviceGuard guard(resource_->dev_id(i)); // If not found, val is -1. tables_[i]->get(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), + reinterpret_cast(node.val_storage), h_right[i] - h_left[i] + 1, resource_->remote_stream(i, gpu_id)); auto shard_len = h_right[i] - h_left[i] + 1; auto graph = gpu_graph_list[i]; - int* id_array = reinterpret_cast(node.val_storage); - int* actual_size_array = id_array + shard_len; - int64_t* sample_array = (int64_t*)(id_array + shard_len * 2); + // int* id_array = reinterpret_cast(node.val_storage); + // int* actual_size_array = id_array + shard_len; + // int64_t* sample_array = (int64_t*)(id_array + shard_len * 2); + unsigned int* id_array = reinterpret_cast(node.val_storage); + int* actual_size_array = (int*)(id_array + shard_len); + int64_t* sample_array = (int64_t*)(actual_size_array + shard_len); constexpr int WARP_SIZE = 32; constexpr int BLOCK_WARPS = 128 / WARP_SIZE; constexpr int TILE_SIZE = BLOCK_WARPS * 16; @@ -846,6 +920,28 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( fill_dvalues<<>>( d_shard_vals_ptr, val, d_shard_actual_sample_size_ptr, actual_sample_size, d_idx_ptr, sample_size, len); + + { + platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id)); + platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); + thrust::device_ptr t_actual_sample_size(actual_sample_size); + int total_sample_size = + thrust::reduce(t_actual_sample_size, t_actual_sample_size + len); + result.actual_val_mem = + memory::AllocShared(place, total_sample_size * sizeof(int64_t)); + result.actual_val = (int64_t*)(result.actual_val_mem)->ptr(); + + result.set_total_sample_size(total_sample_size); + + thrust::device_vector cumsum_actual_sample_size(len); + thrust::exclusive_scan(t_actual_sample_size, t_actual_sample_size + len, + cumsum_actual_sample_size.begin(), 0); + fill_actual_vals<<>>( + val, result.actual_val, actual_sample_size, + thrust::raw_pointer_cast(cumsum_actual_sample_size.data()), sample_size, + len); + } + for (int i = 0; i < total_gpu; ++i) { int shard_len = h_left[i] == -1 ? 
0 : h_right[i] - h_left[i] + 1; if (shard_len == 0) { @@ -868,13 +964,10 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int start, if (query_size <= 0) return result; int& actual_size = result.actual_sample_size; actual_size = 0; - result.initialize(query_size, resource_->dev_id(gpu_id)); - int64_t* val = result.val; // int dev_id = resource_->dev_id(gpu_id); // platform::CUDADeviceGuard guard(dev_id); - platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); - std::vector idx, gpu_begin_pos, local_begin_pos, sample_size; - int size = 0; + std::vector idx, gpu_begin_pos, local_begin_pos; + int sample_size; /* if idx[i] = a, gpu_begin_pos[i] = p1, gpu_local_begin_pos[i] = p2; @@ -898,6 +991,31 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int start, x2 = max(x1, x); return y2 - x2; }; + auto graph = gpu_graph_list[gpu_id]; + if (graph.node_size == 0) { + return result; + } + int x2, y2; + int len = range_check(start, start + query_size, 0, graph.node_size, x2, y2); + + if (len == 0) { + return result; + } + int64_t* val; + sample_size = len; + result.initialize(len, resource_->dev_id(gpu_id)); + actual_size = len; + val = result.val; + int dev_id_i = resource_->dev_id(gpu_id); + platform::CUDADeviceGuard guard(dev_id_i); + // platform::CUDADeviceGuard guard(i); + int grid_size = (len - 1) / block_size_ + 1; + node_query_example<<remote_stream(gpu_id, gpu_id)>>>( + gpu_graph_list[gpu_id], x2, len, (int64_t*)val); + cudaStreamSynchronize(resource_->remote_stream(gpu_id, gpu_id)); + return result; + /* for (int i = 0; i < gpu_graph_list.size() && query_size != 0; i++) { auto graph = gpu_graph_list[i]; if (graph.node_size == 0) { @@ -943,6 +1061,7 @@ NodeQueryResult GpuPsGraphTable::query_node_list(int gpu_id, int start, destroy_storage(gpu_id, x); } return result; + */ } } }; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu index b0899b4a7f5b3..93854d7f1ec3f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu @@ -58,6 +58,11 @@ void GraphGpuWrapper::set_device(std::vector ids) { device_id_mapping.push_back(device_id); } } +std::vector> GraphGpuWrapper::get_all_id(int type, int idx, + int slice_num) { + return ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table->get_all_id(type, idx, slice_num); +} void GraphGpuWrapper::set_up_types(std::vector &edge_types, std::vector &node_types) { id_to_edge = edge_types; @@ -76,6 +81,32 @@ void GraphGpuWrapper::set_up_types(std::vector &edge_types, this->table_feat_conf_feat_shape.resize(node_types.size()); } +void GraphGpuWrapper::make_partitions(int idx, int64_t byte_size, + int device_len) { + ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table->make_partitions(idx, byte_size, device_len); +} +int32_t GraphGpuWrapper::load_next_partition(int idx) { + return ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table->load_next_partition(idx); +} + +void GraphGpuWrapper::set_search_level(int level) { + ((GpuPsGraphTable *)graph_table)->cpu_graph_table->set_search_level(level); +} + +std::vector GraphGpuWrapper::get_partition(int idx, int num) { + return ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table->get_partition(idx, num); +} +int32_t GraphGpuWrapper::get_partition_num(int idx) { + return ((GpuPsGraphTable *)graph_table) + ->cpu_graph_table->get_partition_num(idx); +} +void GraphGpuWrapper::make_complementary_graph(int idx, int64_t byte_size) { + 
((GpuPsGraphTable *)graph_table) + ->cpu_graph_table->make_complementary_graph(idx, byte_size); +} void GraphGpuWrapper::load_edge_file(std::string name, std::string filepath, bool reverse) { // 'e' means load edge @@ -132,10 +163,11 @@ void GraphGpuWrapper::add_table_feat_conf(std::string table_name, } VLOG(0) << "add conf over"; } +void GraphGpuWrapper::init_search_level(int level) { search_level = level; } void GraphGpuWrapper::init_service() { table_proto.set_task_pool_size(24); - + table_proto.set_search_level(search_level); table_proto.set_table_name("cpu_graph_table"); table_proto.set_use_cache(false); for (int i = 0; i < id_to_edge.size(); i++) @@ -161,11 +193,16 @@ void GraphGpuWrapper::init_service() { void GraphGpuWrapper::upload_batch(int idx, std::vector> &ids) { GpuPsGraphTable *g = (GpuPsGraphTable *)graph_table; - std::vector vec; + // std::vector vec; for (int i = 0; i < ids.size(); i++) { - vec.push_back(g->cpu_graph_table->make_gpu_ps_graph(idx, ids[i])); + // vec.push_back(g->cpu_graph_table->make_gpu_ps_graph(idx, ids[i])); + GpuPsCommGraph sub_graph = + g->cpu_graph_table->make_gpu_ps_graph(idx, ids[i]); + g->build_graph_on_single_gpu(sub_graph, i); + sub_graph.release_on_cpu(); + VLOG(0) << "sub graph on gpu " << i << " is built"; } - g->build_graph_from_cpu(vec); + // g->build_graph_from_cpu(vec); } void GraphGpuWrapper::initialize() { diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h index 6972551b896ed..b638311304773 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h @@ -22,7 +22,10 @@ namespace framework { #ifdef PADDLE_WITH_HETERPS class GraphGpuWrapper { public: - char* graph_table; + static GraphGpuWrapper* GetInstance() { + static GraphGpuWrapper wrapper; + return &wrapper; + } void initialize(); void test(); void set_device(std::vector ids); @@ -34,12 +37,22 @@ class GraphGpuWrapper { std::string feat_dtype, int feat_shape); void load_edge_file(std::string name, std::string filepath, bool reverse); void load_node_file(std::string name, std::string filepath); + int32_t load_next_partition(int idx); + int32_t get_partition_num(int idx); + std::vector get_partition(int idx, int num); + void make_partitions(int idx, int64_t byte_size, int device_len); + void make_complementary_graph(int idx, int64_t byte_size); + void set_search_level(int level); + void init_search_level(int level); + std::vector> get_all_id(int type, int idx, + int slice_num); NodeQueryResult query_node_list(int gpu_id, int start, int query_size); NeighborSampleResult graph_neighbor_sample_v3(NeighborSampleQuery q, bool cpu_switch); std::vector graph_neighbor_sample(int gpu_id, std::vector& key, int sample_size); + std::unordered_map edge_to_id, feature_to_id; std::vector id_to_feature, id_to_edge; std::vector> table_feat_mapping; @@ -48,6 +61,8 @@ class GraphGpuWrapper { std::vector> table_feat_conf_feat_shape; ::paddle::distributed::GraphParameter table_proto; std::vector device_id_mapping; + int search_level = 1; + char* graph_table; }; #endif } diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index fc54be447fe17..87b62c6d380a4 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -298,6 +298,8 @@ void HashTable::update(const KeyType* d_keys, template class 
HashTable; template class HashTable; +template class HashTable; +template class HashTable; template void HashTable::get< cudaStream_t>(const unsigned long* d_keys, @@ -308,6 +310,10 @@ template void HashTable::get(const long* d_keys, int* d_vals, size_t len, cudaStream_t stream); +template void HashTable::get( + const long* d_keys, unsigned long* d_vals, size_t len, cudaStream_t stream); +template void HashTable::get( + const long* d_keys, unsigned int* d_vals, size_t len, cudaStream_t stream); // template void // HashTable::get( // const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t @@ -323,6 +329,14 @@ template void HashTable::insert(const long* d_keys, size_t len, cudaStream_t stream); +template void HashTable::insert( + const long* d_keys, const unsigned long* d_vals, size_t len, + cudaStream_t stream); + +template void HashTable::insert( + const long* d_keys, const unsigned int* d_vals, size_t len, + cudaStream_t stream); + // template void HashTable::insert< // cudaStream_t>(const unsigned long* d_keys, size_t len, char* pool, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 51432e9de81fb..7ebf7660ee521 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -584,7 +584,7 @@ void HeterComm::pull_sparse(int num, for (int i = 0; i < total_device; ++i) { int shard_len = h_right[i] - h_left[i] + 1; - if (shard_len == 0) { + if (h_left[i] == -1 || h_right[i] == -1) { continue; } create_storage(num, i, shard_len * sizeof(KeyType), @@ -630,6 +630,9 @@ void HeterComm::pull_sparse(int num, sync_stream(stream); for (int i = 0; i < total_device; ++i) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } destroy_storage(num, i); } } @@ -747,6 +750,9 @@ void HeterComm::push_sparse(int dev_num, } for (int i = 0; i < total_device; ++i) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } destroy_storage(dev_num, i); } } @@ -862,6 +868,9 @@ void HeterComm::push_sparse(int dev_num, } for (int i = 0; i < total_device; ++i) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } destroy_storage(dev_num, i); } } diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu index f35a1c41bbe1d..b3a38a6dfde49 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu @@ -28,6 +28,16 @@ namespace platform = paddle::platform; // paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( // std::vector ids) +std::string edges[] = { + std::string("0\t1"), std::string("0\t9"), std::string("1\t2"), + std::string("1\t0"), std::string("2\t1"), std::string("2\t3"), + std::string("3\t2"), std::string("3\t4"), std::string("4\t3"), + std::string("4\t5"), std::string("5\t4"), std::string("5\t6"), + std::string("6\t5"), std::string("6\t7"), std::string("7\t6"), + std::string("7\t8"), +}; +char edge_file_name[] = "edges1.txt"; + std::string nodes[] = { std::string("user\t37\ta 0.34\tb 13 14\tc hello\td abc"), std::string("user\t96\ta 0.31\tb 15 10\tc 96hello\td abcd"), @@ -53,12 +63,17 @@ std::vector user_feature_dtype = {"float32", "int32", "string", std::vector item_feature_dtype = {"float32"}; std::vector user_feature_shape = {1, 2, 1, 1}; std::vector item_feature_shape = {1}; -void prepare_file(char file_name[]) { +void prepare_file(char file_name[], bool load_edge) { 
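+ // `load_edge` selects which fixture this helper writes: the edge list
+ // ("src\tdst" pairs) used to build the test graph, or the node list with
+ // their feature strings; both paths share the ofstream boilerplate below.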
std::ofstream ofile; ofile.open(file_name); - - for (auto x : nodes) { - ofile << x << std::endl; + if (load_edge) { + for (auto x : edges) { + ofile << x << std::endl; + } + } else { + for (auto x : nodes) { + ofile << x << std::endl; + } } ofile.close(); } @@ -85,9 +100,10 @@ TEST(TEST_FLEET, test_cpu_cache) { g_f1->add_dtype(item_feature_dtype[i]); g_f1->add_shape(item_feature_shape[i]); } - prepare_file(node_file_name); + prepare_file(node_file_name, false); + prepare_file(edge_file_name, true); table_proto.set_shard_num(24); - + table_proto.set_search_level(2); std::shared_ptr resource = std::make_shared(device_id_mapping); resource->enable_p2p(); @@ -120,11 +136,14 @@ TEST(TEST_FLEET, test_cpu_cache) { } g.cpu_graph_table->build_sampler(0); ids1.push_back(5); + ids1.push_back(7); vec.push_back(g.cpu_graph_table->make_gpu_ps_graph(0, ids0)); vec.push_back(g.cpu_graph_table->make_gpu_ps_graph(0, ids1)); vec[0].display_on_cpu(); vec[1].display_on_cpu(); - g.build_graph_from_cpu(vec); + // g.build_graph_from_cpu(vec); + g.build_graph_on_single_gpu(vec[0], 0); + g.build_graph_on_single_gpu(vec[1], 1); int64_t cpu_key[3] = {0, 1, 2}; /* std::vector> buffers(3); @@ -136,20 +155,84 @@ TEST(TEST_FLEET, test_cpu_cache) { } */ void *key; - platform::CUDADeviceGuard guard(0); - cudaMalloc((void **)&key, 3 * sizeof(int64_t)); - cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); - auto neighbor_sample_res = - g.graph_neighbor_sample_v2(0, (int64_t *)key, 2, 3, true); - neighbor_sample_res.display(); - //{1,9} or {9,1} is expected for key 0 - //{0,2} or {2,0} is expected for key 1 - //{1,3} or {3,1} is expected for key 2 - auto node_query_res = g.query_node_list(0, 0, 4); - node_query_res.display(); - NeighborSampleQuery query; - query.initialize(0, node_query_res.get_val(), 2, node_query_res.get_len()); - query.display(); - auto c = g.graph_neighbor_sample_v3(query, false); - c.display(); + int device_len = 2; + for (int i = 0; i < 2; i++) { + // platform::CUDADeviceGuard guard(i); + LOG(0) << "query on card " << i; + //{1,9} or {9,1} is expected for key 0 + //{0,2} or {2,0} is expected for key 1 + //{1,3} or {3,1} is expected for key 2 + int step = 2; + int cur = 0; + while (true) { + auto node_query_res = g.query_node_list(i, cur, step); + node_query_res.display(); + if (node_query_res.get_len() == 0) { + VLOG(0) << "no more ids,break"; + break; + } + cur += node_query_res.get_len(); + NeighborSampleQuery query; + query.initialize(i, node_query_res.get_val(), 1, + node_query_res.get_len()); + query.display(); + auto c = g.graph_neighbor_sample_v3(query, false); + c.display(); + } + } + g.cpu_graph_table->set_search_level(2); + // g.cpu_graph_table->Load_to_ssd(edge_file_name,"e>u2u"); + g.cpu_graph_table->Load(edge_file_name, "e>u2u"); + g.cpu_graph_table->make_partitions(0, 64, 2); + int index = 0; + while (g.cpu_graph_table->load_next_partition(0) != -1) { + auto all_ids = g.cpu_graph_table->get_all_id(0, 0, device_len); + for (auto x : all_ids) { + for (auto y : x) { + VLOG(0) << "part " << index << " " << y; + } + } + for (int i = 0; i < all_ids.size(); i++) { + GpuPsCommGraph sub_graph = + g.cpu_graph_table->make_gpu_ps_graph(0, all_ids[i]); + g.build_graph_on_single_gpu(sub_graph, i); + VLOG(2) << "sub graph on gpu " << i << " is built"; + } + VLOG(0) << "start to iterate gpu graph node"; + g.cpu_graph_table->make_complementary_graph(0, 64); + for (int i = 0; i < 2; i++) { + // platform::CUDADeviceGuard guard(i); + LOG(0) << "query on card " << i; + int step = 2; + int 
cur = 0; + while (true) { + auto node_query_res = g.query_node_list(i, cur, step); + node_query_res.display(); + if (node_query_res.get_len() == 0) { + VLOG(0) << "no more ids,break"; + break; + } + cur += node_query_res.get_len(); + NeighborSampleQuery query, q1; + query.initialize(i, node_query_res.get_val(), 4, + node_query_res.get_len()); + query.display(); + auto c = g.graph_neighbor_sample_v3(query, true); + c.display(); + platform::CUDADeviceGuard guard(i); + int64_t *key; + VLOG(0) << "sample key 1 globally"; + g.cpu_graph_table->set_search_level(2); + cudaMalloc((void **)&key, sizeof(int64_t)); + int64_t t_key = 1; + cudaMemcpy(key, &t_key, sizeof(int64_t), cudaMemcpyHostToDevice); + q1.initialize(i, (int64_t)key, 2, 1); + auto d = g.graph_neighbor_sample_v3(q1, true); + d.display(); + cudaFree(key); + g.cpu_graph_table->set_search_level(1); + } + } + index++; + } } diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 52bfe42cc5028..64765c98fd04b 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -630,7 +630,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { #endif #ifdef PADDLE_WITH_PSCORE - auto& task_ptrs = device_task_ptrs[dev]; + auto& task_ptrs = device_task_ptrs[shard_id]; #endif int len = prefix_sum[dev][shard_id + 1] - prefix_sum[dev][shard_id]; diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 01e594a176bd0..2a8ffbf431ecd 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -52,8 +52,11 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { } paddle::any Attr(const std::string& name) const override { - auto& attr = ctx_.Attrs().GetAttr(name); - return GetAttrValue(attr); + auto* attr = ctx_.Attrs().GetAttr(name); + PADDLE_ENFORCE_NOT_NULL( + attr, platform::errors::NotFound( + "Attribute (%s) should be in AttributeMap.", name)); + return GetAttrValue(*attr); } size_t InputSize(const std::string& name) const override { @@ -450,216 +453,252 @@ CompatInferMetaContext BuildInferMetaContext(InferShapeContext* ctx, auto attr_reader = ctx->Attrs(); for (size_t i = 0; i < attr_names.size(); ++i) { auto& attr_name = attr_names[i]; - if (attr_defs[i].type_index == phi::AttributeType::INT_ARRAY) { - // When attr is a vector_tensor or tensor, transform it to IntArray - if (ctx->HasInputs(attr_name) || ctx->HasInput(attr_name)) { - auto infershape_inputs = std::move(ctx->GetInputVarPtrs(attr_name)); - if (ctx->IsRuntime()) { - // If is in runtime, we will get tensor's value for IntArray - // and push it into attrs - std::vector vars; - vars.reserve(infershape_inputs.size()); - for (size_t i = 0; i < infershape_inputs.size(); i++) { - vars.push_back(BOOST_GET_CONST(Variable*, infershape_inputs[i])); + VLOG(6) << "BuildInferMetaContext: " << attr_name << ": " + << attr_defs[i].type_index; + auto* attr_ptr = attr_reader.GetAttr(attr_name); + switch (attr_defs[i].type_index) { + case phi::AttributeType::SCALAR: + if (attr_ptr) { + auto& attr = *attr_ptr; + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::FLOAT: + infer_meta_context.EmplaceBackAttr( + phi::Scalar(BOOST_GET_CONST(float, attr))); + break; + case framework::proto::AttrType::INT: + infer_meta_context.EmplaceBackAttr( + phi::Scalar(BOOST_GET_CONST(int, attr))); + break; + case framework::proto::AttrType::STRING: + 
infer_meta_context.EmplaceBackAttr( + phi::Scalar(BOOST_GET_CONST(std::string, attr))); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to Scalar when construct " + "InferMetaContext.", + attr_name)); } - if (infershape_inputs.size() != 1) { - infer_meta_context.EmplaceBackAttr( - std::move(experimental::MakePhiIntArrayFromVarList(vars))); + } else if (ctx->HasInput(attr_name)) { + auto infershape_input = std::move(ctx->GetInputVarPtrs(attr_name)); + if (infershape_input.size() == 1) { + if (ctx->IsRuntime()) { + Variable* var = BOOST_GET_CONST(Variable*, infershape_input[0]); + infer_meta_context.EmplaceBackAttr( + std::move(experimental::MakePhiScalarFromVar(*var))); + } else { + phi::Scalar tensor_scalar(-1); + tensor_scalar.SetFromTensor(true); + infer_meta_context.EmplaceBackAttr(std::move(tensor_scalar)); + } } else { - infer_meta_context.EmplaceBackAttr( - std::move(experimental::MakePhiIntArrayFromVar(*vars[0]))); + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid input.size() when cast op attribute `%s` to Scalar, " + "expected 1, but actually is %d .", + attr_name, infershape_input.size())); } } else { - // If is not in runtime, we will set default value(-1) for IntArray - std::vector vars; - vars.reserve(infershape_inputs.size()); - for (size_t i = 0; i < infershape_inputs.size(); ++i) { - vars.push_back(BOOST_GET_CONST(VarDesc*, infershape_inputs[i])); + // do nothing, skip current attr + } + break; + case phi::AttributeType::INT_ARRAY: + // When attr is a vector_tensor or tensor, transform it to IntArray + if (attr_ptr) { + auto& attr = *attr_ptr; + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::INTS: + infer_meta_context.EmplaceBackAttr(std::move( + phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); + break; + case framework::proto::AttrType::LONGS: + infer_meta_context.EmplaceBackAttr(std::move( + phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); + break; + case framework::proto::AttrType::INT: + infer_meta_context.EmplaceBackAttr( + phi::IntArray({BOOST_GET_CONST(int, attr)})); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to IntArray when " + "construct InferMetaContext.", + attr_name)); } - - int64_t num_ele = 0; - if (vars.size() == 1) { - num_ele = 1; - const auto& tensor_dims = vars[0]->GetShape(); - for (size_t i = 0; i < tensor_dims.size(); ++i) { - num_ele *= tensor_dims[i]; + } else if (ctx->HasInputs(attr_name) || ctx->HasInput(attr_name)) { + auto infershape_inputs = std::move(ctx->GetInputVarPtrs(attr_name)); + if (ctx->IsRuntime()) { + // If is in runtime, we will get tensor's value for IntArray + // and push it into attrs + std::vector vars; + vars.reserve(infershape_inputs.size()); + for (size_t i = 0; i < infershape_inputs.size(); i++) { + vars.push_back(BOOST_GET_CONST(Variable*, infershape_inputs[i])); } - - if (num_ele <= 0) { - PADDLE_THROW(platform::errors::Unimplemented( - "Invalid number for construct phi::IntArray, expected " - "number > 0, but actually is %d. 
", - num_ele)); + if (infershape_inputs.size() != 1) { + infer_meta_context.EmplaceBackAttr( + std::move(experimental::MakePhiIntArrayFromVarList(vars))); + } else { + infer_meta_context.EmplaceBackAttr( + std::move(experimental::MakePhiIntArrayFromVar(*vars[0]))); } - } else { - num_ele = vars.size(); + // If is not in runtime, we will set default value(-1) for IntArray + std::vector vars; + vars.reserve(infershape_inputs.size()); + for (size_t i = 0; i < infershape_inputs.size(); ++i) { + vars.push_back(BOOST_GET_CONST(VarDesc*, infershape_inputs[i])); + } + + int64_t num_ele = 0; + if (vars.size() == 1) { + num_ele = 1; + const auto& tensor_dims = vars[0]->GetShape(); + for (size_t i = 0; i < tensor_dims.size(); ++i) { + num_ele *= tensor_dims[i]; + } + + if (num_ele <= 0) { + num_ele = tensor_dims.size(); + } + + } else { + num_ele = vars.size(); + } + phi::IntArray tensor_attr(std::vector(num_ele, -1)); + tensor_attr.SetFromTensor(true); + infer_meta_context.EmplaceBackAttr(std::move(tensor_attr)); } - phi::IntArray tensor_attr(std::vector(num_ele, -1)); - tensor_attr.SetFromTensor(true); - infer_meta_context.EmplaceBackAttr(std::move(tensor_attr)); - } - } else if (ctx->HasAttr(attr_name)) { - auto& attr = attr_reader.GetAttr(attr_name); - if (AttrTypeID(attr) == proto::AttrType::INTS) { - infer_meta_context.EmplaceBackAttr(std::move( - phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); - } else if (AttrTypeID(attr) == proto::AttrType::LONGS) { - infer_meta_context.EmplaceBackAttr(std::move( - phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); - } else if (AttrTypeID(attr) == proto::AttrType::INT) { - infer_meta_context.EmplaceBackAttr( - phi::IntArray({BOOST_GET_CONST(int, attr)})); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` to IntArray when " - "construct InferMetaContext.", - attr_name)); - } - } - } else if (attr_defs[i].type_index == phi::AttributeType::SCALAR) { - if (ctx->HasAttr(attr_name)) { - // TODO(chentianyu03): support other attrs later - auto& attr = attr_reader.GetAttr(attr_name); - if (AttrTypeID(attr) == proto::AttrType::FLOAT) { - infer_meta_context.EmplaceBackAttr( - phi::Scalar(BOOST_GET_CONST(float, attr))); - } else if (AttrTypeID(attr) == proto::AttrType::STRING) { - infer_meta_context.EmplaceBackAttr( - phi::Scalar(BOOST_GET_CONST(std::string, attr))); - } else if (AttrTypeID(attr) == proto::AttrType::INT) { - infer_meta_context.EmplaceBackAttr( - phi::Scalar(BOOST_GET_CONST(int, attr))); } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` to Scalar when construct " - "InferMetaContext.", - attr_name)); + // do nothing, skip current attr } - } else if (ctx->HasInput(attr_name)) { - auto infershape_input = std::move(ctx->GetInputVarPtrs(attr_name)); - if (infershape_input.size() == 1) { - if (ctx->IsRuntime()) { - Variable* var = BOOST_GET_CONST(Variable*, infershape_input[0]); - infer_meta_context.EmplaceBackAttr( - std::move(experimental::MakePhiScalarFromVar(*var))); - } else { - phi::Scalar tensor_scalar(-1); - tensor_scalar.SetFromTensor(true); - infer_meta_context.EmplaceBackAttr(std::move(tensor_scalar)); + break; + case phi::AttributeType::SCALARS: + if (attr_ptr) { + auto& attr = *attr_ptr; + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::INTS: { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + 
infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } break; + case framework::proto::AttrType::LONGS: { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } break; + case framework::proto::AttrType::FLOATS: { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } break; + case framework::proto::AttrType::FLOAT64S: { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct KernelContext.", + attr_names[i])); } } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid input.size() when cast op attribute `%s` to Scalar, " - "expected 1, but actually is %d .", - attr_name, infershape_input.size())); - } - } - } else if (attr_defs[i].type_index == phi::AttributeType::SCALARS) { - auto& attr = attr_reader.GetAttr(attr_name); - if (AttrTypeID(attr) == proto::AttrType::INTS) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); + // do nothing, skip current attr } - infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); - } else if (AttrTypeID(attr) == proto::AttrType::LONGS) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); - } - infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); - } else if (AttrTypeID(attr) == proto::AttrType::FLOATS) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); - } - infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); - } else if (AttrTypeID(attr) == proto::AttrType::FLOAT64S) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); - } - infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` to vector when " - "construct InferMetaContext.", - attr_names[i])); - } - } else if (ctx->HasAttr(attr_name)) { - // Emplace Back Attr according to the type of attr. 
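The removed else-if chains here and the switch-based replacement above amount to a two-level dispatch: an outer switch on the phi attribute kind the kernel signature expects, and an inner switch on the fluid AttrType actually stored in the op, converting where the two disagree. A minimal sketch of that shape, with std::variant standing in for the fluid Attribute (hypothetical, simplified types; not Paddle's actual API):

#include <cstdint>
#include <cstdio>
#include <variant>
#include <vector>

// Stand-in for the fluid Attribute variant (simplified).
using Attr = std::variant<int, std::vector<int>, std::vector<int64_t>>;

// Inner dispatch for an INT64S kernel argument: accept LONGS directly,
// widen INTS element-wise, reject anything else -- the same shape as the
// phi::AttributeType::INT64S case in the refactored code.
static std::vector<int64_t> ToInt64s(const Attr& attr) {
  switch (attr.index()) {
    case 1: {  // std::vector<int>, analogous to proto::AttrType::INTS
      const auto& ints = std::get<std::vector<int>>(attr);
      return std::vector<int64_t>(ints.begin(), ints.end());
    }
    case 2:  // std::vector<int64_t>, analogous to proto::AttrType::LONGS
      return std::get<std::vector<int64_t>>(attr);
    default:
      std::fprintf(stderr, "unsupported attribute kind\n");  // real code throws
      return {};
  }
}

int main() {
  Attr shape = std::vector<int>{2, 3, 4};
  for (int64_t d : ToInt64s(shape)) std::printf("%lld ", (long long)d);
  std::printf("\n");  // prints: 2 3 4
  return 0;
}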
- auto& attr = attr_reader.GetAttr(attr_name); - if (attr_defs[i].type_index == phi::AttributeType::BOOL) { - infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); - } else if (attr_defs[i].type_index == phi::AttributeType::INT32) { - infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(int, attr)); - } else if (attr_defs[i].type_index == phi::AttributeType::INT64) { - infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); - } else if (attr_defs[i].type_index == phi::AttributeType::FLOAT32) { - infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(float, attr)); - } else if (attr_defs[i].type_index == phi::AttributeType::STRING) { - infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); - } else if (attr_defs[i].type_index == phi::AttributeType::BOOLS) { - infer_meta_context.EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } else if (attr_defs[i].type_index == phi::AttributeType::INT32S) { - infer_meta_context.EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } else if (attr_defs[i].type_index == phi::AttributeType::INT64S) { - if (AttrTypeID(attr) == proto::AttrType::INTS) { - // Emplace Back Attr according to the type of Phi_Kernel args. - const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); - const std::vector vector_int64_attr(vector_int_attr.begin(), - vector_int_attr.end()); - infer_meta_context.EmplaceBackAttr(vector_int64_attr); - } else { - infer_meta_context.EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } - } else if (attr_defs[i].type_index == phi::AttributeType::FLOAT32S) { - infer_meta_context.EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } else if (attr_defs[i].type_index == phi::AttributeType::FLOAT64S) { - infer_meta_context.EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } else if (attr_defs[i].type_index == phi::AttributeType::STRINGS) { - infer_meta_context.EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } else if (attr_defs[i].type_index == phi::AttributeType::DATA_TYPE) { - auto data_type = paddle::framework::TransToPhiDataType( - static_cast( - BOOST_GET_CONST(int, attr))); - infer_meta_context.EmplaceBackAttr(data_type); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported attribute type is received when call " - "InferShapeFunctor.")); - } - } else if (ctx->HasInput(attr_name)) { - // convert from data - if (attr_defs[i].type_index == phi::AttributeType::INT32) { - if (ctx->IsRuntime()) { - auto infershape_inputs = std::move(ctx->GetInputVarPtrs(attr_name)); - auto var_temp = BOOST_GET_CONST(Variable*, infershape_inputs[i]); - auto val = experimental::MakePhiScalarFromVar(*var_temp); - int32_t val_int = val.template to(); - infer_meta_context.EmplaceBackAttr(val_int); + break; + default: + if (attr_ptr) { + auto& attr = *attr_ptr; + switch (attr_defs[i].type_index) { + case phi::AttributeType::FLOAT32: + infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(float, attr)); + break; + case phi::AttributeType::INT32: + infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(int, attr)); + break; + case phi::AttributeType::BOOL: + infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); + break; + case phi::AttributeType::INT64: + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(int64_t, attr)); + break; + case phi::AttributeType::INT32S: + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + break; + case phi::AttributeType::DATA_TYPE: { + auto data_type = paddle::framework::TransToPhiDataType( + static_cast( + 
BOOST_GET_CONST(int, attr))); + infer_meta_context.EmplaceBackAttr(data_type); + } break; + case phi::AttributeType::STRING: + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::string, attr)); + break; + case phi::AttributeType::INT64S: + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::LONGS: + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + break; + case framework::proto::AttrType::INTS: { + const auto& vector_int_attr = + BOOST_GET_CONST(std::vector, attr); + const std::vector vector_int64_attr( + vector_int_attr.begin(), vector_int_attr.end()); + infer_meta_context.EmplaceBackAttr(vector_int64_attr); + } break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector " + "when " + "construct KernelContext.", + attr_names[i])); + } + break; + case phi::AttributeType::FLOAT32S: + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + break; + case phi::AttributeType::STRINGS: + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + break; + case phi::AttributeType::BOOLS: + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + break; + case phi::AttributeType::FLOAT64S: + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` when construct " + "KernelContext in dygraph.", + attr_names[i])); + } } else { + // do nothing, skip current attr } } } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 207ee713bf409..a3b49476d820f 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -159,7 +159,6 @@ if(WITH_IPU) pass_library(infer_shape_pass base DIR ipu) pass_library(delete_scale_op_pass base DIR ipu) pass_library(avg_shard_pass base DIR ipu) - pass_library(transfer_cast_op_pass base DIR ipu) endif() cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector ) @@ -226,6 +225,7 @@ endif() cc_test(test_cpu_quantize_squash_pass SRCS mkldnn/cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor) cc_test(test_reshape_transpose_matmul_mkldnn_fuse_pass SRCS mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc DEPS reshape_transpose_matmul_mkldnn_fuse_pass reshape_transpose_matmul_v2_mkldnn_fuse_pass) cc_test(test_matmul_transpose_reshape_fuse_pass SRCS mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc DEPS matmul_transpose_reshape_fuse_pass matmul_v2_transpose_reshape_fuse_pass) + cc_test(test_shuffle_channel_mkldnn_detect_pass SRCS mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc DEPS shuffle_channel_mkldnn_detect_pass) cc_test(test_cpu_bfloat16_placement_pass SRCS mkldnn/cpu_bfloat16_placement_pass_tester.cc DEPS cpu_bfloat16_placement_pass) cc_test(test_cpu_bfloat16_pass SRCS mkldnn/cpu_bfloat16_pass_tester.cc DEPS cpu_bfloat16_pass) cc_test(test_multi_gru_fuse_pass SRCS mkldnn/multi_gru_fuse_pass_tester.cc DEPS multi_gru_fuse_pass) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 8eb1b64a2763a..fbd8fda131b6d 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++
b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2665,41 +2665,8 @@ PDNode *patterns::UnsupportedBfloat16::operator()() { return op; } -PDNode *patterns::LastBfloat16Ops::operator()() { - auto *op = pattern->NewNode(op_repr())->assert_is_op(); - op->assert_more([&](Node *node) { - return node->Op()->GetAttrIfExists("mkldnn_data_type") == - "bfloat16"; - }); - auto *op_out = pattern->NewNode(op_out_repr())->AsOutput(); - op->LinksTo({op_out}); - return op_out; -} - -PDNode *patterns::FirstBfloat16Ops::operator()() { - auto *op_in = pattern->NewNode(op_in_repr())->AsInput(); - - auto *op = pattern->NewNode(op_repr())->assert_is_op(); - op->assert_more([&](Node *node) { - return node->Op()->GetAttrIfExists("mkldnn_data_type") == - "bfloat16"; - }); - - op->LinksFrom({op_in}); - return op; -} - -PDNode *patterns::DuplicatedInputs::operator()() { - auto op = pattern->NewNode(op_repr())->assert_is_ops({"concat", "sum"}); - op->assert_more([&](Node *node) { - return node->Op()->GetAttrIfExists("mkldnn_data_type") == - "bfloat16"; - }); - return op; -} - -PDNode *patterns::DuplicatedOutputs::operator()() { - auto op = pattern->NewNode(op_repr())->assert_is_ops({"split"}); +PDNode *patterns::Bloat16Ops::operator()() { + auto op = pattern->NewNode(op_repr())->assert_is_op(); op->assert_more([&](Node *node) { return node->Op()->GetAttrIfExists("mkldnn_data_type") == "bfloat16"; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 434ede6cf7a3b..d7e265fe28bf9 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1565,36 +1565,9 @@ struct UnsupportedBfloat16 : public PatternBase { PATTERN_DECL_NODE(op); }; -struct LastBfloat16Ops : public PatternBase { - LastBfloat16Ops(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "last_bfloat16_ops") {} - PDNode* operator()(); - - PATTERN_DECL_NODE(op); - PATTERN_DECL_NODE(op_out); -}; - -struct FirstBfloat16Ops : public PatternBase { - FirstBfloat16Ops(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "first_bfloat16_ops") {} - PDNode* operator()(); - - PATTERN_DECL_NODE(op_in); - PATTERN_DECL_NODE(op); -}; - -struct DuplicatedInputs : public PatternBase { - DuplicatedInputs(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "many_inputs_op") {} - - PDNode* operator()(); - - PATTERN_DECL_NODE(op); -}; - -struct DuplicatedOutputs : public PatternBase { - DuplicatedOutputs(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "many_outputs_op") {} +struct Bloat16Ops : public PatternBase { + Bloat16Ops(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "many_bfloat16_ops") {} PDNode* operator()(); diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc index 02f000acc2a39..a6b82089dc4df 100644 --- a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc +++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc @@ -121,9 +121,9 @@ void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { } // Run passes - std::vector graph_pass = { - "forward_graph_extract_pass", "infer_shape_pass", "avg_shard_pass", - "popart_canonicalization_pass", "transfer_cast_op_pass"}; + std::vector graph_pass = {"forward_graph_extract_pass", + "infer_shape_pass", "avg_shard_pass", + 
"popart_canonicalization_pass"}; std::vector compile_pass = { "ipu_inplace_pass", "ipu_graph_builder_pass", "ipu_runtime_replacer_pass", "inference_postprocess_pass"}; diff --git a/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc b/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc deleted file mode 100644 index 5cd8358dc083e..0000000000000 --- a/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.h" - -#include "paddle/fluid/framework/ir/pass_tester_helper.h" -#include "paddle/fluid/platform/device/ipu/ipu_backend.h" - -namespace paddle { -namespace framework { -namespace ir { - -// Transfer the target dtype of Cast Op to FP16 if the original target is FP32 -// and enable FP16 mode. -void TransferCastOpPass::ApplyImpl(ir::Graph* graph) const { - VLOG(10) << "enter TransferCastOpPass::ApplyImpl"; - VLOG(10) << "Raw Graph: "; - VLOG(10) << DebugString(graph); - - auto ipu_backend = platform::ipu::IpuBackend::GetInstance(); - auto enable_fp16 = ipu_backend->GetIpuStrategy()->enable_fp16; - auto transfer_cast_op = ipu_backend->GetIpuStrategy()->transfer_cast_op; - if (enable_fp16 && transfer_cast_op) { - for (auto* node : graph->Nodes()) { - if (node->IsOp() && node->Op()->Type() == "popart_cast") { - if (BOOST_GET_CONST(std::string, node->Op()->GetAttr("to")) == - "FLOAT") { - node->Op()->SetAttr("to", std::string("FLOAT16")); - } - } - } - } - - VLOG(10) << "Post Graph: "; - VLOG(10) << DebugString(graph); - VLOG(10) << "leave TransferCastOpPass::ApplyImpl"; -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(transfer_cast_op_pass, paddle::framework::ir::TransferCastOpPass); diff --git a/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.h b/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.h deleted file mode 100644 index 580fec10f2ac6..0000000000000 --- a/paddle/fluid/framework/ir/ipu/transfer_cast_op_pass.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/fluid/framework/ir/pass.h" - -namespace paddle { -namespace framework { -namespace ir { - -class TransferCastOpPass : public Pass { - protected: - void ApplyImpl(ir::Graph* graph) const override; -}; - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc index f1bd34a5ad4f6..62b2be712beef 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc @@ -22,290 +22,226 @@ namespace paddle { namespace framework { namespace ir { -using string::PrettyLogDetail; +namespace { +class Quanter { + public: + void AddQuantOps() { + if (IsNotPermittedOpType()) return; -void UnlinkNodes(ir::Node* a, ir::Node* b) { - a->outputs.erase(std::remove(a->outputs.begin(), a->outputs.end(), b), - a->outputs.end()); - b->inputs.erase(std::remove(b->inputs.begin(), b->inputs.end(), a), - b->inputs.end()); -} + std::vector linked_xputs; -// Checking whether a reorder from FP32 to BF16 should be added before the input -// to the operator -bool IsPermittedInputName(const std::string& input_name) { - // Only the inputs listed in \"permitted_names\" requires quanitization before - // the bfloat16 operator. Other inputs, such as Filter and Bias are reordered - // in the kernel. - const std::vector permitted_names = {"X", "Y", "Input", - "ResidualData"}; - return (std::find(permitted_names.begin(), permitted_names.end(), - input_name) != permitted_names.end()); -} + for (const auto& logical_xput : op_xputs) { + std::vector quant_xput_names; + quant_xput_names.reserve(xputs_map.size()); -// Checking whether a reorder from BF16 to FP32 should be added after the output -// to the operator -bool IsPermittedOutputName(const std::string& output_name) { - // XShape is output in transpose2 and reshape2 operators used to store the - // shape and lod of X. So this output do not need dequantize before. - return (output_name != "XShape"); -} + const auto& logical_xput_name = logical_xput.first; + if (IsNotPermittedName(logical_xput_name)) continue; -void AddQuantize(Graph* g, ir::Node* op, ir::Node* op_in, - int& quantize_counter) { - std::vector input_names; - - // Find the name of the input linking op to op_in - for (auto name : op->Op()->InputNames()) - for (auto input_name : op->Op()->Input(name)) - if (input_name == op_in->Name() && IsPermittedInputName(name)) - input_names.push_back(name); - - if (input_names.empty()) return; - - VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out")); - auto* quantize_out_node = g->CreateVarNode(&quantize_out_desc); - - OpDesc q_desc; - q_desc.SetType("quantize"); - q_desc.SetInput("Input", std::vector({op_in->Name()})); - q_desc.SetOutput("Output", - std::vector({quantize_out_node->Name()})); - q_desc.SetAttr("Scale", 1.f); - q_desc.SetAttr("Shift", 0.0f); - q_desc.SetAttr("bfloat16", true); - q_desc.SetAttr("output_format", op->Op()->HasAttr("data_layout") - ? op->Op()->GetAttr("data_layout") - : std::string("NCHW")); - auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied. 
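// [editor's note] Both the removed AddQuantize helper above and the new
// Quanter::create_quant_op below build the same (de)quantize OpDesc:
// Scale=1.0 and Shift=0.0 (bfloat16 reorders neither rescale nor shift),
// bfloat16=true, and output_format copied from the op's data_layout
// attribute, defaulting to "NCHW".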
- - for (auto name = input_names.begin(); name < input_names.end(); name++) - op->Op()->SetInput(*name, - std::vector({quantize_out_node->Name()})); - - UnlinkNodes(op_in, op); - IR_NODE_LINK_TO(op_in, quantize_op); - IR_NODE_LINK_TO(quantize_op, quantize_out_node); - IR_NODE_LINK_TO(quantize_out_node, op); - quantize_counter++; -} + const auto& physical_xputs_names = logical_xput.second; + for (const auto& physical_xput_name : physical_xputs_names) { + if (IsAlreadyLinked(linked_xputs, physical_xput_name)) continue; -void AddQuantizes(Graph* g, ir::Node* op, int& quantize_counter) { - auto inputs = op->inputs; - PADDLE_ENFORCE_GE(inputs.size(), 1, - platform::errors::InvalidArgument( - "OP(%s)'s inputs(%d) must be equal or greater than 1.", - op->Name(), inputs.size())); - PADDLE_ENFORCE_EQ(op->outputs.size(), 1, - platform::errors::InvalidArgument( - "OP(%s)'s outputs(%d) must be equal to 1.", op->Name(), - op->outputs.size())); - - OpDesc q_desc; - q_desc.SetType("quantize"); - - std::vector quantize_out_nodes(inputs.size()); - std::vector quantize_out_node_names(inputs.size()); - - for (size_t i = 0; i < inputs.size(); i++) { - VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out")); - quantize_out_nodes[i] = g->CreateVarNode(&quantize_out_desc); - quantize_out_node_names[i] = quantize_out_nodes[i]->Name(); - - q_desc.SetInput("Input", std::vector({inputs[i]->Name()})); - q_desc.SetOutput("Output", - std::vector({quantize_out_node_names[i]})); - q_desc.SetAttr("Scale", 1.f); - q_desc.SetAttr("Shift", 0.0f); - q_desc.SetAttr("bfloat16", true); - q_desc.SetAttr("output_format", op->Op()->HasAttr("data_layout") - ? op->Op()->GetAttr("data_layout") - : std::string("NCHW")); - auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied. - - UnlinkNodes(inputs[i], op); - IR_NODE_LINK_TO(inputs[i], quantize_op); - IR_NODE_LINK_TO(quantize_op, quantize_out_nodes[i]); - IR_NODE_LINK_TO(quantize_out_nodes[i], op); - quantize_counter++; + VarDesc quant_x_desc( + patterns::PDNodeName(get_op_type(), get_op_edge())); + auto quant_x_node = graph.CreateVarNode(&quant_x_desc); + const auto xput_name = quant_x_node->Name(); + quant_xput_names.emplace_back(xput_name); + + auto quant_op = create_quant_op(physical_xput_name, xput_name); + + auto physical_xput_node = xputs_map[physical_xput_name]; + link_nodes(physical_xput_node, quant_op, quant_x_node); + counter++; + linked_xputs.push_back(physical_xput_name); + } + + set_edge(logical_xput_name, quant_xput_names); + } } - op->Op()->SetInput("X", quantize_out_node_names); -} + int get_counter() const { return counter; } -// Operators like Concat and Sum have a single input name X, which actually -// consists of multiple inputs. Such operators require a different way to find -// pattern and add quantize ops. 
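// [editor's note] The special casing described above is exactly what the new
// Quanter hierarchy removes: AddQuantOps() walks the op's VariableNameMap
// directly, so a logical slot that fans out to several physical variables
// (e.g. concat's "X" -> {"x0", "x1", "x2"}, names hypothetical) is handled by
// the same loop as a single-input op, with IsAlreadyLinked() guarding against
// inserting two reorders on a variable that feeds the op twice.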
-void AddReoderBeforeDuplicatedInputs(ir::Graph* graph, int& quantize_counter) { - GraphPatternDetector gpd; - patterns::DuplicatedInputs duplicated_inputs{gpd.mutable_pattern(), - "duplicated_inputs"}; - duplicated_inputs(); - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - GET_IR_NODE_FROM_SUBGRAPH(op, op, duplicated_inputs); - AddQuantizes(g, op, quantize_counter); + virtual ~Quanter() = default; + + protected: + Graph& graph; + ir::Node* const op; + + std::map xputs_map; + const VariableNameMap& op_xputs; + + int counter = 0; + + Quanter(Graph& graph, ir::Node* const op, const VariableNameMap& op_xputs) + : graph(graph), op(op), op_xputs(op_xputs){}; + + virtual bool IsNotPermittedOpType() const = 0; + virtual bool IsNotPermittedName(const std::string& input_name) const = 0; + virtual std::string get_op_type() const = 0; + virtual std::string get_op_edge() const = 0; + virtual void link_nodes(ir::Node* const physical_xput_node, + ir::Node* const quant_op, + ir::Node* const quant_x_node) = 0; + virtual void set_edge(const std::string& logical_xput_name, + const std::vector& quant_xput_names) = 0; + + bool IsAlreadyLinked(const std::vector& node_names, + const std::string& node_name) const { + return std::find(node_names.begin(), node_names.end(), node_name) != + node_names.end(); + } + + virtual ir::Node* create_quant_op(const std::string& input_name, + const std::string& output_name) const { + OpDesc op_desc; + op_desc.SetType(get_op_type()); + + op_desc.SetInput("Input", std::vector({input_name})); + op_desc.SetOutput("Output", std::vector({output_name})); + op_desc.SetAttr("Scale", 1.f); + op_desc.SetAttr("Shift", 0.0f); + op_desc.SetAttr("bfloat16", true); + op_desc.SetAttr("output_format", op->Op()->HasAttr("data_layout") + ? op->Op()->GetAttr("data_layout") + : std::string("NCHW")); + return graph.CreateOpNode(&op_desc); // OpDesc will be copied. 
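// [editor's note] AddQuantOps() above is a template method: the Quanter base
// class owns the traversal, node creation, and counting, while the two
// subclasses below only supply policy through the virtual hooks
// (IsNotPermittedOpType, IsNotPermittedName, get_op_type, get_op_edge,
// link_nodes, set_edge). DeQuantizer even reuses create_quant_op unchanged,
// simply swapping the input and output names.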
+ } + + void UnlinkNodes(ir::Node* a, ir::Node* b) const { + a->outputs.erase(std::remove(a->outputs.begin(), a->outputs.end(), b), + a->outputs.end()); + b->inputs.erase(std::remove(b->inputs.begin(), b->inputs.end(), a), + b->inputs.end()); + } +}; + +class Quantizer final : public Quanter { + public: + Quantizer(Graph* const graph, ir::Node* const op) + : Quanter(*graph, op, op->Op()->Inputs()) { + auto inputs = op->inputs; + PADDLE_ENFORCE_GE( + inputs.size(), 1, + platform::errors::InvalidArgument( + "OP(%s)'s inputs(%d) must be equal or greater than 1.", op->Name(), + inputs.size())); + + for (auto input : inputs) xputs_map[input->Name()] = input; }; - gpd(graph, handler); -} -// Adding quantize ops before all operators except Concat and Sum, which have -// already been handled in AddReoderBeforeDuplicatedInputs -void AddReoderBeforeSingleInputs(ir::Graph* graph, int& quantize_counter) { - GraphPatternDetector gpd; - patterns::FirstBfloat16Ops bfloat16_ops{gpd.mutable_pattern(), - "first_bfloat16_ops"}; - bfloat16_ops(); - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - GET_IR_NODE_FROM_SUBGRAPH(op_in, op_in, bfloat16_ops); - GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_ops); - if (op->Op()->Type() != "sum" && op->Op()->Type() != "concat") { - AddQuantize(g, op, op_in, quantize_counter); - } + protected: + bool IsNotPermittedOpType() const override { return false; } + + // Checking whether a reorder from FP32 to BF16 + // should be added before the input to the operator + bool IsNotPermittedName(const std::string& input_name) const override { + // Only the inputs listed in \"permitted_names\" + // require quantization before the bfloat16 operator. + // Other inputs, such as Filter and Bias are reordered in the kernel. + const std::vector<std::string> permitted_names = {"X", "Y", "Input", + "ResidualData"}; + + return std::none_of( + permitted_names.begin(), permitted_names.end(), + [&input_name](const std::string& name) { return name == input_name; }); + } + + std::string get_op_type() const override { return "quantize"; }; + std::string get_op_edge() const override { return "out"; }; + + void link_nodes(ir::Node* const physical_xput_node, ir::Node* const quant_op, + ir::Node* const quant_x_node) override { + UnlinkNodes(physical_xput_node, op); + IR_NODE_LINK_TO(physical_xput_node, quant_op); + IR_NODE_LINK_TO(quant_op, quant_x_node); + IR_NODE_LINK_TO(quant_x_node, op); + } + + void set_edge(const std::string& logical_xput_name, + const std::vector<std::string>& quant_xput_names) override { + op->Op()->SetInput(logical_xput_name, quant_xput_names); + } +}; + +class DeQuantizer final : public Quanter { + public: + DeQuantizer(Graph* const graph, ir::Node* const op) + : Quanter(*graph, op, op->Op()->Outputs()) { + auto outputs = op->outputs; + PADDLE_ENFORCE_GE( + outputs.size(), 1, + platform::errors::InvalidArgument( + "OP(%s)'s outputs(%d) must be equal or greater than 1.", op->Name(), + outputs.size())); + + for (auto output : outputs) xputs_map[output->Name()] = output; }; - gpd(graph, handler); -} -void CPUBFloat16Pass::SetInputDataType(ir::Graph* graph) const { - int quantize_counter = 0; - AddReoderBeforeDuplicatedInputs(graph, quantize_counter); - AddReoderBeforeSingleInputs(graph, quantize_counter); - PrettyLogDetail("--- added %d quantize ops before bfloat16 op", - quantize_counter); -} + protected: + bool IsNotPermittedOpType() const override { + // Prior_box operator output is always FP32 so no dequantization is needed.
+ return op->Op()->Type() == "prior_box"; } -void AddDequantize(Graph* g, ir::Node* op, ir::Node* op_out, - int& dequantize_counter) { - if (op->Op()->Type() == "prior_box") return; - - // Find the name of the output linking op to op_out - std::vector<std::string> output_names; - for (auto name : op->Op()->OutputNames()) - for (auto output_name : op->Op()->Output(name)) - if (output_name == op_out->Name() && IsPermittedOutputName(name)) - output_names.push_back(name); - - if (output_names.empty()) return; - - VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); - auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc); - - OpDesc deq_desc; - deq_desc.SetType("dequantize"); - deq_desc.SetInput("Input", - std::vector<std::string>({dequantize_in_node->Name()})); - deq_desc.SetOutput("Output", std::vector<std::string>({op_out->Name()})); - deq_desc.SetAttr("Scale", 1.0f); - deq_desc.SetAttr("Shift", 0.0f); - auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. - - for (auto name = output_names.begin(); name < output_names.end(); name++) - op->Op()->SetOutput(*name, - std::vector<std::string>({dequantize_in_node->Name()})); - - UnlinkNodes(op, op_out); - IR_NODE_LINK_TO(op, dequantize_in_node); - IR_NODE_LINK_TO(dequantize_in_node, dequantize_op); - IR_NODE_LINK_TO(dequantize_op, op_out); - - dequantize_counter++; -} + // Checking whether a reorder from BF16 to FP32 + // should be added after the output to the operator + bool IsNotPermittedName(const std::string& output_name) const override { + // XShape is an output of the transpose2 and reshape2 operators, used to + // store the shape and lod of X, so it does not need to be dequantized. + return (output_name == "XShape"); + } + + std::string get_op_type() const override { return "dequantize"; }; + std::string get_op_edge() const override { return "in"; }; -void AddDequantizes(Graph* g, ir::Node* op, int& dequantize_counter) { - auto outputs = op->outputs; - PADDLE_ENFORCE_GE(outputs.size(), 1, - platform::errors::InvalidArgument( - "OP(%s)'s outputs(%d) must be equal or greater than 1.", - op->Name(), outputs.size())); - PADDLE_ENFORCE_EQ(op->inputs.size(), 1, - platform::errors::InvalidArgument( - "OP(%s)'s inputs(%d) must be equal to 1.", op->Name(), - op->inputs.size())); - - OpDesc deq_desc; - deq_desc.SetType("dequantize"); - - std::vector<Node*> dequantize_in_nodes(outputs.size()); - std::vector<std::string> dequantize_in_node_names(outputs.size()); - - for (size_t i = 0; i < outputs.size(); i++) { - VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); - dequantize_in_nodes[i] = g->CreateVarNode(&dequantize_in_desc); - dequantize_in_node_names[i] = dequantize_in_nodes[i]->Name(); - - deq_desc.SetInput("Input", - std::vector<std::string>({dequantize_in_node_names[i]})); - deq_desc.SetOutput("Output", - std::vector<std::string>({outputs[i]->Name()})); - - deq_desc.SetAttr("Scale", 1.f); - deq_desc.SetAttr("Shift", 0.0f); - deq_desc.SetAttr("bfloat16", true); - deq_desc.SetAttr("output_format", op->Op()->HasAttr("data_layout") - ? op->Op()->GetAttr("data_layout") - : std::string("NCHW")); - auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied.
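// [editor's note] The essential difference between the two link_nodes()
// overrides is edge orientation:
//   Quantizer:   var -> quantize -> new_var -> op
//   DeQuantizer: op -> new_var -> dequantize -> var
// i.e. dequantization threads the new variable in between the bfloat16 op
// and its original output.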
- - UnlinkNodes(op, outputs[i]); - IR_NODE_LINK_TO(op, dequantize_in_nodes[i]); - IR_NODE_LINK_TO(dequantize_in_nodes[i], dequantize_op); - IR_NODE_LINK_TO(dequantize_op, outputs[i]); - - dequantize_counter++; + void link_nodes(ir::Node* const physical_xput_node, ir::Node* const quant_op, + ir::Node* const quant_x_node) override { + UnlinkNodes(op, physical_xput_node); + IR_NODE_LINK_TO(quant_op, physical_xput_node); + IR_NODE_LINK_TO(quant_x_node, quant_op); + IR_NODE_LINK_TO(op, quant_x_node); } - op->Op()->SetOutput("Out", dequantize_in_node_names); -} + void set_edge(const std::string& logical_xput_name, + const std::vector& quant_xput_names) override { + op->Op()->SetOutput(logical_xput_name, quant_xput_names); + } -// Operators like split have a single output name Out, which actually -// consists of multiple outputs. Such operators require a different way to find -// pattern and add dequantize ops. -void AddReoderAfterDuplicatedOutputs(ir::Graph* graph, - int& dequantize_counter) { - GraphPatternDetector gpd; - patterns::DuplicatedOutputs duplicated_outputs{gpd.mutable_pattern(), - "duplicated_outputs"}; - duplicated_outputs(); - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - GET_IR_NODE_FROM_SUBGRAPH(op, op, duplicated_outputs); - AddDequantizes(g, op, dequantize_counter); - }; - gpd(graph, handler); + ir::Node* create_quant_op(const std::string& input_name, + const std::string& output_name) const override { + return Quanter::create_quant_op(output_name, input_name); + } +}; } +using string::PrettyLogDetail; + +void CPUBFloat16Pass::ApplyImpl(ir::Graph* graph) const { + int quantize_counter = 0; + int dequantize_counter = 0; -// Adding dequantize ops after all operators except split, which has -// already been handled in AddReoderAfterDuplicatedOutputs -void AddReoderAfterSingleOutputs(ir::Graph* graph, int& dequantize_counter) { GraphPatternDetector gpd; - patterns::LastBfloat16Ops bfloat16_ops{gpd.mutable_pattern(), - "last_bfloat16_ops"}; - bfloat16_ops(); + patterns::Bloat16Ops Bloat16Ops{gpd.mutable_pattern(), "Bloat16Ops"}; + Bloat16Ops(); auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - GET_IR_NODE_FROM_SUBGRAPH(op_out, op_out, bfloat16_ops); - GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_ops); - if (op->Op()->Type() != "split") { - AddDequantize(g, op, op_out, dequantize_counter); - } + Graph* graph) { + GET_IR_NODE_FROM_SUBGRAPH(op, op, Bloat16Ops); + + Quantizer quantizer(graph, op); + quantizer.AddQuantOps(); + quantize_counter += quantizer.get_counter(); + + DeQuantizer dequantizer(graph, op); + dequantizer.AddQuantOps(); + dequantize_counter += dequantizer.get_counter(); }; gpd(graph, handler); -} -void CPUBFloat16Pass::SetOutputDataType(ir::Graph* graph) const { - int dequantize_counter = 0; - AddReoderAfterDuplicatedOutputs(graph, dequantize_counter); - AddReoderAfterSingleOutputs(graph, dequantize_counter); + PrettyLogDetail("--- added %d quantize ops before bfloat16 op", + quantize_counter); PrettyLogDetail("--- added %d dequantize ops after bfloat16 op", dequantize_counter); } -void CPUBFloat16Pass::ApplyImpl(ir::Graph* graph) const { - SetInputDataType(graph); - SetOutputDataType(graph); -} - } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h index 3a7271f7ddc59..69c7ce35162ff 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h +++ 
b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h @@ -24,8 +24,6 @@ namespace ir { class CPUBFloat16Pass : public Pass { protected: - void SetInputDataType(ir::Graph* graph) const; - void SetOutputDataType(ir::Graph* graph) const; void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc index d89891ec3c857..fc7a53c4e7923 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc @@ -27,8 +27,16 @@ namespace ir { using string::PrettyLogDetail; -void CPUBfloat16PlacementPass::SetMkldnnDataType( - ir::Graph* graph, int* bfloat16_operators) const { +void CPUBfloat16PlacementPass::ApplyImpl(ir::Graph* graph) const { + int bfloat16_operators = 0; + bfloat16_operators += SetMkldnnDataType(graph); + bfloat16_operators -= RemoveOrphanedOperators(graph); + bfloat16_operators -= RemoveUnsupportedOperators(graph); + PrettyLogDetail("--- marked %d operators to bfloat16 ", + bfloat16_operators); +} + +int CPUBfloat16PlacementPass::SetMkldnnDataType(ir::Graph* graph) const { const auto& op_types_list = Get>("bfloat16_enabled_op_types"); // set mkldnn_data_type to bfloat16 to all operators that are in @@ -39,6 +47,7 @@ void CPUBfloat16PlacementPass::SetMkldnnDataType( "bfloat16_placement"}; bfloat16_placement_pattern(op_types_list); + int detected_operators = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { GET_IR_NODE_FROM_SUBGRAPH(op_in, op_in, bfloat16_placement_pattern); @@ -50,58 +59,58 @@ void CPUBfloat16PlacementPass::SetMkldnnDataType( if ((op->Op()->HasAttr("mkldnn_data_type") || op->Op()->HasProtoAttr("mkldnn_data_type")) && !platform::HasOpINT8DataType(op->Op())) { + VLOG(4) << "--- marked " << op->Op()->Type() + << " operator to bfloat16 "; op->Op()->SetAttr("mkldnn_data_type", std::string("bfloat16")); - (*bfloat16_operators)++; + detected_operators++; } }; gpd(graph, handler); + return detected_operators; } -void CPUBfloat16PlacementPass::RemoveOrphanedOperators( - ir::Graph* graph, int* bfloat16_operators) const { +int CPUBfloat16PlacementPass::RemoveOrphanedOperators(ir::Graph* graph) const { // find orphaned bfloat16 operator that is between two float32 operators // revert mkldnn_data_type attr to float32 GraphPatternDetector gpd; patterns::OrphanedBfloat16 orphaned_bfloat16_pattern{gpd.mutable_pattern(), "orphaned_bfloat16"}; orphaned_bfloat16_pattern(); + int detected_operators = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { GET_IR_NODE_FROM_SUBGRAPH(op, op, orphaned_bfloat16_pattern); op->Op()->SetAttr("mkldnn_data_type", std::string("float32")); - bfloat16_operators--; + VLOG(4) << "--- demarked " << op->Op()->Type() << " operator to bfloat16 "; + detected_operators++; }; gpd(graph, handler); + return detected_operators; } -void CPUBfloat16PlacementPass::RemoveUnsupportedOperators( - ir::Graph* graph, int* bfloat16_operators) const { +int CPUBfloat16PlacementPass::RemoveUnsupportedOperators( + ir::Graph* graph) const { // now quantize is supported FP32 only, so try to find // bfloat16 operator that input type is not FP32 GraphPatternDetector gpd; patterns::UnsupportedBfloat16 unsupported_bfloat16_pattern{ gpd.mutable_pattern(), "unsupported_bfloat16"}; unsupported_bfloat16_pattern(); + int detected_operators = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, 
Graph* g) { GET_IR_NODE_FROM_SUBGRAPH(prev_out, prev_out, unsupported_bfloat16_pattern); GET_IR_NODE_FROM_SUBGRAPH(op, op, unsupported_bfloat16_pattern); if ((prev_out->Var()->GetDataType() != proto::VarType::FP32)) { op->Op()->SetAttr("mkldnn_data_type", std::string("float32")); - bfloat16_operators--; + VLOG(4) << "--- demarked " << op->Op()->Type() + << " operator to bfloat16 "; + detected_operators++; } }; gpd(graph, handler); -} - -void CPUBfloat16PlacementPass::ApplyImpl(ir::Graph* graph) const { - int bfloat16_operators = 0; - SetMkldnnDataType(graph, &bfloat16_operators); - RemoveOrphanedOperators(graph, &bfloat16_operators); - RemoveUnsupportedOperators(graph, &bfloat16_operators); - PrettyLogDetail("--- marked %d operators to bfloat16 ", - bfloat16_operators); + return detected_operators; } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h index facc4c4c55221..63848298a879a 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h @@ -26,14 +26,11 @@ namespace ir { */ class CPUBfloat16PlacementPass : public Pass { protected: - void SetMkldnnDataType(ir::Graph* graph, int* bfloat16_operators) const; - - void RemoveOrphanedOperators(ir::Graph* graph, int* bfloat16_operators) const; - - void RemoveUnsupportedOperators(ir::Graph* graph, - int* bfloat16_operators) const; - void ApplyImpl(ir::Graph* graph) const override; + + int SetMkldnnDataType(ir::Graph* graph) const; + int RemoveOrphanedOperators(ir::Graph* graph) const; + int RemoveUnsupportedOperators(ir::Graph* graph) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc new file mode 100644 index 0000000000000..fe42e8f96f851 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass_tester.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
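// [editor's note] The tester below builds the pattern
// reshape2(-1,2,64,52,52) -> transpose2(0,2,1,3,4) -> reshape2(-1,128,52,52),
// runs shuffle_channel_mkldnn_detect_pass, and verifies that the five pattern
// nodes (two reshape2 ops, one transpose2 op, and their two intermediate
// outputs) collapse into a single shuffle_channel op with use_mkldnn=true.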
+ +#include +#include + +#include "paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void AddVarToScope(Scope* param_scope, const std::string& name, + const DDim& dims) { + auto* tensor = param_scope->Var(name)->GetMutable(); + tensor->Resize(dims); + tensor->mutable_data(platform::CPUPlace()); +} + +Scope* CreateParamScope() { + auto param_scope = new Scope(); + AddVarToScope(param_scope, "prog_x", {1, 128, 52, 52}); + return param_scope; +} + +void MainTest() { + Layers layers; + auto prog_x = layers.data("prog_x", {1, 128, 52, 52}); + auto first_reshape2 = layers.reshape2(prog_x, {-1, 2, 64, 52, 52}, true); + first_reshape2->SetShape({-1, 2, 64, 52, 52}); + auto transpose2 = layers.transpose2(first_reshape2, {0, 2, 1, 3, 4}, true); + transpose2->SetShape({-1, 64, 2, 52, 52}); + auto second_reshape2 = layers.reshape2(transpose2, {-1, 128, 52, 52}, true); + second_reshape2->SetShape({-1, 128, 52, 52}); + + std::unique_ptr graph(new ir::Graph(layers.main_program())); + graph->Set("__param_scope__", CreateParamScope()); + + int added_nodes = 1; // shuffle_channel + int removed_nodes = 5; // 2 * reshape, reshape_out, transpose, transpose_out + + int original_nodes_num = graph->Nodes().size(); + auto pass = + PassRegistry::Instance().Get("shuffle_channel_mkldnn_detect_pass"); + graph.reset(pass->Apply(graph.release())); + int current_nodes_num = graph->Nodes().size(); + + EXPECT_EQ(current_nodes_num, + original_nodes_num + added_nodes - removed_nodes); + EXPECT_EQ(GetNumOpNodes(graph, "reshape2"), 0); + EXPECT_EQ(GetNumOpNodes(graph, "transpose2"), 0); + EXPECT_EQ(GetNumOpNodes(graph, "shuffle_channel"), 1); + + for (const auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "shuffle_channel") { + const auto* op = node->Op(); + ASSERT_TRUE(op->HasAttr("use_mkldnn")); + EXPECT_TRUE(BOOST_GET_CONST(bool, op->GetAttr("use_mkldnn"))); + } + } +} + +TEST(ShuffleChannelOneDNNDetectPass, ShuffleChannelOneDNNDetectPassTest) { + MainTest(); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(shuffle_channel_mkldnn_detect_pass); diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index 61cd7ad01696e..7a83fdccc218c 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -34,7 +34,6 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc, mpi_rank_ = trainer_desc.mpi_rank(); mpi_size_ = trainer_desc.mpi_size(); dump_file_num_ = trainer_desc.dump_file_num(); - for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size(); i++) { need_merge_var_names_.push_back( diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index e03277fb31799..23bd777fae1d5 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -74,11 +74,12 @@ PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); PD_DECLARE_KERNEL(multiply, KPS, ALL_LAYOUT); PD_DECLARE_KERNEL(multiply_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(divide, KPS, ALL_LAYOUT); -PD_DECLARE_KERNEL(maximum, GPU, ALL_LAYOUT); #ifdef PADDLE_WITH_XPU_KP PD_DECLARE_KERNEL(max_raw, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(maximum, GPU, ALL_LAYOUT); #else PD_DECLARE_KERNEL(max_raw, KPS, ALL_LAYOUT); 
+PD_DECLARE_KERNEL(maximum, KPS, ALL_LAYOUT); #endif PD_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(mean_grad, GPU, ALL_LAYOUT); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index e17a5d55f1f0a..18287f0c7a4ee 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -1281,6 +1278,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(kernel_type_->place_); } +// TODO(Liu-xiandong): We are still using too many if-else branches and too +// much hard code for the XPU device; it's ugly, and we will refactor it in the future. +#if defined(PADDLE_WITH_XPU_KP) + bool use_phi_xpu_kp = false; +#endif + // TODO(chenweihang): Now we are still reusing a lot of the original fluid // implementation, this is a gradual replacement process // TODO(chenweihang): in the first phase of project, we only support CPU, CUDA @@ -1299,6 +1302,45 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(kernel_type_->place_); pt_kernel_name = kernel_signature_->name; +// NOTE(Liu-xiandong): Kernels registered for KP have library_type[KP], +// but the default library_type is Plain, so we need to modify the +// library_type here, otherwise it can't work. +#ifdef PADDLE_WITH_XPU_KP + if (paddle::platform::is_xpu_place(kernel_type_->place_)) { + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(type_, *kernel_type_); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(type_); + if (use_xpu_kp_kernel_rt) { + VLOG(3) << "phi xpu_kp using rt mode in static graph"; + } + if (use_xpu_kp_kernel_debug) { + VLOG(3) << "phi xpu_kp using debug mode in static graph"; + } + bool is_xpu_kp_support = + (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); + if (is_xpu_kp_support) { + auto expected_kernel_key_library_type = kernel_type_->library_type_; + kernel_type_->library_type_ = LibraryType::kKP; + VLOG(3) << "modifying XPU KP kernel in static graph: " + << pt_kernel_name + << ", using_kernel_key:" << *kernel_type_.get(); + auto try_pt_kernel_key = + TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); + if (!phi::KernelFactory::Instance().HasKernel(pt_kernel_name, + try_pt_kernel_key)) { + kernel_type_->library_type_ = expected_kernel_key_library_type; + VLOG(3) << "modify XPU KP kernel in static graph: " + << pt_kernel_name << " failed " << *kernel_type_.get(); + } else { + use_phi_xpu_kp = true; + VLOG(3) << "modify XPU KP kernel in static graph: " + << pt_kernel_name << " succeeded " << *kernel_type_.get(); + } + } + } +#endif pt_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); pt_kernel_.reset( new phi::Kernel(phi::KernelFactory::Instance().SelectKernel( @@ -1314,9 +1356,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } else { pt_kernel_name = kernel_signature_->name; -// NOTE(Liu-xiandong): The register kernel used KP have library_type[KP], -// But the default library_type is Plain, so we need to modify the -// library_type here,
otherwise it can't work. +// NOTE(Liu-xiandong): In my ctest runs, this branch is never executed, +// which I don't fully understand; it's confusing, +// but we still need to keep it to avoid errors. #ifdef PADDLE_WITH_XPU_KP if (paddle::platform::is_xpu_place(kernel_type_->place_)) { bool use_xpu_kp_kernel_rt = @@ -1335,15 +1377,20 @@ if (is_xpu_kp_support) { auto expected_kernel_key_library_type = kernel_type_->library_type_; kernel_type_->library_type_ = LibraryType::kKP; - VLOG(3) << "modifing XPU KP kernel in static graph: " << type_ + VLOG(3) << "modifying XPU KP kernel in static graph: " + << pt_kernel_name << ", using_kernel_key:" << *kernel_type_.get(); auto try_pt_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); if (!phi::KernelFactory::Instance().HasKernel(pt_kernel_name, try_pt_kernel_key)) { kernel_type_->library_type_ = expected_kernel_key_library_type; - VLOG(3) << "modify XPU KP kernel in static graph: " << type_ - << " is failed " << *kernel_type_.get(); + VLOG(3) << "modify XPU KP kernel in static graph: " + << pt_kernel_name << " failed " << *kernel_type_.get(); + } else { + use_phi_xpu_kp = true; + VLOG(3) << "modify XPU KP kernel in static graph: " + << pt_kernel_name << " succeeded " << *kernel_type_.get(); } } } @@ -1360,11 +1407,25 @@ void OperatorWithKernel::RunImpl(const Scope& scope, !paddle::platform::is_xpu_support_op(type_, *kernel_type_.get()) || paddle::platform::is_in_xpu_black_list(type_); #endif +#ifdef PADDLE_WITH_XPU_KP + bool use_xpu_kp_kernel_rt = + paddle::platform::is_xpu_place(kernel_type_->place_) && + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(type_, *kernel_type_); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_xpu_place(kernel_type_->place_) && + paddle::platform::is_in_xpu_kpwhite_list(type_); + bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); +#endif + + if (pt_kernel_->IsValid() #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) && !is_xpu_unsupport #endif - ) { +#if defined(PADDLE_WITH_XPU_KP) + && (!is_xpu_unsupport || use_phi_xpu_kp) +#endif + ) { run_phi_kernel_ = true; } else { auto& all_op_kernels = AllOpKernels(); @@ -1374,15 +1435,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // we need to select the heterogeneous kernel in fluid, but the kernel // registered in KP use library_type[KP], we need to modify it.
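// [editor's note] The KP kernel selection added in this patch repeatedly uses
// one try-then-restore shape; schematically, using only calls visible above:
//
//   auto saved_type = kernel_type_->library_type_;
//   kernel_type_->library_type_ = LibraryType::kKP;
//   if (!phi::KernelFactory::Instance().HasKernel(
//           pt_kernel_name, TransOpKernelTypeToPhiKernelKey(*kernel_type_))) {
//     kernel_type_->library_type_ = saved_type;  // no KP kernel registered
//   } else {
//     use_phi_xpu_kp = true;  // remembered for the IsValid() check below
//   }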
#ifdef PADDLE_WITH_XPU_KP - bool use_xpu_kp_kernel_rt = - paddle::platform::is_xpu_place(kernel_type_->place_) && - FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(type_, *kernel_type_); - bool use_xpu_kp_kernel_debug = - paddle::platform::is_xpu_place(kernel_type_->place_) && - paddle::platform::is_in_xpu_kpwhite_list(type_); - bool is_xpu_kp_support = - (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); if (is_xpu_kp_support) { kernel_type_->library_type_ = LibraryType::kKP; } @@ -1609,7 +1661,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { (kernel_iter == kernels.end() || !paddle::platform::is_xpu_support_op(type_, expected_kernel_key) || paddle::platform::is_in_xpu_black_list(type_))) { - VLOG(3) << "missing XPU kernel: " << type_ + VLOG(3) << "fluid missing XPU kernel: " << type_ << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; expected_kernel_key.place_ = platform::CPUPlace(); @@ -1625,10 +1677,10 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { bool use_xpu_kp_kernel_debug = paddle::platform::is_in_xpu_kpwhite_list(type_); if (use_xpu_kp_kernel_rt) { - VLOG(3) << "xpu_kp using rt mode "; + VLOG(3) << "fluid xpu_kp using rt mode "; } if (use_xpu_kp_kernel_debug) { - VLOG(3) << "xpu_kp using debug mode "; + VLOG(3) << "fluid xpu_kp using debug mode "; } bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); if (is_xpu_kp_support) { @@ -1645,7 +1697,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } else { - VLOG(3) << "using XPU KP kernel: " << type_ + VLOG(3) << "fluid using XPU KP kernel: " << type_ << ", using_kernel_key:" << expected_kernel_key; } } @@ -1654,7 +1706,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { paddle::platform::is_in_xpu_black_list(type_)); if (!is_xpu_kp_support && (kernel_iter == kernels.end() || is_xpu_unsupport)) { - VLOG(3) << "missing XPU kernel: " << type_ + VLOG(3) << "fluid missing XPU kernel: " << type_ << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; expected_kernel_key.place_ = platform::CPUPlace(); @@ -2417,163 +2469,210 @@ void OperatorWithKernel::BuildPhiKernelContext( VLOG(4) << "Done outputs"; for (size_t i = 0; i < attr_names.size(); ++i) { - if (attr_defs[i].type_index == phi::AttributeType::INT_ARRAY) { - auto attr_iter = Attrs().find(attr_names[i]); - if (attr_iter != Attrs().end()) { // shape is in the attribute - auto& attr = attr_iter->second; - if (AttrTypeID(attr) == proto::AttrType::LONGS) { - pt_kernel_context->EmplaceBackAttr(std::move( - phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); - } else if (AttrTypeID(attr) == proto::AttrType::INTS) { - pt_kernel_context->EmplaceBackAttr(std::move( - phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); - } else if (AttrTypeID(attr) == proto::AttrType::INT) { - pt_kernel_context->EmplaceBackAttr( - std::move(phi::IntArray(&BOOST_GET_CONST(int32_t, attr), 1))); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` to IntArray when " - "construct KernelContext.", - attr_names[i])); - } - } else { // shape is in the input - auto& ins_vector = ctx.inputs.at(attr_names[i]); - if (ins_vector.size() == 1) { // ShapeTensor + VLOG(6) << "BuildPhiKernelContext: " << attr_names[i] << ": " + << attr_defs[i].type_index; + auto 
attr_iter = Attrs().find(attr_names[i]); + switch (attr_defs[i].type_index) { + case phi::AttributeType::SCALAR: + if (attr_iter != Attrs().end()) { + // scalar is in the attribute + switch (AttrTypeID(attr_iter->second)) { + case proto::AttrType::FLOAT: + pt_kernel_context->EmplaceBackAttr(std::move( + phi::Scalar(BOOST_GET_CONST(float, attr_iter->second)))); + break; + case proto::AttrType::INT: + pt_kernel_context->EmplaceBackAttr(std::move( + phi::Scalar(BOOST_GET_CONST(int, attr_iter->second)))); + break; + case proto::AttrType::STRING: + pt_kernel_context->EmplaceBackAttr(std::move(phi::Scalar( + BOOST_GET_CONST(std::string, attr_iter->second)))); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to Scalar when construct " + "KernelContext in dygraph.", + attr_names[i])); + } + } else { // scalar is in the input + auto& ins_vector = ctx.inputs.at(attr_names[i]); pt_kernel_context->EmplaceBackAttr(std::move( - experimental::MakePhiIntArrayFromVar(*ins_vector.front()))); - } else { // ShapeTensorList - pt_kernel_context->EmplaceBackAttr( - std::move(experimental::MakePhiIntArrayFromVarList(ins_vector))); + experimental::MakePhiScalarFromVar(*ins_vector.front()))); } - } - } else if (attr_defs[i].type_index == phi::AttributeType::SCALAR) { - auto attr_iter = Attrs().find(attr_names[i]); - if (attr_iter != Attrs().end()) { // scalar is in the attribute - auto& attr = attr_iter->second; - if (AttrTypeID(attr) == proto::AttrType::FLOAT) { - pt_kernel_context->EmplaceBackAttr( - std::move(phi::Scalar(BOOST_GET_CONST(float, attr)))); - } else if (AttrTypeID(attr) == proto::AttrType::STRING) { - pt_kernel_context->EmplaceBackAttr( - std::move(phi::Scalar(BOOST_GET_CONST(std::string, attr)))); - } else if (AttrTypeID(attr) == proto::AttrType::INT) { - pt_kernel_context->EmplaceBackAttr( - std::move(phi::Scalar(BOOST_GET_CONST(int, attr)))); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` to Scalar when construct " - "KernelContext.", - attr_names[i])); - } - } else { - auto& ins_vector = ctx.inputs.at(attr_names[i]); - pt_kernel_context->EmplaceBackAttr( - std::move(experimental::MakePhiScalarFromVar(*ins_vector.front()))); - } - - } else if (attr_defs[i].type_index == phi::AttributeType::SCALARS) { - auto& attr = Attrs().at(attr_names[i]); - if (AttrTypeID(attr) == proto::AttrType::INTS) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); - } - pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); - } else if (AttrTypeID(attr) == proto::AttrType::LONGS) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); - } - pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); - } else if (AttrTypeID(attr) == proto::AttrType::FLOATS) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); - } - pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); - } else if (AttrTypeID(attr) == proto::AttrType::FLOAT64S) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); + break; + 
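// [editor's note] Illustration of the SCALAR branch above: a float attribute
// such as {"scale", 2.0f} reaches the kernel as phi::Scalar(2.0f); when the
// attribute is absent from the AttributeMap, the value is instead read from
// the op's input tensor via experimental::MakePhiScalarFromVar.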
case phi::AttributeType::INT_ARRAY: + if (attr_iter != Attrs().end()) { + switch (AttrTypeID(attr_iter->second)) { + case proto::AttrType::INTS: + pt_kernel_context->EmplaceBackAttr(std::move(phi::IntArray( + BOOST_GET_CONST(std::vector<int>, attr_iter->second)))); + break; + case proto::AttrType::LONGS: + pt_kernel_context->EmplaceBackAttr(std::move(phi::IntArray( + BOOST_GET_CONST(std::vector<int64_t>, attr_iter->second)))); + break; + case proto::AttrType::INT: + pt_kernel_context->EmplaceBackAttr(std::move(phi::IntArray( + &BOOST_GET_CONST(int32_t, attr_iter->second), 1))); + break; + case proto::AttrType::LONG: + pt_kernel_context->EmplaceBackAttr(std::move(phi::IntArray( + &BOOST_GET_CONST(int64_t, attr_iter->second), 1))); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to IntArray when " + "construct KernelContext.", + attr_names[i])); + } + } else { // shape is in the input + auto& ins_vector = ctx.inputs.at(attr_names[i]); + if (ins_vector.size() == 1) { // ShapeTensor + pt_kernel_context->EmplaceBackAttr(std::move( + experimental::MakePhiIntArrayFromVar(*ins_vector.front()))); + } else { // ShapeTensorList + pt_kernel_context->EmplaceBackAttr(std::move( + experimental::MakePhiIntArrayFromVarList(ins_vector))); + } } + break; + case phi::AttributeType::SCALARS: { + PADDLE_ENFORCE_NE( + attr_iter, Attrs().end(), + platform::errors::NotFound("(%s) is not found in AttributeMap when " + "building static KernelContext.", + attr_names[i])); + switch (AttrTypeID(attr_iter->second)) { + case proto::AttrType::INTS: { + const auto& vec = + BOOST_GET_CONST(std::vector<int>, attr_iter->second); + std::vector<phi::Scalar> scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } break; + case proto::AttrType::LONGS: { + const auto& vec = + BOOST_GET_CONST(std::vector<int64_t>, attr_iter->second); + std::vector<phi::Scalar> scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } break; + case proto::AttrType::FLOATS: { + const auto& vec = + BOOST_GET_CONST(std::vector<float>, attr_iter->second); + std::vector<phi::Scalar> scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } break; + case proto::AttrType::FLOAT64S: { + const auto& vec = + BOOST_GET_CONST(std::vector<double>, attr_iter->second); + std::vector<phi::Scalar> scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } break; +
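// [editor's note] The four SCALARS branches above differ only in the element
// type; a hypothetical helper (illustration only, not part of this patch)
// could fold them:
//
//   template <typename T>
//   std::vector<phi::Scalar> ToScalarList(const std::vector<T>& vec) {
//     std::vector<phi::Scalar> out;
//     out.reserve(vec.size());
//     for (const auto& v : vec) out.emplace_back(v);
//     return out;
//   }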
case proto::AttrType::BOOLEANS: { + const auto& vec = + BOOST_GET_CONST(std::vector<bool>, attr_iter->second); + std::vector<phi::Scalar> scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector<Scalar> when " + "construct KernelContext.", + attr_names[i])); } } break; default: { + PADDLE_ENFORCE_NE( + attr_iter, Attrs().end(), + platform::errors::NotFound("(%s) is not found in AttributeMap when " + "building static KernelContext.", + attr_names[i])); + switch (attr_defs[i].type_index) { + case phi::AttributeType::FLOAT32: + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(float, attr_iter->second)); + break; + case phi::AttributeType::INT32: + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(int, attr_iter->second)); + break; + case phi::AttributeType::BOOL: + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(bool, attr_iter->second)); + break; + case phi::AttributeType::INT64: + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(int64_t, attr_iter->second)); + break; + case phi::AttributeType::INT32S: + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector<int>, attr_iter->second)); + break; + case phi::AttributeType::DATA_TYPE: { + auto data_type = framework::TransToPhiDataType( + static_cast<framework::proto::VarType::Type>( + BOOST_GET_CONST(int, attr_iter->second))); + pt_kernel_context->EmplaceBackAttr(data_type); + } break; + case phi::AttributeType::STRING: + pt_kernel_context->EmplaceBackAttr( + std::move(BOOST_GET_CONST(std::string, attr_iter->second))); + break; + case phi::AttributeType::INT64S: + switch (AttrTypeID(attr_iter->second)) { + case proto::AttrType::LONGS: + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector<int64_t>, attr_iter->second)); + break; + case proto::AttrType::INTS: { + const auto& vector_int_attr = + BOOST_GET_CONST(std::vector<int>,
attr_iter->second); + const std::vector<int64_t> vector_int64_attr( + vector_int_attr.begin(), vector_int_attr.end()); + pt_kernel_context->EmplaceBackAttr(vector_int64_attr); + } break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector<int64_t> " + "when " + "construct KernelContext.", + attr_names[i])); + } + break; + case phi::AttributeType::FLOAT32S: + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector<float>, attr_iter->second)); + break; + case phi::AttributeType::STRINGS: + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector<std::string>, attr_iter->second)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` when construct " + "KernelContext in dygraph.", + attr_names[i])); } - } else if (attr_defs[i].type_index == phi::AttributeType::INT32S) { - const auto& vector_int_attr = - BOOST_GET_CONST(std::vector<int>, attr_it->second); - pt_kernel_context->EmplaceBackAttr(vector_int_attr); - } else if (attr_defs[i].type_index == phi::AttributeType::STRINGS) { - pt_kernel_context->EmplaceBackAttr( - BOOST_GET_CONST(std::vector<std::string>, attr_it->second)); - } else if (attr_defs[i].type_index == phi::AttributeType::FLOAT32S) { - pt_kernel_context->EmplaceBackAttr( - BOOST_GET_CONST(std::vector<float>, attr_it->second)); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` when construct " - "KernelContext.", - attr_names[i])); - } } } } diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 7d60b7d26f3fb..3f6863d642cc8 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -220,6 +220,7 @@ inline bool NeedCast(const std::shared_ptr<VarBase>& var) { paddle::platform::is_cuda_pinned_place(place) || paddle::platform::is_xpu_place(place) || paddle::platform::is_mlu_place(place) || + paddle::platform::is_custom_place(place) || paddle::platform::is_npu_place(place) || paddle::platform::is_npu_pinned_place(place)) { // CudaPinndePlace is added for varbase created by dataloader diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index bf69f6cf5ac9d..38180ba963c38 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -233,14 +233,18 @@ PreparedOp PrepareImpl( auto expected_kernel_key_library_type = expected_kernel_key.library_type_; expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; - VLOG(3) << "modifing XPU KP kernel: " << op.Type() + VLOG(3) << "modifying XPU KP kernel: " << pt_kernel_name + << ", using_kernel_key:" << expected_kernel_key; + phi::KernelKey try_pt_kernel_key = TransOpKernelTypeToPhiKernelKey(expected_kernel_key); if (!phi_kernel_factory.HasKernel(pt_kernel_name, try_pt_kernel_key)) { expected_kernel_key.library_type_ = expected_kernel_key_library_type; - VLOG(3) << "modify XPU KP kernel: " << op.Type() << " is failed " - << expected_kernel_key; + VLOG(3) << "modify XPU KP kernel: " << pt_kernel_name + << " in dynamic graph failed " << expected_kernel_key; + } else { + VLOG(3) << "modify XPU KP kernel: " << pt_kernel_name + << " in dynamic graph succeeded " << expected_kernel_key; } } } @@ -332,7 +336,7 @@ PreparedOp PrepareImpl( #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || is_xpu_unsupport)) { - VLOG(3) << "missing XPU kernel: "
<< op.Type() + VLOG(3) << "fluid missing XPU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; expected_kernel_key.place_ = platform::CPUPlace(); @@ -343,20 +347,20 @@ PreparedOp PrepareImpl( #ifdef PADDLE_WITH_XPU_KP if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { if (use_xpu_kp_kernel_rt) { - VLOG(3) << "xpu_kp using rt mode "; + VLOG(3) << "fluid xpu_kp using rt mode "; } if (use_xpu_kp_kernel_debug) { - VLOG(3) << "xpu_kp using debug mode "; + VLOG(3) << "fluid xpu_kp using debug mode "; } if (is_xpu_kp_support) { expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; kernel_iter = kernels.find(expected_kernel_key); - VLOG(3) << "using XPU KP kernel: " << op.Type() + VLOG(3) << "using fluid XPU KP kernel: " << op.Type() << ", using_kernel_key:" << expected_kernel_key; } if (!is_xpu_kp_support && (kernel_iter == kernels.end() || is_xpu_unsupport)) { - VLOG(3) << "missing XPU kernel: " << op.Type() + VLOG(3) << "fluid missing XPU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; expected_kernel_key.place_ = platform::CPUPlace(); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 9e729fee69d86..129f75e75de1e 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -220,7 +220,7 @@ class PreparedOp { static const phi::DefaultKernelSignatureMap& default_phi_kernel_sig_map; }; -const inline framework::Attribute& GetAttr( +const inline framework::Attribute* GetAttr( const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, const std::string& name) { auto it = attrs.find(name); @@ -229,10 +229,10 @@ const inline framework::Attribute& GetAttr( it = default_attrs.find(name); found = it != default_attrs.end(); } - PADDLE_ENFORCE_EQ( - found, true, - platform::errors::NotFound("(%s) is not found in AttributeMap.", name)); - return it->second; + if (found) { + return &it->second; + } + return nullptr; } template @@ -330,6 +330,7 @@ void BuildDygraphPhiKernelContext(const phi::KernelSignature& kernel_signature, } kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); } + VLOG(6) << "BuildDygraphPhiKernelContext: Inputs parsing completed."; for (size_t i = 0; i < output_names.size(); ++i) { size_t start_idx = (i == 0 ? 
0 : kernel_ctx->OutputRangeAt(i - 1).second); @@ -380,178 +381,217 @@ void BuildDygraphPhiKernelContext(const phi::KernelSignature& kernel_signature, } kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } + VLOG(6) << "BuildDygraphPhiKernelContext: Outputs parsing completed."; for (size_t i = 0; i < attr_names.size(); ++i) { - if (attr_defs[i].type_index == phi::AttributeType::INT_ARRAY) { - if (attrs.find(attr_names[i]) != - attrs.end()) { // shape is in the attribute - auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); - if (AttrTypeID(attr) == framework::proto::AttrType::LONGS) { - kernel_ctx->EmplaceBackAttr(std::move( - phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); - } else if (AttrTypeID(attr) == framework::proto::AttrType::INTS) { + VLOG(6) << "BuildDygraphPhiKernelContext: " << attr_names[i] << ": " + << attr_defs[i].type_index; + auto* attr_ptr = GetAttr(attrs, default_attrs, attr_names[i]); + switch (attr_defs[i].type_index) { + case phi::AttributeType::SCALAR: + if (attr_ptr) { + // scalar is in the attribute + auto& attr = *attr_ptr; + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::FLOAT: + kernel_ctx->EmplaceBackAttr( + std::move(phi::Scalar(BOOST_GET_CONST(float, attr)))); + break; + case framework::proto::AttrType::INT: + kernel_ctx->EmplaceBackAttr( + std::move(phi::Scalar(BOOST_GET_CONST(int, attr)))); + break; + case framework::proto::AttrType::STRING: + kernel_ctx->EmplaceBackAttr( + std::move(phi::Scalar(BOOST_GET_CONST(std::string, attr)))); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to Scalar when construct " + "KernelContext in dygraph.", + attr_names[i])); + } + } else { // scalar is in the input + auto& ins_vector = ins.at(attr_names[i]); kernel_ctx->EmplaceBackAttr(std::move( - phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); - } else if (AttrTypeID(attr) == framework::proto::AttrType::LONG) { - kernel_ctx->EmplaceBackAttr( - std::move(phi::IntArray(&BOOST_GET_CONST(int64_t, attr), 1))); - } else if (AttrTypeID(attr) == framework::proto::AttrType::INT) { - kernel_ctx->EmplaceBackAttr( - std::move(phi::IntArray(&BOOST_GET_CONST(int32_t, attr), 1))); - } else if (attr_defs[i].type_index == phi::AttributeType::INT32S) { - const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); - kernel_ctx->EmplaceBackAttr(vector_int_attr); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` to VectorTensor when " - "construct KernelContext.", - attr_names[i])); + experimental::MakePhiScalarFromVar(ins_vector[0]->Var()))); } - } else { // shape is in the input - auto& ins_vector = ins.at(attr_names[i]); - if (ins_vector.size() == 1) { // ShapeTensor - kernel_ctx->EmplaceBackAttr(std::move( - experimental::MakePhiIntArrayFromVar(ins_vector[0]->Var()))); - } else { // ShapeTensorList - std::vector variables; - variables.reserve(ins_vector.size()); - for (const auto& var_base : ins_vector) { - variables.push_back(var_base->MutableVar()); + break; + case phi::AttributeType::INT_ARRAY: + if (attr_ptr) { + auto& attr = *attr_ptr; + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::INTS: + kernel_ctx->EmplaceBackAttr(std::move( + phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); + break; + case framework::proto::AttrType::LONGS: + kernel_ctx->EmplaceBackAttr(std::move( + phi::IntArray(BOOST_GET_CONST(std::vector, attr)))); + break; + case framework::proto::AttrType::INT: + kernel_ctx->EmplaceBackAttr( + 
std::move(phi::IntArray(&BOOST_GET_CONST(int32_t, attr), 1))); + break; + case framework::proto::AttrType::LONG: + kernel_ctx->EmplaceBackAttr( + std::move(phi::IntArray(&BOOST_GET_CONST(int64_t, attr), 1))); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to IntArray when " + "construct KernelContext.", + attr_names[i])); + } + } else { // shape is in the input + auto& ins_vector = ins.at(attr_names[i]); + if (ins_vector.size() == 1) { // ShapeTensor + kernel_ctx->EmplaceBackAttr(std::move( + experimental::MakePhiIntArrayFromVar(ins_vector[0]->Var()))); + } else { // ShapeTensorList + std::vector variables; + variables.reserve(ins_vector.size()); + for (const auto& var_base : ins_vector) { + variables.push_back(var_base->MutableVar()); + } + kernel_ctx->EmplaceBackAttr( + std::move(experimental::MakePhiIntArrayFromVarList(variables))); } - kernel_ctx->EmplaceBackAttr( - std::move(experimental::MakePhiIntArrayFromVarList(variables))); - } - } - } else if (attr_defs[i].type_index == phi::AttributeType::SCALAR) { - // TODO(zhangyunfei): Scalar should hold scaler type, and we should check - // attribtue type by attr_defs - if (attrs.find(attr_names[i]) != attrs.end() || - default_attrs.find(attr_names[i]) != - default_attrs.end()) { // scalar is in the attribute - auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); - if (AttrTypeID(attr) == framework::proto::AttrType::FLOAT) { - kernel_ctx->EmplaceBackAttr( - std::move(phi::Scalar(BOOST_GET_CONST(float, attr)))); - } else if (AttrTypeID(attr) == framework::proto::AttrType::STRING) { - kernel_ctx->EmplaceBackAttr( - std::move(phi::Scalar(BOOST_GET_CONST(std::string, attr)))); - } else if (AttrTypeID(attr) == framework::proto::AttrType::INT) { - kernel_ctx->EmplaceBackAttr( - std::move(phi::Scalar(BOOST_GET_CONST(int, attr)))); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` to Scalar when construct " - "KernelContext in dygraph.", - attr_names[i])); - } - } else { // scalar is in the input - auto& ins_vector = ins.at(attr_names[i]); - kernel_ctx->EmplaceBackAttr(std::move( - experimental::MakePhiScalarFromVar(ins_vector[0]->Var()))); - } - - } else if (ins.find(attr_names[i]) != ins.end()) { - // deal tensor attr here - auto& ins_vector = ins.at(attr_names[i]); - auto tensor_attr = - experimental::MakePhiScalarFromVar(ins_vector[0]->Var()); - if (attr_defs[i].type_index == phi::AttributeType::INT32) { - int val = tensor_attr.template to(); - kernel_ctx->EmplaceBackAttr(val); - } else { - PADDLE_THROW(platform::errors::Unimplemented("only support int here")); - } - } else if (attr_defs[i].type_index == phi::AttributeType::SCALARS) { - auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); - if (AttrTypeID(attr) == framework::proto::AttrType::INTS) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); - } - kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); - } else if (AttrTypeID(attr) == framework::proto::AttrType::LONGS) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - std::vector scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); - } - kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); - } else if (AttrTypeID(attr) == framework::proto::AttrType::FLOATS) { - const auto& vec = BOOST_GET_CONST(std::vector, attr); - 
std::vector<phi::Scalar> scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); - } - kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); - } else if (AttrTypeID(attr) == framework::proto::AttrType::FLOAT64S) { - const auto& vec = BOOST_GET_CONST(std::vector<double>, attr); - std::vector<phi::Scalar> scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); } - kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); - } else if (AttrTypeID(attr) == framework::proto::AttrType::BOOLEANS) { - const auto& vec = BOOST_GET_CONST(std::vector<bool>, attr); - std::vector<phi::Scalar> scalar_list; - scalar_list.reserve(vec.size()); - for (const auto& val : vec) { - scalar_list.emplace_back(val); + break; + case phi::AttributeType::SCALARS: { + PADDLE_ENFORCE_NOT_NULL( + attr_ptr, + platform::errors::NotFound("(%s) is not found in AttributeMap when " + "building dygraph KernelContext.", + attr_names[i])); + auto& attr = *attr_ptr; + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::INTS: { + const auto& vec = BOOST_GET_CONST(std::vector<int>, attr); + std::vector<phi::Scalar> scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } break; + case framework::proto::AttrType::LONGS: { + const auto& vec = BOOST_GET_CONST(std::vector<int64_t>, attr); + std::vector<phi::Scalar> scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } break; + case framework::proto::AttrType::FLOATS: { + const auto& vec = BOOST_GET_CONST(std::vector<float>, attr); + std::vector<phi::Scalar> scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } break; + case framework::proto::AttrType::FLOAT64S: { + const auto& vec = BOOST_GET_CONST(std::vector<double>, attr); + std::vector<phi::Scalar> scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } break; + case framework::proto::AttrType::BOOLEANS: { + const auto& vec = BOOST_GET_CONST(std::vector<bool>, attr); + std::vector<phi::Scalar> scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector<Scalar> when " + "constructing KernelContext.", + attr_names[i])); } - kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` to vector<Scalar> when " - "construct KernelContext.", - attr_names[i])); - } - } else { - auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); - if (attr_defs[i].type_index == phi::AttributeType::INT32) { - kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); - } else if (attr_defs[i].type_index == phi::AttributeType::FLOAT32) { - kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); - } else if (attr_defs[i].type_index == phi::AttributeType::BOOL) { - kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); - } else if (attr_defs[i].type_index == phi::AttributeType::INT64) { - kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); - } else if (attr_defs[i].type_index == phi::AttributeType::STRING) { - kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); - } else if (attr_defs[i].type_index == phi::AttributeType::DATA_TYPE) { - auto data_type = framework::TransToPhiDataType( - static_cast<framework::proto::VarType::Type>( - BOOST_GET_CONST(int, attr))); - kernel_ctx->EmplaceBackAttr(data_type); - } else if (attr_defs[i].type_index == phi::AttributeType::INT64S) { - if (AttrTypeID(attr) == framework::proto::AttrType::LONGS) { - kernel_ctx->EmplaceBackAttr( - BOOST_GET_CONST(std::vector<int64_t>, attr)); - } else if (AttrTypeID(attr) == framework::proto::AttrType::INTS) { - // Emplace Back Attr according to the type of Phi_Kernel args. - const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr); - const std::vector<int64_t> vector_int64_attr(vector_int_attr.begin(), - vector_int_attr.end()); - kernel_ctx->EmplaceBackAttr(vector_int64_attr); + } break; + default: { + PADDLE_ENFORCE_NOT_NULL( + attr_ptr, + platform::errors::NotFound("(%s) is not found in AttributeMap when " + "building dygraph KernelContext.", + attr_names[i])); + auto& attr = *attr_ptr; + switch (attr_defs[i].type_index) { + case phi::AttributeType::FLOAT32: + kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); + break; + case phi::AttributeType::INT32: + kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); + break; + case phi::AttributeType::BOOL: + kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); + break; + case phi::AttributeType::INT64: + kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); + break; + case phi::AttributeType::INT32S: + kernel_ctx->EmplaceBackAttr( + BOOST_GET_CONST(std::vector<int>, attr)); + break; + case phi::AttributeType::DATA_TYPE: { + auto data_type = framework::TransToPhiDataType( + static_cast<framework::proto::VarType::Type>( + BOOST_GET_CONST(int, attr))); + kernel_ctx->EmplaceBackAttr(data_type); + } break; + case phi::AttributeType::STRING: + kernel_ctx->EmplaceBackAttr( + std::move(BOOST_GET_CONST(std::string, attr))); + break; + case phi::AttributeType::INT64S: { + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::LONGS: + kernel_ctx->EmplaceBackAttr( + BOOST_GET_CONST(std::vector<int64_t>, attr)); + break; + case framework::proto::AttrType::INTS: { + const auto& vector_int_attr = + BOOST_GET_CONST(std::vector<int>, attr); + const std::vector<int64_t> vector_int64_attr( + vector_int_attr.begin(), vector_int_attr.end()); + kernel_ctx->EmplaceBackAttr(vector_int64_attr); + } break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector<int64_t> " + "when constructing KernelContext.", + attr_names[i])); + } + } break; + case phi::AttributeType::FLOAT32S: + kernel_ctx->EmplaceBackAttr( + BOOST_GET_CONST(std::vector<float>, attr)); + break; + case phi::AttributeType::STRINGS: + kernel_ctx->EmplaceBackAttr( + BOOST_GET_CONST(std::vector<std::string>, attr)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` when constructing " + "KernelContext in dygraph.", + attr_names[i])); } - } else if (attr_defs[i].type_index == phi::AttributeType::INT32S) { - kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::vector<int>, attr)); - } else if (attr_defs[i].type_index == phi::AttributeType::STRINGS) { - kernel_ctx->EmplaceBackAttr( - BOOST_GET_CONST(std::vector<std::string>, attr)); - } else if (attr_defs[i].type_index == phi::AttributeType::FLOAT32S) { - kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::vector<float>, attr)); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported cast op attribute `%s` when construct " - "KernelContext in dygraph.", - attr_names[i])); } } } + VLOG(6) <<
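The rewritten attribute loop above replaces a long if/else-if chain keyed on attr_defs[i].type_index with two nested switches: the outer one dispatches on the attribute type the phi kernel declares, the inner one on the proto type the op actually stores, and every default case throws Unimplemented. A stripped-down sketch of that shape, with hypothetical enums standing in for phi::AttributeType and framework::proto::AttrType:

    enum class KernelAttr { kScalar, kIntArray };
    enum class StoredAttr { kInt, kInts, kLongs };

    void EmplaceAttr(KernelAttr wanted, StoredAttr stored) {
      switch (wanted) {              // what the kernel signature declares
        case KernelAttr::kIntArray:
          switch (stored) {          // what the op's AttributeMap holds
            case StoredAttr::kInts:  /* widen int -> int64_t, then emplace */ break;
            case StoredAttr::kLongs: /* emplace as-is */ break;
            default:                 /* throw Unimplemented */ break;
          }
          break;
        case KernelAttr::kScalar:    /* one inner case per scalar proto type */ break;
      }
    }

Compared with the chain of equality tests, every conversion for one declared type now sits in a single case block, and the unhandled type combinations are explicit.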
"BuildDygraphPhiKernelContext: Attributes parsing completed."; } template diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 6c31b025507f8..7b274339e3cbe 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -220,30 +220,34 @@ void Tracer::TraceOpImpl(const std::string& type, attr_checker == nullptr ? empty_attrs_map : attr_checker->GetDefaultAttrMap(); - NameVarMap new_ins = ins; + std::unique_ptr> ins_amp = nullptr; if (amp_level_ == AmpLevel::O1) { if (amp_dtype_ == phi::DataType::FLOAT16) { const auto& tracer = imperative::GetCurrentTracer(); - new_ins = - imperative::AutoTuneLayout(type, ins, outs, &attrs, tracer); VLOG(5) << "Float16 Auto Mixed Precision O1 run operator: " << type; - new_ins = AutoCastInputs(type, new_ins); + ins_amp = std::make_unique>( + AutoCastInputs(type, imperative::AutoTuneLayout( + type, ins, outs, &attrs, tracer))); } else if (amp_dtype_ == phi::DataType::BFLOAT16) { VLOG(5) << "BFloat16 Auto Mixed Precision O1 run operator: " << type; - new_ins = AutoCastBF16Inputs(type, ins); + ins_amp = std::make_unique>( + AutoCastBF16Inputs(type, ins)); } } else if (amp_level_ == AmpLevel::O2) { if (amp_dtype_ == phi::DataType::FLOAT16) { const auto& tracer = imperative::GetCurrentTracer(); - new_ins = - imperative::AutoTuneLayout(type, ins, outs, &attrs, tracer); VLOG(5) << "Float16 Auto Mixed Precision O2 run operator: " << type; - new_ins = CastPureFp16Inputs(type, new_ins); + ins_amp = + std::make_unique>(CastPureFp16Inputs( + type, imperative::AutoTuneLayout(type, ins, outs, &attrs, + tracer))); } else if (amp_dtype_ == phi::DataType::BFLOAT16) { VLOG(5) << "BFloat16 Auto Mixed Precision O2 run operator: " << type; - new_ins = CastPureBf16Inputs(type, ins); + ins_amp = std::make_unique>( + CastPureBf16Inputs(type, ins)); } } + const auto& new_ins = ins_amp == nullptr ? 
ins : *ins_amp; try { if (platform::is_gpu_place(place)) { diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 015f4471a0246..4f0d4a908380f 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -48,6 +48,7 @@ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/common/place.h" #include "paddle/utils/string/split.h" #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) @@ -1641,7 +1642,9 @@ AnalysisPredictor::~AnalysisPredictor() { StatisticShapeRangeInfo(); } - memory::Release(place_); + if (place_.GetType() != phi::AllocationType::UNDEFINED) { + memory::Release(place_); + } } std::unique_ptr AnalysisPredictor::Clone() { diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 5e1a9b85ff586..0c68acfe98047 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -674,8 +674,39 @@ void Tensor::ORTCopyFromCpu(const T *data) { OrtMemTypeDefault); size_t size = std::accumulate(begin(shape_), end(shape_), 1UL, std::multiplies()); - auto ort_value = GetOrtVaule(memory_info, const_cast(data), size, - shape_.data(), shape_.size()); + size_t buffer_size = size * sizeof(T); + if (buffer_size > buffer_.size()) { + buffer_.resize(buffer_size); + } + std::memcpy(static_cast(buffer_.data()), data, buffer_size); + + auto onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; + if (std::is_same::value) { + onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; + } else if (std::is_same::value) { + onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE; + } else if (std::is_same::value) { + onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64; + } else if (std::is_same::value) { + onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32; + } else if (std::is_same::value) { + onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8; + } else if (std::is_same::value) { + onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8; + } else if (std::is_same::value) { + onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16; + } + + if (onnx_dtype == ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED) { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Found undefined data type for onnxruntime, only supports " + "float16/float32/float64/int8/uint8/int32/int64.")); + } + + auto ort_value = + Ort::Value::CreateTensor(memory_info, buffer_.data(), buffer_size, + shape_.data(), shape_.size(), onnx_dtype); + binding->BindInput(name_.c_str(), ort_value); } diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 6f99ed6e25a28..3cd2df3aef639 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -187,6 +187,7 @@ class PD_INFER_DECL Tensor { #ifdef PADDLE_WITH_ONNXRUNTIME bool is_ort_tensor_{false}; std::vector shape_; + std::vector buffer_; std::weak_ptr binding_; int idx_{-1}; diff --git a/paddle/fluid/memory/cuda_managed_memory_test.cu b/paddle/fluid/memory/cuda_managed_memory_test.cu index f8c9ff82f5712..f4b4294b5bdbf 100644 --- a/paddle/fluid/memory/cuda_managed_memory_test.cu +++ b/paddle/fluid/memory/cuda_managed_memory_test.cu @@ -107,7 +107,7 @@ TEST(ManagedMemoryTest, OversubscribeGPUMemoryTest) { uint64_t available_mem = platform::GpuAvailableMemToAlloc(); uint64_t 
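The ORTCopyFromCpu rewrite above does two things: it snapshots the host data into the tensor-owned buffer_ (the buffer-taking Ort::Value::CreateTensor overload wraps memory rather than copying it, so the storage must outlive the binding), and it spells out the ONNX element type explicitly instead of funneling through the old GetOrtVaule helper. A sketch of the compile-time mapping it performs, with the template brackets restored (the ONNX_TENSOR_ELEMENT_DATA_TYPE_* constants come from the onnxruntime C API; the float16 type name is Paddle's):

    template <typename T>
    ONNXTensorElementDataType OnnxDtypeOf() {
      if (std::is_same<T, float>::value)   return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
      if (std::is_same<T, double>::value)  return ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE;
      if (std::is_same<T, int64_t>::value) return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
      if (std::is_same<T, int32_t>::value) return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32;
      if (std::is_same<T, uint8_t>::value) return ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8;
      if (std::is_same<T, int8_t>::value)  return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8;
      if (std::is_same<T, paddle::platform::float16>::value)
        return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
      return ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED;  // caller throws on this
    }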
n_data = available_mem * 2 / sizeof(int) + 1; // requires more than 2 * available_mem bytes - uint64_t step = 1024; + uint64_t step = std::max(n_data / 1024, static_cast<uint64_t>(1)); AllocationPtr data_allocation = Alloc(platform::CUDAPlace(0), n_data * sizeof(int)); AllocationPtr sum_allocation = Alloc(platform::CUDAPlace(0), sizeof(int)); @@ -115,8 +115,8 @@ int* sum = static_cast<int*>(sum_allocation->ptr()); (*sum) = 0; - write_kernel<<<5120, 1024>>>(data, n_data, step); - sum_kernel<<<5120, 1024>>>(data, n_data, step, sum); + write_kernel<<<1, 1024>>>(data, n_data, step); + sum_kernel<<<1, 1024>>>(data, n_data, step, sum); #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); diff --git a/paddle/fluid/memory/stats.h b/paddle/fluid/memory/stats.h index f644d2f5875da..0906567dbf6c1 100644 --- a/paddle/fluid/memory/stats.h +++ b/paddle/fluid/memory/stats.h @@ -107,7 +107,7 @@ void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment); break #define MEMORY_STAT_FUNC(item, id, func, ...) \ - do { \ + [&] { \ paddle::memory::StatBase* stat = nullptr; \ switch (id) { \ MEMORY_STAT_FUNC_SWITHCH_CASE(item, 0); \ @@ -133,8 +133,8 @@ void StatUpdate(const std::string& stat_type, int dev_id, int64_t increment); id)); \ break; \ } \ - stat->func(__VA_ARGS__); \ - } while (0) + return stat->func(__VA_ARGS__); \ + }() #define MEMORY_STAT_CURRENT_VALUE(item, id) \ MEMORY_STAT_FUNC(item, id, GetCurrentValue) diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc index b974f606720b2..8354650df0237 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc @@ -68,6 +68,18 @@ class UpdateLossScalingOp : public framework::OperatorWithKernel { return framework::OpKernelType(dtype, ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { +#ifndef PADDLE_WITH_XPU + if (var_name == "FoundInfinite" || var_name == "StopUpdate") { + return expected_kernel_type; + } +#endif + return framework::OperatorWithKernel::GetKernelTypeForVar( + var_name, tensor, expected_kernel_type); + } }; class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker { @@ -93,6 +105,10 @@ class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("LossScaling", "(Tensor) 1-dim tensor, updated loss scaling."); AddOutput("OutGoodSteps", "(Tensor) 1-dim tensor, updated good steps."); AddOutput("OutBadSteps", "(Tensor) 1-dim tensor, updated bad steps."); + AddOutput("StopUpdate", + "(Tensor) 1-dim tensor. Stop updating loss scaling, and just " + "zero inputs. It has higher priority than Attr(stop_update).") + .AsDispensable(); AddAttr<int>("incr_every_n_steps", "A value represents increasing loss scaling every n " "consecutive steps with finite gradients."); @@ -131,8 +147,8 @@ decr_every_n_nan_or_inf steps and each step some gradients are infinite.
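The stats.h hunk above swaps the classic `do { ... } while (0)` statement macro for an immediately-invoked lambda. Both forms behave as a single statement at the call site, but only the lambda has a value, which is what lets MEMORY_STAT_CURRENT_VALUE and its siblings hand the result of `stat->func(...)` back to the caller. The pattern in isolation (plain C++, nothing Paddle-specific):

    // Before: the result of func() was discarded.
    #define CALL_STMT(obj, func, ...) \
      do { (obj)->func(__VA_ARGS__); } while (0)

    // After: the macro is an expression; `return` also works when func is void.
    #define CALL_EXPR(obj, func, ...) \
      [&] { return (obj)->func(__VA_ARGS__); }()

    // usage: int64_t cur = CALL_EXPR(stat, GetCurrentValue);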
} }; -template -class UpdateLossScalingFunctor { +template +class UpdateLossScalingFunctor { public: void operator()(const platform::CPUDeviceContext& ctx, const bool* found_inf_data, const T* pre_loss_scaling_data, @@ -141,6 +157,10 @@ class UpdateLossScalingFunctor { const int decr_every_n_nan_or_inf, const float incr_ratio, const float decr_ratio, T* updated_loss_scaling_data, int* good_out_data, int* bad_out_data) const { + PADDLE_ENFORCE_EQ( + IsFoundInfOnCPU, true, + platform::errors::InvalidArgument( + "The Input(FoundInfinite) should be on the CPUPlace.")); Update(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, updated_loss_scaling_data, good_out_data, diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu index 6d9cd96a3fb9a..43f8f84578c70 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cu +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu @@ -21,9 +21,9 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template __global__ void GpuUpdateLossScaling( - const bool* found_inf_data, const T* pre_loss_scaling_data, + const FoundNanInfFlagT found_inf_data, const T* pre_loss_scaling_data, const int* good_in_data, const int* bad_in_data, const int incr_every_n_steps, const int decr_every_n_nan_or_inf, const float incr_ratio, const float decr_ratio, @@ -70,8 +70,9 @@ __global__ void FusedFillIf(T** outs, const size_t xs_size, } } -template -class UpdateLossScalingFunctor { +template +class UpdateLossScalingFunctor { public: void operator()(const platform::CUDADeviceContext& dev_ctx, const bool* found_inf_data, const T* pre_loss_scaling_data, @@ -80,10 +81,17 @@ class UpdateLossScalingFunctor { const int decr_every_n_nan_or_inf, const float incr_ratio, const float decr_ratio, T* updated_loss_scaling_data, int* good_out_data, int* bad_out_data) const { - GpuUpdateLossScaling<<<1, 1, 0, dev_ctx.stream()>>>( - found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data, - incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, - updated_loss_scaling_data, good_out_data, bad_out_data); + if (IsFoundInfOnCPU) { + GpuUpdateLossScaling<<<1, 1, 0, dev_ctx.stream()>>>( + *found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data, + incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, + updated_loss_scaling_data, good_out_data, bad_out_data); + } else { + GpuUpdateLossScaling<<<1, 1, 0, dev_ctx.stream()>>>( + found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data, + incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, + updated_loss_scaling_data, good_out_data, bad_out_data); + } } }; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.h b/paddle/fluid/operators/amp/update_loss_scaling_op.h index d6eddd36a4551..41eb94247f593 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.h +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.h @@ -25,6 +25,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -40,8 +41,16 @@ inline HOSTDEVICE bool check_finite(T value) { #endif } -template -inline HOSTDEVICE void Update(const bool* found_inf_data, +inline HOSTDEVICE bool IsFoundNanInf(const bool found_nan_inf_data) { + return 
found_nan_inf_data; +} + +inline HOSTDEVICE bool IsFoundNanInf(const bool* found_nan_inf_data) { + return *found_nan_inf_data; +} + +template +inline HOSTDEVICE void Update(const FoundInfFlagT found_inf_data, const T* pre_loss_scaling_data, const int* good_in_data, const int* bad_in_data, const int incr_every_n_steps, @@ -49,7 +58,7 @@ inline HOSTDEVICE void Update(const bool* found_inf_data, const float incr_ratio, const float decr_ratio, T* updated_loss_scaling_data, int* good_out_data, int* bad_out_data) { - if (*found_inf_data) { + if (IsFoundNanInf(found_inf_data)) { *good_out_data = 0; *bad_out_data = *bad_in_data + 1; if (*bad_out_data == decr_every_n_nan_or_inf) { @@ -72,7 +81,7 @@ inline HOSTDEVICE void Update(const bool* found_inf_data, } } -template +template class UpdateLossScalingFunctor { public: void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data, @@ -106,9 +115,33 @@ class UpdateLossScalingKernel : public framework::OpKernel { platform::errors::InvalidArgument( "FoundInfinite must has only one element.")); const bool* found_inf_data = found_inf->data(); + bool is_found_inf_on_cpu = platform::is_cpu_place(found_inf->place()); + + if (is_found_inf_on_cpu) { + if (*found_inf_data) { + phi::funcs::SetConstant set_constant; + for (auto* out : outs) { + out->mutable_data(dev_ctx.GetPlace()); + set_constant(dev_ctx, out, static_cast(0)); + } + } + } else { + LazyZeros{}(dev_ctx, found_inf_data, xs, outs); + } - LazyZeros{}(dev_ctx, found_inf_data, xs, outs); - const bool stop_update = ctx.Attr("stop_update"); + const auto* stop_update_tensor = ctx.Input("StopUpdate"); + bool stop_update = false; + if (stop_update_tensor && stop_update_tensor->IsInitialized()) { + if (platform::is_cpu_place(stop_update_tensor->place())) { + stop_update = stop_update_tensor->data()[0]; + } else { + framework::Tensor tmp_tensor; + framework::TensorCopySync(*stop_update_tensor, platform::CPUPlace(), + &tmp_tensor); + stop_update = tmp_tensor.data()[0]; + } + } + stop_update |= ctx.Attr("stop_update"); if (stop_update) { return; } @@ -133,10 +166,17 @@ class UpdateLossScalingKernel : public framework::OpKernel { ctx.Attr("decr_every_n_nan_or_inf"); const float incr_ratio = ctx.Attr("incr_ratio"); const float decr_ratio = ctx.Attr("decr_ratio"); - UpdateLossScalingFunctor{}( - dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data, - bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, - decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data); + if (is_found_inf_on_cpu) { + UpdateLossScalingFunctor{}( + dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data, + bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, + decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data); + } else { + UpdateLossScalingFunctor{}( + dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data, + bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, + decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data); + } } }; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index 1393da7dd57a7..5808841333f08 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -131,7 +131,8 @@ void Update(const platform::NPUDeviceContext& ctx, } template -class UpdateLossScalingFunctor { +class UpdateLossScalingFunctor { public: void operator()(const 
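The two IsFoundNanInf overloads above are what let the shared HOSTDEVICE Update body serve both placements of the found-infinite flag: when the flag has already been copied to the CPU the code is instantiated with a plain bool, and when it still lives in GPU memory it receives a const bool*. Overload resolution picks the right reader and the control flow is untouched. Reduced to its skeleton:

    inline bool IsFoundNanInf(bool flag) { return flag; }          // value on host
    inline bool IsFoundNanInf(const bool* flag) { return *flag; }  // behind a pointer

    template <typename FoundInfFlagT>  // bool or const bool*
    void UpdateSketch(FoundInfFlagT found_inf) {
      if (IsFoundNanInf(found_inf)) {
        // bad step: zero the good counter, shrink the loss scale if due
      } else {
        // good step: grow the loss scale after enough finite steps
      }
    }

On the CUDA side this also spares the single-thread GpuUpdateLossScaling launch a global-memory dereference whenever the flag is already host-resident.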
platform::NPUDeviceContext& dev_ctx, const std::vector found_inf_vec, @@ -236,7 +237,7 @@ class UpdateLossScalingNPUKernel : public framework::OpKernel { ctx.Attr("decr_every_n_nan_or_inf"); const float incr_ratio = ctx.Attr("incr_ratio"); const float decr_ratio = ctx.Attr("decr_ratio"); - UpdateLossScalingFunctor{}( + UpdateLossScalingFunctor{}( dev_ctx, found_inf_vec, pre_loss_scaling, good_in, bad_in, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, updated_loss_scaling, good_out, bad_out); diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index 784385d79bd3e..96b27a833fba3 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -19,6 +19,9 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/phi/api/include/tensor.h" + namespace paddle { namespace operators { @@ -42,6 +45,20 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; auto place = ctx.GetPlace(); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(rid)) { + // Use ProcessGroup + distributed::ProcessGroup *pg = map->get(rid); + std::vector out_tensor; + auto out_shape = ctx.Attr>("out_shape"); + auto out = ctx.Output("Out"); + auto out_dims = out->dims(); + out->mutable_data(out_dims, place); + + out_tensor.emplace_back(*out); + auto task = pg->Recv(out_tensor, peer); + return; + } auto comm = platform::NCCLCommContext::Instance().Get(rid, place); if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc index 6a2244b91025a..c31f1210f0422 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc @@ -18,6 +18,8 @@ limitations under the License. */ #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/npu/hccl_helper.h" #endif +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/phi/api/include/tensor.h" namespace paddle { namespace operators { @@ -35,6 +37,15 @@ class CRecvOpASCENDKernel : public framework::OpKernel { platform::ToHCCLDataType(framework::TransToProtoVarType(out->dtype())); int ring_id = ctx.Attr("ring_id"); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(ring_id)) { + // Use ProcessGroup + distributed::ProcessGroup* pg = map->get(ring_id); + std::vector out_tensor; + out_tensor.emplace_back(*out); + auto task = pg->Recv(out_tensor, 0); + return; + } auto place = ctx.GetPlace(); auto comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index 3e565d1b975bc..add352306fa28 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -18,6 +18,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/phi/api/include/tensor.h" namespace paddle { namespace operators { @@ -39,6 +41,16 @@ class SendOpV2CUDAKernel : public framework::OpKernel { peer, 0, platform::errors::InvalidArgument( "The peer (%d) for send_v2 op must be non-negative.", peer)); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(rid)) { + // Use ProcessGroup + distributed::ProcessGroup* pg = map->get(rid); + std::vector in_tensor; + auto x = ctx.Input("X"); + in_tensor.push_back(*x); + auto task = pg->Send(in_tensor, peer); + return; + } gpuStream_t stream = nullptr; auto place = ctx.GetPlace(); auto comm = platform::NCCLCommContext::Instance().Get(rid, place); diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc index 3bc5487371bac..2d7382f3dfd70 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc @@ -18,6 +18,8 @@ limitations under the License. */ #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/npu/hccl_helper.h" #endif +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/phi/api/include/tensor.h" namespace paddle { namespace operators { @@ -34,6 +36,16 @@ class CSendOpASCENDKernel : public framework::OpKernel { platform::ToHCCLDataType(framework::TransToProtoVarType(x->dtype())); int ring_id = ctx.Attr("ring_id"); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(ring_id)) { + // Use ProcessGroup + distributed::ProcessGroup* pg = map->get(ring_id); + std::vector in_tensor; + auto x = ctx.Input("X"); + in_tensor.push_back(*x); + auto task = pg->Send(in_tensor, 1); + return; + } auto place = ctx.GetPlace(); auto comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); diff --git a/paddle/fluid/operators/einsum_op.cc b/paddle/fluid/operators/einsum_op.cc index 8cd8d94d6b389..8fdde1ccdc058 100644 --- a/paddle/fluid/operators/einsum_op.cc +++ b/paddle/fluid/operators/einsum_op.cc @@ -18,7 +18,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/impl/einsum_impl.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -85,7 +85,7 @@ class EinsumGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; DECLARE_INFER_SHAPE_FUNCTOR(einsum, EinsumInferShapeFunctor, - PD_INFER_META(phi::EinsumInferShape)); + PD_INFER_META(phi::EinsumInferMeta)); REGISTER_OPERATOR(einsum, ops::EinsumOp, ops::EinsumOpMaker, EinsumInferShapeFunctor, diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index b3ac3606eaf8e..c5adee547bdac 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -23,6 +23,7 @@ limitations under the License. 
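Each send/recv kernel in the hunks above now begins with the same dispatch: consult ProcessGroupMapFromGid for the ring id and, when a ProcessGroup is registered, route the tensors through pg->Send / pg->Recv and return before any raw NCCL/HCCL setup. A sketch of the shape (simplified signature; the real kernels also handle out_shape, dtype checks, and the NPU's fixed peer ranks):

    void SendV2Sketch(int ring_id, const phi::DenseTensor& x, int peer) {
      auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance();
      if (map->has(ring_id)) {                  // new collective stack
        std::vector<phi::DenseTensor> in_tensor{x};
        auto task = map->get(ring_id)->Send(in_tensor, peer);
        return;                                 // stream/comm owned by the group
      }
      // legacy path: NCCLCommContext / HCCLCommContext lookup and the raw
      // ncclSend / HcclSend call, unchanged by this PR.
    }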
*/ #include "paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h" #include "paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); @@ -33,6 +34,7 @@ namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; USE_OP_ITSELF(batch_norm); +PD_DECLARE_KERNEL(batch_norm, GPU, ALL_LAYOUT); USE_CUDA_ONLY_OP(fused_bn_add_activation); USE_CUDA_ONLY_OP(fused_bn_add_activation_grad); diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index a80f590aa495d..884fca2c1b0b8 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -164,6 +164,7 @@ void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, attrs.insert({"groups", groups}); attrs.insert({"exhaustive_search", exhaustive_search}); attrs.insert({"use_addto", use_addto}); + attrs.insert({"workspace_size_MB", 512}); auto op = framework::OpRegistry::CreateOp( "conv2d_grad", {{"Input", {"Input"}}, @@ -408,7 +409,7 @@ TEST(CudnnNormConvFp16, K1S1) { platform::CUDADeviceContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); - if (ctx->GetComputeCapability() <= 70) { + if (ctx->GetComputeCapability() < 70) { ASSERT_THROW(test.CheckForward(1e-3, true), paddle::platform::EnforceNotMet); ASSERT_THROW(test.CheckBackward(1e-3, true), @@ -434,7 +435,7 @@ TEST(CudnnNormConvFp16, K3S1) { platform::CUDADeviceContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); - if (ctx->GetComputeCapability() <= 70) { + if (ctx->GetComputeCapability() < 70) { ASSERT_THROW(test.CheckForward(1e-3, true), paddle::platform::EnforceNotMet); ASSERT_THROW(test.CheckBackward(1e-3, true), @@ -460,7 +461,7 @@ TEST(CudnnNormConvFp16, K1S1O4) { platform::CUDADeviceContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); - if (ctx->GetComputeCapability() <= 70) { + if (ctx->GetComputeCapability() < 70) { ASSERT_THROW(test.CheckForward(1e-3, true), paddle::platform::EnforceNotMet); ASSERT_THROW(test.CheckBackward(1e-3, true), diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index 6bf3a7114f4ce..0fe76fa23a637 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -43,9 +43,17 @@ inline platform::GpuLaunchConfig Get1DBlocksAnd2DGrids( const platform::CUDADeviceContext &ctx, const uint32_t rows, const uint32_t cols, const int vec_size) { const uint32_t tmp_cols = cols / vec_size; - int threads = std::max( - static_cast(32), - std::min(tmp_cols, static_cast(ctx.GetMaxThreadsPerBlock()))); + // NOTE(wangxi): We set max_block_size to 512, for `FusedResidualDropoutBias` + // needs too many register resources. If data_type is float16, CUDA + // error(701) will occur when block_size is 1024. Which error is + // 'cudaErrorLaunchOutOfResources', this indicates that a launch did not + // occur because it did not have appropriate resources. + // Of course, this kernel can be optimized later to reduce the use + // of registers. 
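+ // Illustrative arithmetic (not measured numbers): at 64 registers per
+ // thread, a 1024-thread block would need 64K registers, the entire
+ // register file of a typical SM, so the launch cannot be scheduled;
+ // a 512-thread block halves the demand and fits with headroom.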
+ int threads = + std::max(static_cast(32), + std::min(tmp_cols, static_cast(std::min( + ctx.GetMaxThreadsPerBlock(), 512)))); const auto blocks_x = std::max(static_cast(1), (tmp_cols + threads - 1) / threads); const auto blocks_y = std::max(static_cast(1), rows); diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index d53a24a57e3cc..aa613dd3f5ce0 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -156,9 +156,9 @@ __global__ void FusedLayernormResidualDropoutBias( } /* -* @brief layernorm(residual + dropout(x)); + * @brief layernorm(residual + dropout(x)); * Conditions: - * (1) The number of cols is 1024; + * (1) The number of cols is 768/1024/4096; * (2) layer_norm scale and bias is not null; * (3) linear bias is null; * @param @@ -166,6 +166,7 @@ __global__ void FusedLayernormResidualDropoutBias( * cols: 1024 * x_: [rows, cols], inputs * residual_:[rows, cols] + * bias_: [cols], linear bias, can be null * gamma_: [cols]: layernorm scale, not null * beta_: [cols], layernorm bias, not null * mask_out_: [rows, cols], dropout result @@ -173,7 +174,7 @@ __global__ void FusedLayernormResidualDropoutBias( * y_: [rows, cols], layernorm result * mean_out_: [rows]: layernorm means * var_out_: [rows]: layernorm vars -*/ + */ template < typename T, typename U, typename ScaleT = U, typename MaskType = uint8_t, int VecSize = 8, int WARPS_M = 4, int WARPS_N = 1, int BYTES_PER_LDG = 16, @@ -182,14 +183,16 @@ template < int THREADS_PER_CTA = WARPS_M *THREADS_PER_ROW, int ROWS_PER_CTA = WARPS_M, int ELTS_PER_ROW_PER_CTA = THREADS_PER_ROW *VecSize, int LDGS = ELTS_PER_ROW / ELTS_PER_ROW_PER_CTA> -__global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( +__global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( int rows, int cols, uint64_t seed, const float dropout_prob, const bool is_upscale_in_train, const bool is_test, const uint64_t increment, const float epsilon, const T *__restrict__ x_ptr, - const T *__restrict__ residual_ptr, const ScaleT *__restrict__ gamma_ptr, - const ScaleT *__restrict__ beta_ptr, MaskType *__restrict__ mask_out_ptr, - U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr, - T *__restrict__ residual_out_ptr, T *__restrict__ y_ptr) { + const T *__restrict__ residual_ptr, const T *__restrict__ bias_ptr, + const ScaleT *__restrict__ gamma_ptr, const ScaleT *__restrict__ beta_ptr, + MaskType *__restrict__ mask_out_ptr, U *__restrict__ mean_out_ptr, + U *__restrict__ var_out_ptr, T *__restrict__ residual_out_ptr, + T *__restrict__ y_ptr) { + __shared__ U smem[WARPS_M * WARPS_N]; using Vec = phi::AlignedVector; using Vec_scale = phi::AlignedVector; using MaskStoreT = phi::AlignedVector; @@ -204,12 +207,22 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( const int c = warp_n * THREADS_PER_WARP + lane; // lane const int r = bidx * ROWS_PER_CTA + warp_m; // row id - int idx = r * LN_NUM_COLS + c; + int idx = r * ELTS_PER_ROW + c; curandStatePhilox4_32_10_t state; curand_init(seed, idx, increment, &state); T factor = GetFactor(dropout_prob, is_upscale_in_train, is_test); + // bias + Vec bias[LDGS]; + if (bias_ptr != nullptr) { +#pragma unroll + for (int it = 0, col = c; it < LDGS; it++) { + phi::Load(bias_ptr + col * VecSize, &bias[it]); + col += THREADS_PER_ROW; + } + } + Vec_scale gamma[LDGS]; Vec_scale 
beta[LDGS]; #pragma unroll @@ -219,14 +232,14 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( col += THREADS_PER_ROW; } - constexpr U rn = 1.f / U(LN_NUM_COLS); + constexpr U rn = 1.f / U(ELTS_PER_ROW); for (int row = r; row < rows; row += gridDim.x * ROWS_PER_CTA) { Vec x[LDGS]; Vec residual[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - phi::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]); - phi::Load(residual_ptr + row * LN_NUM_COLS + col * VecSize, + phi::Load(x_ptr + row * ELTS_PER_ROW + col * VecSize, &x[it]); + phi::Load(residual_ptr + row * ELTS_PER_ROW + col * VecSize, &residual[it]); col += THREADS_PER_ROW; } @@ -255,14 +268,28 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( // 4 * 8 U xf[LDGS * VecSize]; + if (bias_ptr != nullptr) { #pragma unroll - for (int it = 0; it < LDGS; it++) { + for (int it = 0; it < LDGS; it++) { #pragma unroll - for (int jt = 0; jt < VecSize; jt++) { - // dropout(x) + residual - x[it][jt] = x[it][jt] * static_cast(mask_vec[it][jt]) * factor + - residual[it][jt]; - xf[it * VecSize + jt] = U(x[it][jt]); + for (int jt = 0; jt < VecSize; jt++) { + // dropout(x) + residual + x[it][jt] = (x[it][jt] + bias[it][jt]) * + static_cast(mask_vec[it][jt]) * factor + + residual[it][jt]; + xf[it * VecSize + jt] = U(x[it][jt]); + } + } + } else { +#pragma unroll + for (int it = 0; it < LDGS; it++) { +#pragma unroll + for (int jt = 0; jt < VecSize; jt++) { + // dropout(x) + residual + x[it][jt] = x[it][jt] * static_cast(mask_vec[it][jt]) * factor + + residual[it][jt]; + xf[it * VecSize + jt] = U(x[it][jt]); + } } } @@ -270,9 +297,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { phi::Store( - x[it], residual_out_ptr + row * LN_NUM_COLS + col * VecSize); + x[it], residual_out_ptr + row * ELTS_PER_ROW + col * VecSize); phi::Store( - mask_vec[it], mask_out_ptr + row * LN_NUM_COLS + col * VecSize); + mask_vec[it], mask_out_ptr + row * ELTS_PER_ROW + col * VecSize); col += THREADS_PER_ROW; } @@ -289,6 +316,22 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( for (int it = 1; it < THREADS_PER_WARP; it *= 2) { mu_local += __shfl_xor_sync(uint32_t(-1), mu_local, it); } + if (WARPS_N > 1) { + if (lane == 0) { + smem[warp_m * WARPS_N + warp_n] = mu_local; + } + __syncthreads(); + if (tidx == 0) { + mu_local = 0.f; +#pragma unroll + for (int it = 0; it < WARPS_N; ++it) { + mu_local += smem[warp_m * WARPS_N + it]; + } + smem[warp_m] = mu_local; + } + __syncthreads(); + mu_local = smem[warp_m]; + } mu_local *= rn; if (lane == 0) { mean_out_ptr[row] = mu_local; @@ -308,6 +351,22 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( for (int it = 1; it < THREADS_PER_WARP; it *= 2) { var_local += __shfl_xor_sync(uint32_t(-1), var_local, it); } + if (WARPS_N > 1) { + if (lane == 0) { + smem[warp_m * WARPS_N + warp_n] = var_local; + } + __syncthreads(); + if (tidx == 0) { + var_local = 0.f; +#pragma unroll + for (int it = 0; it < WARPS_N; ++it) { + var_local += smem[warp_m * WARPS_N + it]; + } + smem[warp_m] = var_local; + } + __syncthreads(); + var_local = smem[warp_m]; + } U rsigma = rsqrtf(var_local * rn + epsilon); if (lane == 0) { // Note: the stored var is different for paddle(ln) and apex (fast ln). 
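The smem additions in this hunk extend the kernel's row reductions (mean, then variance) from one warp to several: stage one is the intra-warp __shfl_xor_sync tree the 1024-column kernel already had, stage two parks one partial per warp in shared memory and folds them. Note that in this launcher WARPS_N > 1 only ever pairs with WARPS_M == 1 (WARPS_M = 4 / WARPS_N), which is why the single thread tidx == 0 suffices for the fold. The two stages in isolation, as a fragment reusing the hunk's names (lane, tidx, warp_m, warp_n):

    __shared__ float smem[WARPS_M * WARPS_N];
    float v = thread_partial;                    // this thread's local sum
    #pragma unroll
    for (int it = 1; it < 32; it *= 2) {         // stage 1: within the warp
      v += __shfl_xor_sync(uint32_t(-1), v, it);
    }
    if (WARPS_N > 1) {                           // stage 2: across the row's warps
      if (lane == 0) smem[warp_m * WARPS_N + warp_n] = v;
      __syncthreads();
      if (tidx == 0) {
        float row = 0.f;
        for (int it = 0; it < WARPS_N; ++it) row += smem[warp_m * WARPS_N + it];
        smem[warp_m] = row;
      }
      __syncthreads();
      v = smem[warp_m];                          // broadcast back to all threads
    }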
@@ -332,7 +391,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - phi::Store(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize); + phi::Store(x[it], y_ptr + row * ELTS_PER_ROW + col * VecSize); col += THREADS_PER_ROW; } } @@ -390,12 +449,37 @@ void LaunchLayernormResidualDropoutBias( return; } - bool can_call_1024_kernel = false; - if (cols == 1024 && scale != nullptr && layernorm_bias != nullptr && - bias == nullptr) { - can_call_1024_kernel = true; +#define LAUNCH_FUSED_FAST_LN_KERNEL_BASE(cols) \ + case (cols): { \ + constexpr int WARPS_N = cols < 1024 ? 1 : (cols / 1024); \ + constexpr int WARPS_M = 4 / WARPS_N; \ + const int THREADS_PER_WARP = 32; \ + const int BYTES_PER_LDG = 16; \ + const int VecSize = BYTES_PER_LDG / sizeof(T); \ + const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; \ + const int ROWS_PER_CTA = WARPS_M; \ + const int grid = \ + static_cast(std::ceil(rows / static_cast(ROWS_PER_CTA))); \ + fused_fast_ln_fwd_kernel< \ + T, U, LayerNormScaleBiasT, uint8_t, \ + VecSize, WARPS_M, WARPS_N, BYTES_PER_LDG, \ + cols><<>>( \ + rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, \ + increment, epsilon, src, residual, bias, scale, layernorm_bias, \ + mask_data, mean, var, dst, layernorm_dst); \ + } break + +#define LAUNCH_FUSED_FAST_LN_KERNEL \ + LAUNCH_FUSED_FAST_LN_KERNEL_BASE(768); \ + LAUNCH_FUSED_FAST_LN_KERNEL_BASE(1024); \ + LAUNCH_FUSED_FAST_LN_KERNEL_BASE(4096) + + bool can_call_fast_ln_kernel = false; + if ((cols == 768 || cols == 1024 || cols == 4096) && scale != nullptr && + layernorm_bias != nullptr) { + can_call_fast_ln_kernel = true; } - VLOG(6) << "can_call_1024_kernel = " << can_call_1024_kernel; + VLOG(6) << "can_call_fast_ln_kernel = " << can_call_fast_ln_kernel; const int VecSize = MAX_CACHE_BYTES / sizeof(T); if (cols % VecSize != 0) { @@ -407,26 +491,15 @@ void LaunchLayernormResidualDropoutBias( epsilon, src, residual, bias, scale, layernorm_bias, mask_data, dst, layernorm_dst, mean, var); } else { - if (can_call_1024_kernel) { - const int WARPS_M = 4; - const int WARPS_N = 1; - const int THREADS_PER_WARP = 32; - const int BYTES_PER_LDG = 16; - const int VecSize = BYTES_PER_LDG / sizeof(T); - - const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; - const int ROWS_PER_CTA = WARPS_M; - - // Note: the grid can not exceed max_grid of the gpu. 
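The LAUNCH_FUSED_FAST_LN_KERNEL_BASE macro earlier in this hunk derives the whole launch shape from the column count at compile time; a worked instance of its arithmetic for the widest supported case (fp16, 16-byte loads, so 8 elements per LDG):

    constexpr int cols = 4096;
    constexpr int WARPS_N = cols < 1024 ? 1 : cols / 1024;  // 4 warps span a row
    constexpr int WARPS_M = 4 / WARPS_N;                    // 1 row per CTA slice
    constexpr int THREADS_PER_CTA = WARPS_N * 32 * WARPS_M; // 128, as for 768/1024
    constexpr int VecSize = 16 / 2;                         // 8 fp16 per 16B load
    constexpr int LDGS = cols / (WARPS_N * 32 * VecSize);   // 4 vector loads/thread

All three supported widths (768, 1024, 4096) land on 128-thread CTAs; what changes is whether the four warps tile separate rows (WARPS_N = 1, WARPS_M = 4) or one wide row (WARPS_N = 4, WARPS_M = 1), the latter being exactly the case the new shared-memory reduction covers.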
- const int grid = - static_cast(std::ceil(rows / static_cast(ROWS_PER_CTA))); - fused_ln_fwd_1024_kernel< - T, U, LayerNormScaleBiasT, uint8_t, - VecSize, WARPS_M, WARPS_N, - BYTES_PER_LDG><<>>( - rows, cols, seed, dropout_prob, is_upscale_in_train, is_test, - increment, epsilon, src, residual, scale, layernorm_bias, mask_data, - mean, var, dst, layernorm_dst); + if (can_call_fast_ln_kernel) { + switch (cols) { + LAUNCH_FUSED_FAST_LN_KERNEL; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Only when column is equal to 768/1024/4096 is supported for " + "now")); + break; + } } else { int blockDim = GetDesiredBlockDim(cols / VecSize); FusedLayernormResidualDropoutBias< diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index f4a5319a68caa..e38ac9a0ad2da 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -534,6 +534,8 @@ template __global__ void masked_multihead_attention_kernel( Masked_multihead_attention_params params) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + static_assert(Dh % THREADS_PER_KEY == 0, ""); static_assert(Dh % THREADS_PER_VALUE == 0, ""); @@ -821,6 +823,9 @@ __global__ void masked_multihead_attention_kernel( printf("\n"); } #endif +#else + assert(false); +#endif } template diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 5dff5e2225f4f..caceac1228e0a 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -147,6 +147,7 @@ struct TestFusedResidualDropoutBias { dropout_prob, is_upscale_in_train, is_test); } ctx->Wait(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); // add residual for (int i = 0; i < rows; i++) { for (int j = 0; j < cols; j++) { @@ -186,6 +187,7 @@ struct TestFusedResidualDropoutBias { src.data(), residual.data(), bias_ptr, mask.data(), out.data(), *ctx); ctx->Wait(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); } void FusedBackward() { @@ -313,3 +315,20 @@ TEST(FusedDropout, GPUFusedResidualDropoutBiasLargeShape) { test.CheckOut(static_cast(1e-5)); test.CheckGrad(static_cast(1e-3)); } + +TEST(FusedDropout, GPUFusedResidualDropoutBiasLargeShapeFp16) { + // Used to test that `cudaErrorLaunchOutOfResources` will not occur + int rows = 1; + int cols = 12288; + if (std::getenv("_rows") != nullptr) { + rows = atoi(std::getenv("_rows")); + } + if (std::getenv("_cols") != nullptr) { + cols = atoi(std::getenv("_cols")); + } + TestFusedResidualDropoutBias test(rows, cols, 0, 0.0, true, + true); + test.Run(); + test.CheckOut(static_cast(1e-1)); + test.CheckGrad(static_cast(1e-1)); +} diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index 8f98a0b9fbee8..5b499b8985f4f 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -29,18 +29,10 @@ class LayerNormMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT< const dnnl::engine engine, platform::Place cpu_place) : platform::MKLDNNHandlerNoCachingT( engine, cpu_place) { - if (!is_test) { - // TODO(grygielski) Delete forcing stats_md after DNNL 1.2 is introduced - auto stats_md = dnnl::memory::desc( - {begin(dims), end(dims) - 1}, platform::MKLDNNGetDataType(), - 
platform::GetPlainMKLDNNFormat(dims.size() - 1)); - this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, - x->mem_desc(), stats_md, epsilon, - flags); - } else { - this->AcquireForwardPrimitiveDescriptor( - dnnl::prop_kind::forward_inference, x->mem_desc(), epsilon, flags); - } + const auto fwd_prop_kind = is_test ? dnnl::prop_kind::forward_inference + : dnnl::prop_kind::forward_training; + this->AcquireForwardPrimitiveDescriptor(fwd_prop_kind, x->mem_desc(), + epsilon, flags); } std::shared_ptr AcquireScaleShiftMemory(const Tensor* scale, diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index 8225dc8e07d6a..36e54d741a04b 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -12,168 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/optimizers/adam_op.h" #include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/infermeta/multiary.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -class AdamOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const { - auto input_data_type = - OperatorWithKernel::IndicateVarDataType(ctx, "Param"); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); - } - - framework::OpKernelType GetKernelTypeForVar( - const std::string &var_name, const framework::Tensor &tensor, - const framework::OpKernelType &expected_kernel_type) const { - if (var_name == "Beta1Pow" || var_name == "Beta2Pow" || - var_name == "SkipUpdate") { - return expected_kernel_type; - } else { - return framework::OpKernelType(expected_kernel_type.data_type_, - tensor.place(), tensor.layout()); - } - } -}; - -class AdamOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Param", "(Tensor) Input parameter"); - AddInput("Grad", "(Tensor) Input gradient"); - AddInput("LearningRate", "(Tensor) Learning rate"); - AddInput("Moment1", "(Tensor) Input first moment"); - AddInput("Moment2", "(Tensor) Input second moment"); - AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator"); - AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator"); - - AddInput("Beta1Tensor", - "(Tensor, optional) If provided, Adam will use this " - "as beta1, this has a higher priority than attr(beta1), the " - "shape of this tensor MUST BE [1].") - .AsDispensable(); - AddInput("Beta2Tensor", - "(Tensor, optional) If provided, Adam will use this " - "as beta2, this has a higher priority than attr(beta2), the " - "shape of this tensor MUST BE [1].") - .AsDispensable(); - AddInput("EpsilonTensor", - "(Tensor, optional) If provided, Adam will use this " - "as epsilon, this has a higher priority than attr(epsilon), the " - "shape of this tensor MUST BE [1].") - .AsDispensable(); - AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); - AddInput("SkipUpdate", "(Tensor, optional), Skip the update or not.") - .AsDispensable(); - - 
AddOutput("ParamOut", "(Tensor) Output parameter"); - AddOutput("Moment1Out", "(Tensor) Output first moment"); - AddOutput("Moment2Out", "(Tensor) Output second moment"); - AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator"); - AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator"); - AddOutput("MasterParamOut", - "The updated FP32 master weight for AMP. " - "It shared memory with Input(MasterParam).") - .AsDispensable(); - - AddAttr("beta1", - "(float, default 0.9) " - "Exponential decay rate for the " - "first moment estimates.") - .SetDefault(0.9f); - AddAttr("beta2", - "(float, default 0.999) " - "exponential decay rate for the " - "second moment estimates.") - .SetDefault(0.999f); - AddAttr("epsilon", - "(float, default 1.0e-8) " - "Constant for numerical stability") - .SetDefault(1.0e-8f); - AddAttr( - "lazy_mode", - "(bool, default false) " - "only update the parameter that has gradient in sparse update") - .SetDefault(false); - AddAttr("min_row_size_to_use_multithread", - "(int64_t, default 0) " - "when not zero, if param row size is larger then " - "min_row_size_to_use_multithread and " - "inner_op_parallelism is larger then 0, sparse update " - "will run in multithread mode") - .SetDefault(1000); - AddAttr("multi_precision", - "(bool, default false) " - "Whether to use multi-precision during weight updating.") - .SetDefault(false); - // TODO(zhiqiu): We could set Beta1PowOut and Beta2PowOut - // as dispensable since they are not used when use_global_beta_pow is true. - AddAttr("use_global_beta_pow", - "(bool, default false) " - "Whether to use global beta_pow for whole model instead of " - "creating beta_pow for each parameter.") - .SetDefault(false); - - AddComment(R"DOC( -Adam Optimizer. - -This implements the Adam optimizer from Section 2 of the Adam -paper : https://arxiv.org/abs/1412.6980. -Adam is a first-order gradient-based optimization method based on -adaptive estimates of lower-order moments. 
- -Adam updates: - -$$ -moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\ -moment\_2_\out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\ -learning\_rate = learning\_rate * - \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\ -param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon} -$$ - -)DOC"); - } -}; - -class AdamWOp : public AdamOp { - using AdamOp::AdamOp; -}; - -class AdamWOpMaker : public AdamOpMaker { - public: - void Make() { - AdamOpMaker::Make(); - AddAttr("lr_ratio", - "(float, default 1.0) " - "layerwise learning rate decay") - .SetDefault(1.0f); - AddAttr("coeff", - "(float, default 0.01) " - "coeff of the weight decay") - .SetDefault(0.01f); - AddAttr("with_decay", - "(bool, default false) " - "whether to do weight decay") - .SetDefault(false); - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; DECLARE_INFER_SHAPE_FUNCTOR(adam, AdamInferMetaFunctor, @@ -185,14 +30,6 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, AdamInferMetaFunctor); -DECLARE_INFER_SHAPE_FUNCTOR(adamw, AdamwInferMetaFunctor, - PD_INFER_META(phi::AdamwInferMeta)); -REGISTER_OPERATOR( - adamw, ops::AdamWOp, ops::AdamWOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - AdamwInferMetaFunctor); - REGISTER_OP_VERSION(adam) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h new file mode 100644 index 0000000000000..31feaa8102e7a --- /dev/null +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -0,0 +1,149 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
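+//
+// This header pulls the AdamOp and AdamOpMaker definitions out of
+// adam_op.cc so that the AdamW operator (adamw_op.cc, added below) can
+// reuse them through inheritance instead of duplicating them.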
+ +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class AdamOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + auto input_data_type = + OperatorWithKernel::IndicateVarDataType(ctx, "Param"); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const framework::Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + if (var_name == "Beta1Pow" || var_name == "Beta2Pow" || + var_name == "SkipUpdate") { + return expected_kernel_type; + } else { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } + } +}; + +class AdamOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("LearningRate", "(Tensor) Learning rate"); + AddInput("Moment1", "(Tensor) Input first moment"); + AddInput("Moment2", "(Tensor) Input second moment"); + AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator"); + AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator"); + + AddInput("Beta1Tensor", + "(Tensor, optional) If provided, Adam will use this " + "as beta1, this has a higher priority than attr(beta1), the " + "shape of this tensor MUST BE [1].") + .AsDispensable(); + AddInput("Beta2Tensor", + "(Tensor, optional) If provided, Adam will use this " + "as beta2, this has a higher priority than attr(beta2), the " + "shape of this tensor MUST BE [1].") + .AsDispensable(); + AddInput("EpsilonTensor", + "(Tensor, optional) If provided, Adam will use this " + "as epsilon, this has a higher priority than attr(epsilon), the " + "shape of this tensor MUST BE [1].") + .AsDispensable(); + AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); + AddInput("SkipUpdate", "(Tensor, optional), Skip the update or not.") + .AsDispensable(); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("Moment1Out", "(Tensor) Output first moment"); + AddOutput("Moment2Out", "(Tensor) Output second moment"); + AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator"); + AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator"); + AddOutput("MasterParamOut", + "The updated FP32 master weight for AMP. 
" + "It shared memory with Input(MasterParam).") + .AsDispensable(); + + AddAttr("beta1", + "(float, default 0.9) " + "Exponential decay rate for the " + "first moment estimates.") + .SetDefault(0.9f); + AddAttr("beta2", + "(float, default 0.999) " + "exponential decay rate for the " + "second moment estimates.") + .SetDefault(0.999f); + AddAttr("epsilon", + "(float, default 1.0e-8) " + "Constant for numerical stability") + .SetDefault(1.0e-8f); + AddAttr( + "lazy_mode", + "(bool, default false) " + "only update the parameter that has gradient in sparse update") + .SetDefault(false); + AddAttr("min_row_size_to_use_multithread", + "(int64_t, default 0) " + "when not zero, if param row size is larger then " + "min_row_size_to_use_multithread and " + "inner_op_parallelism is larger then 0, sparse update " + "will run in multithread mode") + .SetDefault(1000); + AddAttr("multi_precision", + "(bool, default false) " + "Whether to use multi-precision during weight updating.") + .SetDefault(false); + // TODO(zhiqiu): We could set Beta1PowOut and Beta2PowOut + // as dispensable since they are not used when use_global_beta_pow is true. + AddAttr("use_global_beta_pow", + "(bool, default false) " + "Whether to use global beta_pow for whole model instead of " + "creating beta_pow for each parameter.") + .SetDefault(false); + + AddComment(R"DOC( +Adam Optimizer. + +This implements the Adam optimizer from Section 2 of the Adam +paper : https://arxiv.org/abs/1412.6980. +Adam is a first-order gradient-based optimization method based on +adaptive estimates of lower-order moments. + +Adam updates: + +$$ +moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\ +moment\_2_\out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\ +learning\_rate = learning\_rate * + \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\ +param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon} +$$ + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adamw_op.cc b/paddle/fluid/operators/optimizers/adamw_op.cc new file mode 100644 index 0000000000000..e2670625d4e50 --- /dev/null +++ b/paddle/fluid/operators/optimizers/adamw_op.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/optimizers/adam_op.h" + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" + +namespace paddle { +namespace operators { + +class AdamWOp : public AdamOp { + using AdamOp::AdamOp; +}; + +class AdamWOpMaker : public AdamOpMaker { + public: + void Make() { + AdamOpMaker::Make(); + AddAttr("lr_ratio", + "(float, default 1.0) " + "layerwise learning rate decay") + .SetDefault(1.0f); + AddAttr("coeff", + "(float, default 0.01) " + "coeff of the weight decay") + .SetDefault(0.01f); + AddAttr("with_decay", + "(bool, default false) " + "whether to do weight decay") + .SetDefault(false); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(adamw, AdamwInferMetaFunctor, + PD_INFER_META(phi::AdamwInferMeta)); +REGISTER_OPERATOR( + adamw, ops::AdamWOp, ops::AdamWOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + AdamwInferMetaFunctor); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc index 161483c3420fc..0159e250d317e 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc @@ -100,6 +100,10 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { .AsDispensable(); AddOutput("FP16FusedParamOut", "The updated FP16FusedParam.") .AsDispensable(); + AddOutput("FP32AccFusedGrad", "The accumulated FP32 gradients.") + .AsDispensable(); + AddOutput("FP16AccFusedGrad", "The accumulated FP16 gradients.") + .AsDispensable(); AddOutput("Moment1Out", "The updated Moment1."); AddOutput("Moment2Out", "The updated Moment2."); @@ -110,8 +114,14 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { .AsDuplicable(); AddOutput("FoundInf", "Whether there is NaN/Inf"); + AddOutput("AccStep", "The training steps.").AsDispensable(); + AddOutput("StopUpdate", + "Whether the parameter updating is stopped when the gradient " + "accumulated steps is less than Attr(acc_steps).") + .AsDispensable(); AddOutput("Step", "The global step which excludes the NaN/Inf step."); + AddAttr("acc_steps", "The gradient accumulation steps.").SetDefault(1); AddAttr("beta1", "The initial Beta1Pow value."); AddAttr("beta2", "The initial Beta2Pow value."); AddAttr("epsilon", diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index f445a140f27a3..c857c6de4d093 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -1041,6 +1041,58 @@ static void CheckHasNanInfGrad(const float *fp32_grad, int fp32_numel, } } +template +static __global__ void ElementwiseAddWithCastCUDAKernel(const T1 *x, + const T2 *y, T3 *z, + int n) { + static_assert(sizeof(T1) <= sizeof(T2), + "sizeof(T1) must be smaller than sizeof(T2)."); + using MT = MasterT; + + int i = (threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + int stride = (blockDim.x * gridDim.x) * VecSize; + for (; i + VecSize <= n; i += stride) { + phi::AlignedVector x_vec; + phi::AlignedVector y_vec; + phi::AlignedVector z_vec; + phi::Load(x + i, &x_vec); + phi::Load(y + i, &y_vec); +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + auto x_tmp = 
static_cast(x_vec[j]);
+      auto y_tmp = static_cast(y_vec[j]);
+      z_vec[j] = static_cast(x_tmp + y_tmp);
+    }
+    phi::Store(z_vec, z + i);
+  }
+
+  for (; i < n; ++i) {
+    auto x_tmp = static_cast(x[i]);
+    auto y_tmp = static_cast(y[i]);
+    z[i] = static_cast(x_tmp + y_tmp);
+  }
+}
+
+template
+static void LaunchElementwiseAddWithCastKernel(
+    const platform::CUDADeviceContext &dev_ctx, const T1 *x, const T2 *y, T3 *z,
+    int n, gpuStream_t stream) {
+  int vec_size =
+      std::min(std::min(GetChunkedVecSize(x, 0), GetChunkedVecSize(y, 0)),
+               GetChunkedVecSize(z, 0));
+  auto config = platform::GetGpuLaunchConfig1D(dev_ctx, n, vec_size);
+
+#define PD_LAUNCH_ELEMENTWISE_ADD_WITH_CAST_KERNEL                          \
+  do {                                                                      \
+    ElementwiseAddWithCastCUDAKernel<<<                                     \
+        config.block_per_grid, config.thread_per_block, 0, stream>>>(x, y, z, \
+                                                                     n);    \
+  } while (0)
+
+  PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_ELEMENTWISE_ADD_WITH_CAST_KERNEL);
+#undef PD_LAUNCH_ELEMENTWISE_ADD_WITH_CAST_KERNEL
+}
+
 template
 class DistributedFusedLambOpKernel
     : public framework::OpKernel {
@@ -1051,6 +1103,9 @@ class DistributedFusedLambOpKernel
     auto stream = dev_ctx.stream();
     auto place = dev_ctx.GetPlace();
 
+    auto *found_inf_t = ctx.Output("FoundInf");
+    found_inf_t->Resize({1});
+
     // Step 1: Get fp16 param and grad tensors
     int64_t fp16_numel;
     auto *fp16_param = GetSameInOutTensorPtr(
@@ -1095,6 +1150,128 @@ class DistributedFusedLambOpKernel
                           "Too many parameter number. Only <= %d is supported.",
                           std::numeric_limits::max()));
 
+    auto acc_steps = ctx.Attr("acc_steps");
+    PADDLE_ENFORCE_GE(
+        acc_steps, 1,
+        platform::errors::InvalidArgument(
+            "The gradient accumulation steps should not be less than 1."));
+    if (acc_steps > 1) {
+      auto *step_t = ctx.Output("AccStep");
+      PADDLE_ENFORCE_NOT_NULL(
+          step_t,
+          platform::errors::InvalidArgument(
+              "Output(AccStep) cannot be nullptr when Attr(acc_steps) > 1."));
+      bool is_initialized = step_t->IsInitialized();
+      int64_t *step_ptr;
+      if (is_initialized) {
+        step_ptr = step_t->mutable_data(platform::CPUPlace());
+        ++(*step_ptr);
+      } else {
+        step_t->Resize({1});
+        step_ptr = step_t->mutable_data(platform::CPUPlace());
+        *step_ptr = 1;
+      }
+      int64_t rounded_step = (*step_ptr) % acc_steps;
+
+      float *fp32_acc_grad = nullptr;
+      if (has_fp32_param) {
+        auto *fp32_acc_grad_t =
+            ctx.Output("FP32AccFusedGrad");
+        PADDLE_ENFORCE_NOT_NULL(
+            fp32_acc_grad_t, platform::errors::InvalidArgument(
+                                 "Output(FP32AccFusedGrad) cannot be nullptr "
+                                 "when Attr(acc_steps) > 1."));
+        if (!fp32_acc_grad_t->IsInitialized()) {
+          fp32_acc_grad_t->Resize({static_cast(fp32_numel)});
+          fp32_acc_grad = fp32_acc_grad_t->mutable_data(place);
+        } else {
+          fp32_acc_grad = fp32_acc_grad_t->data();
+        }
+      }
+
+      platform::float16 *fp16_acc_grad = nullptr;
+      float *master_acc_grad = nullptr;
+      if (has_fp16_param) {
+        auto *fp16_acc_grad_t =
+            ctx.Output("FP16AccFusedGrad");
+        PADDLE_ENFORCE_NOT_NULL(
+            fp16_acc_grad_t, platform::errors::InvalidArgument(
+                                 "Output(FP16AccFusedGrad) cannot be nullptr "
+                                 "when Attr(acc_steps) > 1."));
+        if (!fp16_acc_grad_t->IsInitialized()) {
+          fp16_acc_grad_t->Resize({static_cast(3 * fp16_numel)});
+          fp16_acc_grad =
+              fp16_acc_grad_t->mutable_data(place);
+        } else {
+          fp16_acc_grad = fp16_acc_grad_t->data();
+        }
+        master_acc_grad = reinterpret_cast(fp16_acc_grad + fp16_numel);
+      }
+
+      // In-place accumulation of the current gradient into the buffers
+      if (has_fp32_param) {
+        if (rounded_step == 1) {
+          memory::Copy(place, fp32_acc_grad, place, fp32_grad,
+                       fp32_numel * sizeof(float), stream);
+        } else {
+          LaunchElementwiseAddWithCastKernel(dev_ctx, fp32_grad, 
fp32_acc_grad,
+                                             fp32_acc_grad, fp32_numel, stream);
+        }
+      }
+
+      if (has_fp16_param) {
+        if (acc_steps == 2) {
+          if (rounded_step == 0) {
+            LaunchElementwiseAddWithCastKernel(dev_ctx, fp16_acc_grad,
+                                               fp16_grad, fp16_acc_grad,
+                                               fp16_numel, stream);
+          } else {
+            memory::Copy(place, fp16_acc_grad, place, fp16_grad,
+                         fp16_numel * sizeof(platform::float16), stream);
+          }
+        } else {  // acc_steps >= 3
+          if (rounded_step == 0) {
+            LaunchElementwiseAddWithCastKernel(dev_ctx, fp16_grad,
+                                               master_acc_grad, fp16_acc_grad,
+                                               fp16_numel, stream);
+          } else if (rounded_step == 1) {
+            memory::Copy(place, fp16_acc_grad, place, fp16_grad,
+                         fp16_numel * sizeof(platform::float16), stream);
+          } else if (rounded_step == 2) {
+            LaunchElementwiseAddWithCastKernel(dev_ctx, fp16_grad,
+                                               fp16_acc_grad, master_acc_grad,
+                                               fp16_numel, stream);
+          } else {
+            LaunchElementwiseAddWithCastKernel(dev_ctx, fp16_grad,
+                                               master_acc_grad, master_acc_grad,
+                                               fp16_numel, stream);
+          }
+        }
+      }
+
+      auto *stop_update_t = ctx.Output("StopUpdate");
+      stop_update_t->Resize({1});
+      auto *stop_update =
+          stop_update_t->mutable_data(platform::CPUPlace());
+
+      auto *found_inf_cpu =
+          found_inf_t->mutable_data(platform::CPUPlace());
+
+      if (rounded_step != 0) {
+        *stop_update = true;
+        *found_inf_cpu = false;
+        return;
+      } else {
+        // Swap pointers so that this step consumes the accumulated gradients
+        fp32_grad = fp32_acc_grad;
+        fp16_grad = fp16_acc_grad;
+        *stop_update = false;
+        found_inf_t->clear();
+      }
+    }
+
     // Step 3: Get ParamInfo
     const auto *param_info_tensor = GetInputTensorPtr(ctx, "ParamInfo");
     auto fp32_local_start_idx = param_info_tensor[0];
@@ -1122,7 +1299,7 @@ class DistributedFusedLambOpKernel
             << " , fp16_global_param_num = " << fp16_global_param_num;
 
     // Step 4: Get LearningRate, Moment1, Moment2, Beta1Pow, Beta2Pow,
-    // GlobalScale, FoundInf
+    // GlobalScale
     const auto *global_scale = GetInputTensorPtr(ctx, "GlobalScale");
     const auto *lr = GetInputTensorPtr(ctx, "LearningRate");
     int64_t partial_numel = 0;
@@ -1157,8 +1334,6 @@ class DistributedFusedLambOpKernel
     auto *beta2pow =
         GetSameInOutTensorPtr(ctx, place, "Beta2Pow", "Beta2PowOut");
 
-    auto *found_inf_t = ctx.Output("FoundInf");
-    found_inf_t->Resize({1});
     auto *found_inf = found_inf_t->mutable_data(place);
 
     // Step 5: Get attributes weight_decay, beta1, beta2, epsilon,
diff --git a/paddle/fluid/operators/pscore/heter_cloud_comm_cpu_test.cc b/paddle/fluid/operators/pscore/heter_cloud_comm_cpu_test.cc
old mode 100644
new mode 100755
index cf6369eecdf9c..4ffca35ea5694
--- a/paddle/fluid/operators/pscore/heter_cloud_comm_cpu_test.cc
+++ b/paddle/fluid/operators/pscore/heter_cloud_comm_cpu_test.cc
@@ -122,6 +122,7 @@ void TestShardSendRecv(
 void PressTestSendRecv(
     std::shared_ptr heter_client_ptr_) {
   // long l = 0, m = 0;
+  // https://paddlerec.bj.bcebos.com/online_infer/arm_brpc_ubuntu18/send_20_34
   std::ifstream file("/send_20_34", std::ios::in | std::ios::binary);
   // l = file.tellg();
   // file.seekg(0, std::ios::end);
@@ -129,13 +130,13 @@ void PressTestSendRecv(
   // file.close();
   // VLOG(0) << "size of file " << "20_34" << " is " << (m - l) << " bytes.\n";
   int64_t vars_len = 2359296 * sizeof(float);
-  int64_t data_size = vars_len * sizeof(float);
+  int64_t data_size = vars_len;
   VLOG(0) << "float num: " << data_size;
   float* data_ptr = new float[data_size];
   file.read((char*)data_ptr, 9437184);
   VLOG(0) << "send data is: " << data_ptr[0] << ", " << data_ptr[1];
   std::vector var_names{"34"};
-  int loopCnt = 600;
+  int loopCnt = 10000;
   auto send_async = 
[&]() -> void { int i = 0; while (i++ < loopCnt) { @@ -254,8 +255,8 @@ TEST(HETERSENDANDRECV, CPU) { exe.Prepare(program, 0); // solve undefined symbol: tensor_table.cc // TestScopeSendRecv(heter_client_ptr_); - TestShardSendRecv(heter_client_ptr_); - // PressTestSendRecv(heter_client_ptr_); + // TestShardSendRecv(heter_client_ptr_); + PressTestSendRecv(heter_client_ptr_); switch_server_ptr_a->Stop(); LOG(INFO) << "switch server A stopped"; diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu index 16c7a4794bb50..b33859153419c 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu b/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu index f9f015804e11d..037dab396c757 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index ff1ddb4175fef..76641698ead67 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -29,7 +29,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/kernels/cpu/reduce.h" -#if defined(__HIPCC__) || defined(__NVCC__) +#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) #include "paddle/phi/kernels/gpu/reduce.h" #include "paddle/phi/kernels/gpu/reduce_grad.h" #endif @@ -613,7 +613,7 @@ If reduce_all is true, just reduce along all dimensions and output a scalar. 
virtual std::string GetOpType() const = 0;
 };
 
-#if defined(__HIPCC__) || defined(__NVCC__)
+#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__)
 template class ReduceOp,
          template class TransformOp>
 class ReduceCudaKernel : public framework::OpKernel {
@@ -626,9 +626,12 @@ class ReduceCudaKernel : public framework::OpKernel {
     auto pt_out_dtype = paddle::framework::TransToPhiDataType(
         static_cast(out_dtype));
     std::vector dims = context.Attr>("dim");
-
+#ifdef PADDLE_WITH_XPU_KP
+    auto& dev_ctx =
+        context.template device_context();
+#else
     auto& dev_ctx = context.cuda_device_context();
-
+#endif
     if (out_dtype >= 0) {
       output->mutable_data(dev_ctx.GetPlace(), pt_out_dtype);
     } else {
@@ -642,6 +645,7 @@ class ReduceCudaKernel : public framework::OpKernel {
   }
 };
 
+#ifndef PADDLE_WITH_XPU_KP
 template class TransformOp>
 class ReduceCudaGradKernel : public framework::OpKernel {
  public:
@@ -686,6 +690,7 @@ class ReduceCudaGradKernel : public framework::OpKernel {
   }
 };
 #endif
+#endif
 
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc
index 89e3b74bb3aca..eb82389702ca4 100644
--- a/paddle/fluid/platform/device/gpu/gpu_info.cc
+++ b/paddle/fluid/platform/device/gpu/gpu_info.cc
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/flags.h"
 #include "paddle/fluid/platform/lock_guard_ptr.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/monitor.h"
@@ -49,6 +50,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb);
 DECLARE_bool(enable_cublas_tensor_op_math);
 DECLARE_uint64(gpu_memory_limit_mb);
 
+#ifdef PADDLE_WITH_TESTING
+PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, false,
+                            "Whether to print the gpu memory usage message "
+                            "at exit, mainly used for UT and CI.");
+#endif
+
 constexpr static float fraction_reserve_gpu_memory = 0.05f;
 
 USE_GPU_MEM_STAT;
@@ -137,12 +144,31 @@ class RecordedGpuMallocHelper {
     if (NeedRecord()) {
       mtx_.reset(new std::mutex());
     }
+
+#ifdef PADDLE_WITH_TESTING
+    if (FLAGS_enable_gpu_memory_usage_log) {
+      // A fake UPDATE to trigger the construction of memory stat instances,
+      // to make sure that they are destructed after RecordedGpuMallocHelper.
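+      // The flag is exported, so a test run can enable this logging, e.g.
+      // via the environment setting FLAGS_enable_gpu_memory_usage_log=true.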
+ MEMORY_STAT_UPDATE(Reserved, dev_id, 0); + } +#endif } DISABLE_COPY_AND_ASSIGN(RecordedGpuMallocHelper); public: + ~RecordedGpuMallocHelper() { +#ifdef PADDLE_WITH_TESTING + if (FLAGS_enable_gpu_memory_usage_log) { + std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : " + << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) << std::endl; + } +#endif + } + static RecordedGpuMallocHelper *Instance(int dev_id) { + static std::vector> instances_; + std::call_once(once_flag_, [] { int dev_cnt = GetGPUDeviceCount(); instances_.reserve(dev_cnt); @@ -326,14 +352,11 @@ class RecordedGpuMallocHelper { mutable std::unique_ptr mtx_; static std::once_flag once_flag_; - static std::vector> instances_; std::set gpu_ptrs; // just for testing }; // NOLINT std::once_flag RecordedGpuMallocHelper::once_flag_; -std::vector> - RecordedGpuMallocHelper::instances_; gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id, bool malloc_managed_memory) { diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.cc b/paddle/fluid/platform/device/ipu/ipu_backend.cc index 012294d0fff85..0871624a5d749 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.cc +++ b/paddle/fluid/platform/device/ipu/ipu_backend.cc @@ -13,12 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device/ipu/ipu_backend.h" -#include "paddle/fluid/platform/device/ipu/ipu_utils.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/device/ipu/ipu_compiler.h" +#include "paddle/fluid/platform/device/ipu/ipu_executor.h" namespace paddle { namespace platform { @@ -40,7 +38,7 @@ IpuBackend::~IpuBackend() { executor_.reset(); } -void IpuBackend::Compile(Graph* graph, +void IpuBackend::Compile(framework::ir::Graph* graph, const std::vector& feed_list, const std::vector& fetch_list) { VLOG(10) << "enter IpuBackend::Compile"; @@ -63,8 +61,8 @@ void IpuBackend::Compile(Graph* graph, VLOG(10) << "leave IpuBackend::Compile"; } -void IpuBackend::Run(const std::vector& inputs, - const std::vector& outputs, +void IpuBackend::Run(const std::vector& inputs, + const std::vector& outputs, const framework::ExecutionContext& ctx) { timer_->Start(); executor_->Run(inputs, outputs, ctx); @@ -82,7 +80,7 @@ void IpuBackend::Reset() { executor_.reset(); } -void IpuBackend::SetScope(const Scope& scope) { +void IpuBackend::SetScope(const framework::Scope& scope) { scope_ = &scope; executor_->SetScope(&scope); } diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.h b/paddle/fluid/platform/device/ipu/ipu_backend.h index 0578d9face675..1e083e7a3518c 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.h +++ b/paddle/fluid/platform/device/ipu/ipu_backend.h @@ -18,26 +18,25 @@ limitations under the License. 
*/
 
 #include
 #include
 
-#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/device/ipu/ipu_compiler.h"
-#include "paddle/fluid/platform/device/ipu/ipu_device.h"
-#include "paddle/fluid/platform/device/ipu/ipu_executor.h"
 #include "paddle/fluid/platform/device/ipu/ipu_strategy.h"
-#include "paddle/fluid/platform/device/ipu/ipu_utils.h"
-#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/timer.h"
 
+namespace paddle {
+namespace framework {
+class ExecutionContext;
+}  // namespace framework
+}  // namespace paddle
+
 namespace paddle {
 namespace platform {
 namespace ipu {
 
-// IpuBackend is the center of paddle-ipu, its function include:
-// 1. Compile paddle model to popart model
-// 2. Run popart model, inference or training
-// 3. Request and release device
-// 4. Other helper function
+class IpuStrategy;
+class Compiler;
+class Executor;
+
 class IpuBackend {
  public:
  static IpuBackend *GetInstance();
@@ -46,47 +45,46 @@ class IpuBackend {
  IpuBackend();
  ~IpuBackend();
 
-  // what compile does include(call compiler_):
-  // 1. map paddle-op -> poart op
-  // 2. construct popart onnx compute graph
-  void Compile(Graph *graph, const std::vector &feed_list,
+  // What the Compile method does:
+  // Convert paddle ops to popart ops;
+  // Construct a popart graph, which is an onnx compute graph;
+  // Load the graph and weights to ipu.
+  void Compile(framework::ir::Graph *graph,
+               const std::vector &feed_list,
               const std::vector &fetch_list);
 
-  // what run does include:
-  // 1. construct forward onnx graph
-  // 2. graph-level optimization
-  // 3. autodiff
-  void Run(const std::vector &inputs,
-           const std::vector &outputs,
+  // Run the compiled graph on ipu
+  void Run(const std::vector &inputs,
+           const std::vector &outputs,
           const framework::ExecutionContext &ctx);
 
  // Sync weights from IPU while training
  void WeightsToHost();
 
-  // detach IPU manually
+  // Detach IPU manually
  void Detach();
 
-  // reset manually
-  // call it before destruct works
+  // Reset manually.
+  // Call it before destruction.
  void Reset();
 
-  void SetScope(const Scope &scope);
-  const Scope *GetScope() { return scope_; }
+  void SetScope(const framework::Scope &scope);
+  const framework::Scope *GetScope() { return scope_; }
  void SetIpuStrategy(const IpuStrategy &strategy);
  const IpuStrategy *GetIpuStrategy() { return ipu_strategy_; }
 
-  // save compiled model to onnx
+  // Save compiled model to onnx
  void SaveModelProto(const std::string &path);
 
 private:
-  // not own
-  const Scope *scope_ = nullptr;
+  // Not own
+  const framework::Scope *scope_ = nullptr;
  const IpuStrategy *ipu_strategy_ = nullptr;
 
-  // own
+  // Own
  std::unique_ptr compiler_;
  std::unique_ptr executor_;
-  std::unique_ptr timer_;
+  std::unique_ptr timer_;
 
  bool is_compiled_ = false;
diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc
index 7ae3b2303decd..f2a37aae369ec 100644
--- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc
+++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc
@@ -20,12 +20,110 @@
 #include
 
 #include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/platform/device/ipu/ipu_names.h"
+#include "paddle/fluid/platform/device/ipu/ipu_strategy.h"
 #include "paddle/fluid/platform/device/ipu/ipu_utils.h"
 
 namespace paddle {
 namespace platform {
 namespace ipu {
 
+namespace {
+
+struct CustomOpAttrVisitor : public 
boost::static_visitor { + CustomOpAttrVisitor(std::map* attr, + const std::string& attr_name) + : attrs_(attr), attr_name_(attr_name) {} + + mutable std::map* attrs_; + std::string attr_name_; + + void operator()(int v) const { attrs_->emplace(attr_name_, v); } + void operator()(float v) const { attrs_->emplace(attr_name_, v); } + void operator()(const std::string& v) const { + attrs_->emplace(attr_name_, v); + } + void operator()(const std::vector& v) const { + attrs_->emplace(attr_name_, v); + } + void operator()(const std::vector& v) const { + attrs_->emplace(attr_name_, v); + } + void operator()(const std::vector& v) const { + attrs_->emplace(attr_name_, v); + } + void operator()(bool v) const { attrs_->emplace(attr_name_, v); } + void operator()(const std::vector& v) const { + attrs_->emplace(attr_name_, v); + } + void operator()(BlockDesc* desc) const { + PADDLE_THROW(platform::errors::Unavailable( + "Unsupported calling method for `BlockDesc` type when extracting " + "custom operator attributes.")); + } + void operator()(const std::vector& v) const { + PADDLE_THROW(platform::errors::Unavailable( + "Unsupported calling method for `BlockDesc` type when extracting " + "custom operator attributes.")); + } + void operator()(int64_t v) const { attrs_->emplace(attr_name_, v); } + void operator()(const std::vector& v) const { + attrs_->emplace(attr_name_, v); + } + void operator()(const std::vector& v) const { + attrs_->emplace(attr_name_, v); + } + void operator()(boost::blank) const { + PADDLE_THROW(platform::errors::Unavailable( + "Unsupported calling method for `boost::blank` type when extracting " + "custom operator attributes.")); + } +}; + +struct ConstantOpAttrVisitor : public boost::static_visitor { + ConstantOpAttrVisitor(framework::LoDTensor* tensor, VarType::Type dtype) + : tensor_(tensor), dtype_(dtype) {} + + framework::LoDTensor* tensor_; + VarType::Type dtype_; + + void operator()(const std::vector& vec) const { + framework::TensorFromVector(vec, tensor_); + } + void operator()(const std::vector& vec) const { + if (dtype_ == VarType::FP16) { + std::vector vec_fp16; + std::transform(vec.begin(), vec.end(), std::back_inserter(vec_fp16), + [](float f) -> float16 { return float16(f); }); + framework::TensorFromVector(vec_fp16, tensor_); + } else { + framework::TensorFromVector(vec, tensor_); + } + } + void operator()(const std::vector& vec) const { + framework::TensorFromVector(vec, tensor_); + } + void operator()(const std::vector& vec) const { + framework::TensorFromVector(vec, tensor_); + } + void operator()(const std::vector& vec) const { + framework::TensorFromVector(vec, tensor_); + } +#define RAISE_ERROR \ + PADDLE_THROW( \ + platform::errors::InvalidArgument("Constant value must be a vector")) + void operator()(int v) const { RAISE_ERROR; } + void operator()(float v) const { RAISE_ERROR; } + void operator()(const std::string& v) const { RAISE_ERROR; } + void operator()(const std::vector& v) const { RAISE_ERROR; } + void operator()(bool v) const { RAISE_ERROR; } + void operator()(BlockDesc* desc) const { RAISE_ERROR; } + void operator()(const std::vector& v) const { RAISE_ERROR; } + void operator()(int64_t v) const { RAISE_ERROR; } + void operator()(boost::blank) const { RAISE_ERROR; } +#undef RAISE_ERROR +}; + popart::AdamMode AdamModeFromStr(const std::string& str, const bool& use_no_bias_optimizer) { if (str == "adam") { @@ -117,6 +215,34 @@ TO GetCastSigAttrAllowNull(std::string attr, OpDesc* op_desc) { } } +// Helper for adding namescope info +struct NameScopeHelper 
{ + NameScopeHelper(const OpDesc* op, popart::Builder* builder); + + ~NameScopeHelper() { + if (pushed_) { + builder_->popNameScope(); + } + } + + bool pushed_ = false; + popart::Builder* builder_; +}; + +NameScopeHelper::NameScopeHelper(const OpDesc* op, popart::Builder* builder) + : builder_(builder) { + auto op_namescope = BOOST_GET_CONST(std::string, op->GetAttr(sOpNamescope)); + if (op_namescope.empty() || op_namescope == "/") { + return; + } + op_namescope.pop_back(); + op_namescope.erase(op_namescope.begin()); + builder->pushNameScope(op_namescope); + pushed_ = true; +} + +} // namespace + GraphHelper::GraphHelper(const Graph* g) { graph = g; sorted_ops = framework::ir::TopologySortOperations(*g); @@ -181,17 +307,12 @@ void Compiler::RegisterOpFunc() { auto op_type = op_desc->Type(); \ VLOG(10) << "build op:" << op_type << " args " << #Args; \ auto inputs = GetOpInputs(op_desc); \ - auto output_names = GetOpOutputs(op_desc); \ auto debug_context = BuildDebugContext(op_desc); \ auto aiGraphcoreOpset = builder_->aiGraphcoreOpset1(); \ auto aiOnnxOpset = builder_->aiOnnxOpset11(); \ - PushNameScope(op_desc); \ + NameScopeHelper ns_helper(op_desc, builder_.get()); \ auto output_ids = OnnxImpl(inputs Args, debug_context); \ - PopNameScope(op_desc); \ - SetIpuIndexStage(output_ids, op_desc); \ - SetAMPAttributes(output_ids, op_desc); \ - SetSerializeAttributes(output_ids, op_desc); \ - InsertTensors(output_names, output_ids); \ + PostLower(output_ids, op_desc); \ }}, // NOLINT #include "paddle/fluid/platform/device/ipu/supported_ops_autogen.h" #include "paddle/fluid/platform/device/ipu/supported_ops_custom.h" @@ -222,7 +343,7 @@ void Compiler::InitInputs(const std::vector& feed_list) { auto* node = graph_helper_->vars_name_map[feed_name]; auto* var_desc = node->Var(); VLOG(10) << "feed_name= " << var_desc->Name(); - auto data_type = VarType2PopartType(var_desc->GetDataType()); + auto data_type = VarType2PopartDType(var_desc->GetDataType()); popart::TensorInfo input_info{data_type, var_desc->GetShape()}; VLOG(10) << "popart input_info = " << input_info; popart::TensorId tensor_id = @@ -258,8 +379,9 @@ void Compiler::LowerConstants(const Scope* scope) { auto shape = BOOST_GET_CONST(std::vector, op_desc->GetAttr("dims")); auto dtype_ = BOOST_GET_CONST(int, op_desc->GetAttr("dtype")); - auto dtype = PopartType2VarType(OnnxDtype2PopartType(dtype_)); - auto tensor_name = op_desc->Output("__outputs__")[0]; + auto dtype = PopartDType2VarType( + OnnxDType2PopartType(static_cast(dtype_))); + auto tensor_name = GetOpOutputs(op_desc).front(); auto* var = kid_scope.Var(tensor_name); VLOG(10) << "lowering constant: " << tensor_name; auto* tensor = var->GetMutable(); @@ -270,13 +392,12 @@ void Compiler::LowerConstants(const Scope* scope) { tensor->Resize(ddim); auto const_data = std::unique_ptr(); - popart::TensorInfo tensor_info(PdDataType2PopartType(tensor->dtype()), + popart::TensorInfo tensor_info(PhiDType2PopartDType(tensor->dtype()), shape); const_data.reset(new popart::ConstVoidData(tensor->data(), tensor_info)); - PushNameScope(op_desc); + NameScopeHelper ns_helper(op_desc, builder_.get()); popart::TensorId result = builder_->aiOnnxOpset11().constant(*const_data); - PopNameScope(op_desc); - SetIpuIndexStage(result, op_desc); + PostLower(result, op_desc); resources_->tensors.emplace(tensor_name, result); } } @@ -285,42 +406,42 @@ void Compiler::LowerConstants(const Scope* scope) { void Compiler::LowerWeights(const Scope* scope) { VLOG(10) << "enter Compiler::LowerWeights"; - // at this step, the 
graph doesn't contains optimizer related states
+  // At this step, the graph doesn't contain optimizer-related states
   for (auto id : graph_helper_->sorted_vars_id) {
     auto* node = graph_helper_->nodes_id_map[id];
-    if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
-      if (node->Var()->Persistable() && node->inputs.empty()) {
-        auto var_name = node->Var()->Name();
-        if (resources_->tensors.count(var_name) != 0) {
-          VLOG(10) << "found existed one, skip lowering Weight: " << var_name;
-          continue;
-        }
-        if (var_name.rfind("learning_rate", 0) == 0) {
-          VLOG(10) << "skip learning_rate_var: " << var_name;
-          continue;
-        }
-        VLOG(10) << "lowering weight: " << var_name;
-
-        auto var = scope->FindVar(var_name);
-        if (var) {
-          auto tensor = var->Get();
-          auto dtype = PdDataType2PopartType(tensor.dtype());
-          auto shape = std::vector();
-          for (size_t i = 0; i < tensor.dims().size(); ++i) {
-            shape.push_back(tensor.dims().at(i));
-          }
-          popart::TensorInfo tensor_info(dtype, shape);
-          popart::ConstVoidData const_data{tensor.data(), tensor_info};
-          if (!node->outputs.empty()) {
-            auto op_node = node->outputs[0];
-            PushNameScope(op_node->Op());
-            popart::TensorId result =
-                builder_->addInitializedInputTensor(const_data, var_name);
-            PopNameScope(op_node->Op());
-            resources_->tensors.emplace(var_name, result);
-            resources_->weights.push_back(var_name);
-          }
-        }
+    // Weights are persistable var nodes
+    if (node->IsVar() && !node->IsCtrlVar() && node->Var() &&
+        node->Var()->Persistable()) {
+      // In training mode, weights must be Parameters
+      if (ipu_strategy_->is_training && !node->Var()->IsParameter()) {
+        continue;
+      }
+      auto var_name = node->Var()->Name();
+      // Some ops have the same input and output tensor, like batchnorm
+      if (resources_->tensors.count(var_name) != 0) {
+        VLOG(10) << "found existing one, skip lowering weight: " << var_name;
+        continue;
+      }
+      VLOG(10) << "lowering weight: " << var_name;
+      auto var = scope->FindVar(var_name);
+      PADDLE_ENFORCE_NOT_NULL(
+          var, platform::errors::NotFound("Tensor %s is not found in the scope",
+                                          var_name));
+      auto tensor = var->Get();
+      auto dtype = PhiDType2PopartDType(tensor.dtype());
+      auto shape = std::vector();
+      for (size_t i = 0; i < tensor.dims().size(); ++i) {
+        shape.push_back(tensor.dims().at(i));
+      }
+      popart::TensorInfo tensor_info(dtype, shape);
+      popart::ConstVoidData const_data{tensor.data(), tensor_info};
+      if (!node->outputs.empty()) {
+        auto op_node = node->outputs[0];
+        NameScopeHelper ns_helper(op_node->Op(), builder_.get());
+        popart::TensorId result =
+            builder_->addInitializedInputTensor(const_data, var_name);
+        resources_->tensors.emplace(var_name, result);
+        resources_->weights.push_back(var_name);
       }
     }
   }
@@ -340,12 +461,9 @@ void Compiler::LowerBody() {
       // pass
     } else if (op_type == "popart_checkpointoutput") {
       auto inputs = GetOpInputs(op_desc);
-      auto outputs = GetOpOutputs(op_desc);
-      PushNameScope(op_desc);
+      NameScopeHelper ns_helper(op_desc, builder_.get());
       auto output_ids = builder_->checkpointOutput(inputs);
-      PopNameScope(op_desc);
-      SetIpuIndexStage(output_ids, op_desc);
-      InsertTensors(outputs, output_ids);
+      PostLower(output_ids, op_desc);
     } else if (op_type == "popart_custom_op") {
       auto inputs = GetOpInputs(op_desc);
       auto outputs = GetOpOutputs(op_desc);
@@ -359,26 +477,21 @@ void Compiler::LowerBody() {
           BOOST_GET_CONST(std::string, op_desc->GetAttr("__op_type"));
       VLOG(10) << "Build graph from custom op: " << __op_type;
       auto it = custom_ops_.find(__op_type);
-      PushNameScope(op_desc);
+      NameScopeHelper ns_helper(op_desc, 
builder_.get());
       auto output_ids =
           builder_->customOp(it->second.popart_op, it->second.popart_op.version,
                              inputs, outputs.size(), attributes, debug_context);
-      PopNameScope(op_desc);
-      SetIpuIndexStage(output_ids, op_desc);
-      InsertTensors(outputs, output_ids);
+      PostLower(output_ids, op_desc);
     } else if (op_type == "popart_printtensor") {
       auto inputs = GetOpInputs(op_desc);
-      auto outputs = GetOpOutputs(op_desc);
       auto debug_context = BuildDebugContext(op_desc);
       auto print_gradient =
           BOOST_GET_CONST(int64_t, op_desc->GetAttr("print_gradient"));
       auto title = BOOST_GET_CONST(std::string, op_desc->GetAttr("title"));
-      PushNameScope(op_desc);
+      NameScopeHelper ns_helper(op_desc, builder_.get());
       auto output_ids = builder_->aiGraphcoreOpset1().printtensor(
           inputs, print_gradient, debug_context, title);
-      PopNameScope(op_desc);
-      SetIpuIndexStage(output_ids, op_desc);
-      InsertTensors(outputs, output_ids);
+      PostLower(output_ids, op_desc);
     } else {
       auto itr = name_function_.find(op_type);
       if (itr != name_function_.end()) {
@@ -608,29 +721,13 @@ void Compiler::LowerOptimizer(const Scope* scope) {
   }
 }
 
-void Compiler::InsertTensors(const std::vector& output_names,
-                             const std::vector& tensor_ids) {
-  PADDLE_ENFORCE_EQ(output_names.size(), tensor_ids.size(),
-                    platform::errors::Fatal("InsertTensors size mismatch"));
-  for (int i = 0; i < tensor_ids.size(); i++) {
-    std::string tensor_id = tensor_ids[i];
-    resources_->tensors.emplace(output_names[i], tensor_ids[i]);
-  }
-}
-
-void Compiler::InsertTensors(const std::vector& output_names,
-                             const std::string& tensor_id) {
-  PADDLE_ENFORCE_EQ(output_names.size(), 1,
-                    platform::errors::Fatal("InsertTensors size mismatch"));
-  resources_->tensors.emplace(output_names[0], tensor_id);
-}
-
-void Compiler::SetIpuIndexStage(const std::vector& tensor_ids,
-                                const OpDesc* op_desc) {
-  VLOG(10) << "enter Compiler::SetIpuIndexStage";
+void Compiler::PostLower(const std::vector& tensor_ids,
+                         const OpDesc* op_desc) {
+  // Set pipeline
+  // Due to the limitation of popart, if an op has multiple outputs,
+  // pipeline settings need to be set at the same time
   auto tensor_ids_set =
       std::set(tensor_ids.begin(), tensor_ids.end());
-
  if (op_desc->HasAttr(sIpuIndexAttr)) {
    auto ipu_index = BOOST_GET_CONST(int, op_desc->GetAttr(sIpuIndexAttr));
    builder_->virtualGraph(tensor_ids_set, ipu_index);
@@ -639,18 +736,37 @@ void Compiler::SetIpuIndexStage(const std::vector& tensor_ids,
  if (op_desc->HasAttr(sIpuStageAttr)) {
    auto ipu_stage = BOOST_GET_CONST(int, op_desc->GetAttr(sIpuStageAttr));
    builder_->pipelineStage(tensor_ids_set, ipu_stage);
-    VLOG(10) << "set " << sIpuStageAttr << "= " << ipu_stage
+    VLOG(10) << "set " << sIpuStageAttr << " = " << ipu_stage
             << " for op: " << op_desc->Type();
  }
  }
-  VLOG(10) << "leave Compiler::SetIpuIndexStage";
+  // Record output tensors
+  auto pd_outs = GetOpOutputs(op_desc);
+  PADDLE_ENFORCE_EQ(
+      pd_outs.size(), tensor_ids.size(),
+      platform::errors::Fatal("paddle and popart op have different outputs"));
+  for (int i = 0; i < tensor_ids.size(); ++i) {
+    resources_->tensors.emplace(pd_outs[i], tensor_ids[i]);
+  }
+  for (auto& tensor_id : tensor_ids) {
+    PostLower(tensor_id, op_desc, true);
+  }
 }
 
-void Compiler::SetIpuIndexStage(const std::string& tensor_id,
-                                const OpDesc* op_desc) {
-  VLOG(10) << "enter Compiler::SetIpuIndexStage";
+void Compiler::PostLower(const std::string& tensor_id, const OpDesc* op_desc) {
+  // Record output tensor
+  auto pd_outs = GetOpOutputs(op_desc);
+  PADDLE_ENFORCE_EQ(
+      pd_outs.size(), 1,
+      
platform::errors::Fatal("paddle and popart op have different outputs")); + resources_->tensors.emplace(pd_outs[0], tensor_id); + PostLower(tensor_id, op_desc, false); +} - if (op_desc->HasAttr(sIpuIndexAttr)) { +void Compiler::PostLower(const std::string& tensor_id, const OpDesc* op_desc, + bool skip_pipline) { + // Set pipline + if (!skip_pipline && op_desc->HasAttr(sIpuIndexAttr)) { auto ipu_index = BOOST_GET_CONST(int, op_desc->GetAttr(sIpuIndexAttr)); builder_->virtualGraph(tensor_id, ipu_index); VLOG(10) << "set " << sIpuIndexAttr << " = " << ipu_index @@ -658,32 +774,18 @@ void Compiler::SetIpuIndexStage(const std::string& tensor_id, if (op_desc->HasAttr(sIpuStageAttr)) { auto ipu_stage = BOOST_GET_CONST(int, op_desc->GetAttr(sIpuStageAttr)); builder_->pipelineStage(tensor_id, ipu_stage); - VLOG(10) << "set " << sIpuStageAttr << "= " << ipu_stage + VLOG(10) << "set " << sIpuStageAttr << " = " << ipu_stage << " for op: " << op_desc->Type(); } } - VLOG(10) << "leave Compiler::SetIpuIndexStage"; -} - -void Compiler::SetAMPAttributes(const std::vector& tensor_ids, - const OpDesc* op_desc) { - if (op_desc->Type() == "popart_matmul") { - for (const auto& tensor_id : tensor_ids) { - SetAMPAttributes(tensor_id, op_desc); - } - } -} - -void Compiler::SetAMPAttributes(const std::string& tensor_id, - const OpDesc* op_desc) { - VLOG(10) << "enter Compiler::SetAMPAttributes"; + // Set amp if (op_desc->Type() == "popart_matmul") { if (set_amp_for_all_) { auto amp = ipu_strategy_->available_memory_proportion; if (amp < 0.0f || amp > 1.0) { PADDLE_THROW(platform::errors::InvalidArgument( - "AvailableMemoryProportion %f is invalid, which should be set 0 <= " - "amp <= 1", + "AvailableMemoryProportion %f is invalid, which should be in " + "range [0.0, 1.0]", amp)); } if (amp > 0.0f) { @@ -694,8 +796,8 @@ void Compiler::SetAMPAttributes(const std::string& tensor_id, auto amp = BOOST_GET_CONST(float, op_desc->GetAttr(sAvailMemAttribute)); if (amp < 0.0f || amp > 1.0) { PADDLE_THROW(platform::errors::InvalidArgument( - "AvailableMemoryProportion %f is invalid, which should be set 0 " - "<= amp <= 1", + "AvailableMemoryProportion %f is invalid, which should be in " + "range [0.0, 1.0]", amp)); } if (amp > 0.0f) { @@ -705,17 +807,7 @@ void Compiler::SetAMPAttributes(const std::string& tensor_id, } } } - } - VLOG(10) << "leave Compiler::SetAMPAttributes"; -} - -void Compiler::SetSerializeAttributes( - const std::vector& tensor_ids, const OpDesc* op_desc) { - VLOG(10) << "enter Compiler::SetSerializeAttributes"; - auto tensor_ids_set = - std::set(tensor_ids.begin(), tensor_ids.end()); - - if (op_desc->Type() == "popart_matmul") { + // Set serialize matmul if (op_desc->HasAttr(sMatmulSerializeFactor)) { auto factor = BOOST_GET_CONST(int, op_desc->GetAttr(sMatmulSerializeFactor)); @@ -724,16 +816,9 @@ void Compiler::SetSerializeAttributes( mode = BOOST_GET_CONST(std::string, op_desc->GetAttr(sMatmulSerializeMode)); } - builder_->setSerializeMatMul(tensor_ids_set, mode, (int64_t)factor, true); + builder_->setSerializeMatMul({tensor_id}, mode, factor, true); } } - VLOG(10) << "leave Compiler::SetSerializeAttributes"; -} - -void Compiler::SetSerializeAttributes(const std::string& tensor_id, - const OpDesc* op_desc) { - std::vector tensor_ids = {tensor_id}; - SetSerializeAttributes(tensor_ids, op_desc); } void Compiler::SetCustomOps( @@ -749,13 +834,7 @@ std::string Compiler::GetFP16ModelProto() { return graph_transformer.getModelProto(); } -std::string Compiler::GetModelProto() { - if (ipu_strategy_->enable_fp16) 
{ - return GetFP16ModelProto(); - } else { - return builder_->getModelProto(); - } -} +std::string Compiler::GetModelProto() { return builder_->getModelProto(); } void Compiler::SaveModelProto(const std::string& path) { builder_->saveModelProto(path); @@ -793,29 +872,6 @@ popart::DebugContext Compiler::BuildDebugContext(const OpDesc* op) { return popart::DebugContext(op_identify_id); } -void Compiler::PushNameScope(const OpDesc* op) { - auto op_namescope = BOOST_GET_CONST(std::string, op->GetAttr(sOpNamescope)); - if (op_namescope == "/") { - return; - } - if (!op_namescope.empty()) { - op_namescope.pop_back(); - } - if (!op_namescope.empty()) { - op_namescope.erase(op_namescope.begin()); - } - VLOG(10) << "name_scope is: " << op_namescope; - builder_->pushNameScope(op_namescope); -} - -void Compiler::PopNameScope(const OpDesc* op) { - auto op_namescope = BOOST_GET_CONST(std::string, op->GetAttr(sOpNamescope)); - if (op_namescope == "/") { - return; - } - builder_->popNameScope(); -} - } // namespace ipu } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.h b/paddle/fluid/platform/device/ipu/ipu_compiler.h index 2d00970bf1297..6f4e602af82df 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.h +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.h @@ -17,16 +17,15 @@ #include #include #include -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/device/ipu/ipu_names.h" -#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" + #include "paddle/fluid/platform/device/ipu/ipu_utils.h" namespace paddle { namespace platform { namespace ipu { +class IpuStrategy; + struct CompilerResources { // popart input tensor_ids std::vector inputs; @@ -70,7 +69,7 @@ struct CompilerResources { std::unique_ptr optimizer; }; -// helper for lowering graph +// Helper for lowering graph struct GraphHelper { explicit GraphHelper(const Graph *); @@ -114,23 +113,9 @@ class Compiler { const std::vector &GetOpOutputs(const OpDesc *op); const std::string GetNameScope(const OpDesc *op); popart::DebugContext BuildDebugContext(const OpDesc *op); - - void InsertTensors(const std::vector &output_names, - const std::vector &tensor_ids); - void InsertTensors(const std::vector &output_names, - const std::string &tensor_id); - void SetIpuIndexStage(const std::vector &tensor_ids, - const OpDesc *op_desc); - void SetIpuIndexStage(const std::string &tensor_id, const OpDesc *op_desc); - void SetAMPAttributes(const std::vector &tensor_ids, - const OpDesc *op_desc); - void SetAMPAttributes(const std::string &tensor_id, const OpDesc *op_desc); - void SetSerializeAttributes(const std::vector &tensor_ids, - const OpDesc *op_desc); - void SetSerializeAttributes(const std::string &tensor_id, - const OpDesc *op_desc); - void PushNameScope(const OpDesc *op); - void PopNameScope(const OpDesc *op); + void PostLower(const std::vector &, const OpDesc *); + void PostLower(const std::string &, const OpDesc *); + void PostLower(const std::string &, const OpDesc *, bool); private: std::unique_ptr builder_; diff --git a/paddle/fluid/platform/device/ipu/ipu_device.cc b/paddle/fluid/platform/device/ipu/ipu_device.cc index 2459f5140eb5b..b7a83b2ef1a61 100644 --- a/paddle/fluid/platform/device/ipu/ipu_device.cc +++ b/paddle/fluid/platform/device/ipu/ipu_device.cc @@ -13,14 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/platform/device/ipu/ipu_device.h" + +#include + #include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { namespace ipu { -// TODO(alleng) merge with ipu_utils -static bool GetBoolEnv(std::string str) { +namespace { +const bool GetBoolEnv(const std::string& str) { char* str_val = getenv(str.c_str()); if (str_val == NULL) { return false; @@ -32,6 +35,7 @@ static bool GetBoolEnv(std::string str) { return val; } } +} // namespace int GetNumDevices() { bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); diff --git a/paddle/fluid/platform/device/ipu/ipu_device.h b/paddle/fluid/platform/device/ipu/ipu_device.h index d39feffc92655..c6876c032c8e4 100644 --- a/paddle/fluid/platform/device/ipu/ipu_device.h +++ b/paddle/fluid/platform/device/ipu/ipu_device.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include +#include namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index 649b291244110..b020e4f219743 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -14,12 +14,80 @@ limitations under the License. */ #include "paddle/fluid/platform/device/ipu/ipu_executor.h" -using float16 = paddle::platform::float16; +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/device/ipu/ipu_compiler.h" +#include "paddle/fluid/platform/device/ipu/ipu_names.h" +#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" namespace paddle { namespace platform { namespace ipu { +namespace { + +// Get paddle prefix and popart postfix of weight states +// Format: {popart_postfix, paddle_prefix} +std::vector> GetOptPrePostfix( + const std::string &opt_type) { + std::vector> pre_post_fix; + // Weight self + pre_post_fix.push_back(std::make_pair("", "")); + + // Weight states + // TODO(alleng) support pair("Accl1___", "_moment1_{id!=0}") + if (opt_type == "adam" || opt_type == "lamb" || opt_type == "adamw") { + pre_post_fix.push_back(std::make_pair("Accl1___", "_moment1_0")); + pre_post_fix.push_back(std::make_pair("Accl2___", "_moment2_0")); + pre_post_fix.push_back(std::make_pair("Step___", "_beta1_pow_acc_0")); + } else if (opt_type == "momentum") { + pre_post_fix.push_back(std::make_pair("Accl___", "_velocity_0")); + } else if (opt_type == "adamax") { + pre_post_fix.push_back(std::make_pair("Accl1___", "_moment_0")); + pre_post_fix.push_back(std::make_pair("Accl2___", "_inf_norm__0")); + pre_post_fix.push_back(std::make_pair("Step___", "_beta1_pow_acc_0")); + } else if (opt_type == "adagrad") { + pre_post_fix.push_back(std::make_pair("Accl1___", "_moment_0")); + } else if (opt_type == "adadelta") { + pre_post_fix.push_back(std::make_pair("Accl1___", "__avg_squared_grad_0")); + pre_post_fix.push_back( + std::make_pair("Accl2___", "__avg_squared_update_0")); + } else if (opt_type == "rmsprop") { + pre_post_fix.push_back(std::make_pair("Accl1___", "_mean_square_0")); + pre_post_fix.push_back(std::make_pair("Accl2___", "_mean_grad_0")); + pre_post_fix.push_back(std::make_pair("Accl3___", "_momentum__0")); + } + return pre_post_fix; +} + +class PdIArray final : public popart::IArray { + public: + explicit PdIArray(const Tensor *tensor) { + tensor_.ShareDataWith(*tensor); + for (int i = 0; i < tensor->dims().size(); ++i) { + shape_.push_back(tensor->dims().at(i)); + } + } + + public: + void *data() { return tensor_.data(); } + popart::DataType dataType() const { + return 
PhiDType2PopartDType(tensor_.dtype()); + } + std::size_t rank() const { return tensor_.dims().size(); } + int64_t dim(size_t index) const { return tensor_.dims().at(index); } + std::size_t nelms() const { + return std::accumulate(shape_.begin(), shape_.end(), + static_cast(1), std::multiplies()); + } + const popart::Shape shape() const { return shape_; } + + private: + Tensor tensor_; + std::vector shape_; +}; + +} // namespace + Executor::~Executor() { Detach(); session_.reset(); @@ -76,15 +144,15 @@ void Executor::Run(const std::vector &inputs, VLOG(10) << "enter Executor::Run"; // inputs std::map popart_inputs; - std::map input_wrappers; + std::map input_wrappers; for (size_t i = 0; i < inputs.size(); i++) { auto tensor_id = compiler_resources_->inputs[i]; - input_wrappers.emplace(tensor_id, PaddleIArray(inputs[i])); + input_wrappers.emplace(tensor_id, PdIArray(inputs[i])); popart_inputs.emplace(tensor_id, input_wrappers.at(tensor_id)); } // anchors std::map popart_anchors; - std::map anchor_wrappers; + std::map anchor_wrappers; for (size_t i = 0; i < outputs.size(); i++) { auto tensor_id = compiler_resources_->outputs[i]; // get dims & dtype from session @@ -106,10 +174,10 @@ void Executor::Run(const std::vector &inputs, auto *tensor = outputs[i]; tensor->Resize(phi::make_ddim(output_shape)); auto fetch_dtype = fetch_info.dataType(); - auto paddle_type = PopartType2VarType(fetch_dtype); + auto paddle_type = PopartDType2VarType(fetch_dtype); tensor->mutable_data(ctx.GetPlace(), framework::TransToPhiDataType(paddle_type)); - anchor_wrappers.emplace(tensor_id, PaddleIArray(tensor)); + anchor_wrappers.emplace(tensor_id, PdIArray(tensor)); popart_anchors.emplace(tensor_id, anchor_wrappers.at(tensor_id)); } VLOG(10) << "Prepared inputs/anchors"; @@ -169,16 +237,16 @@ void Executor::AcquireDevice() { device_ = popart::DeviceManager::createDeviceManager().acquireDeviceById( device_id); PADDLE_ENFORCE_NOT_NULL( - device_, platform::errors::Unavailable( - "Can't attach IPU in distribution, ipu_num = %d.", - RequestIpus(ipu_strategy_->num_ipus))); + device_, + errors::Unavailable("Can't attach IPU in distribution, ipu_num = %d.", + RequestIpus(ipu_strategy_->num_ipus))); } else { device_ = popart::DeviceManager::createDeviceManager().acquireAvailableDevice( RequestIpus(ipu_strategy_->num_ipus)); - PADDLE_ENFORCE_NOT_NULL(device_, platform::errors::Unavailable( - "Can't attach IPU, ipu_num = %d.", - RequestIpus(ipu_strategy_->num_ipus))); + PADDLE_ENFORCE_NOT_NULL( + device_, errors::Unavailable("Can't attach IPU, ipu_num = %d.", + RequestIpus(ipu_strategy_->num_ipus))); } VLOG(10) << "leave Executor::AcquireDevice"; } @@ -226,13 +294,13 @@ void Executor::SetWeightsIO() { void Executor::ConvertWeights(bool align_to_popart) { for (auto weight_pair : executor_resources_->weights_and_opt_state) { auto paddle_var = scope_->GetVar(weight_pair.second); - auto paddle_var_dtype = PdDataType2PopartType( + auto paddle_var_dtype = PhiDType2PopartDType( paddle_var->GetMutable()->dtype()); PADDLE_ENFORCE_EQ((paddle_var_dtype == popart::DataType::FLOAT || paddle_var_dtype == popart::DataType::FLOAT16), true, - platform::errors::InvalidArgument( + errors::InvalidArgument( "Currently, we only support FLOAT16 and FLOAT with " "Paddle, but received type is %s.", paddle_var_dtype)); @@ -242,7 +310,7 @@ void Executor::ConvertWeights(bool align_to_popart) { PADDLE_ENFORCE_EQ((popart_var_dtype == popart::DataType::FLOAT || popart_var_dtype == popart::DataType::FLOAT16), true, - platform::errors::InvalidArgument( + 
errors::InvalidArgument( "Currently, we only support FLOAT16 and FLOAT with " "popart, but received type is %s.", popart_var_dtype)); @@ -276,8 +344,8 @@ void Executor::ConvertWeights(bool align_to_popart) { num_elem * sizeof(float)); } } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Convert Paddle FLOAT16 to popart FLOAT")); + PADDLE_THROW( + errors::Unimplemented("Convert Paddle FLOAT16 to popart FLOAT")); } } } diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.h b/paddle/fluid/platform/device/ipu/ipu_executor.h index c59e623ab20b0..c03a52a77a9d7 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.h +++ b/paddle/fluid/platform/device/ipu/ipu_executor.h @@ -22,17 +22,21 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/device/ipu/ipu_compiler.h" -#include "paddle/fluid/platform/device/ipu/ipu_names.h" -#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" #include "paddle/fluid/platform/device/ipu/ipu_utils.h" +namespace paddle { +namespace framework { +class ExecutionContext; +} // namespace framework +} // namespace paddle + namespace paddle { namespace platform { namespace ipu { +struct CompilerResources; +class IpuStrategy; + struct ExecutorResources { // map popart::WeightsIO weights_io; @@ -45,18 +49,18 @@ class Executor { Executor() = default; ~Executor(); - // build popart session + // Build popart session void Prepare(const std::string &proto); - // run popart session + // Run popart session void Run(const std::vector<const Tensor *> &inputs, const std::vector<Tensor *> &outputs, const framework::ExecutionContext &ctx); - // sync weights from popart to paddle + // Sync weights from popart to paddle void WeightsToHost(); - // detach IPU + // Detach IPU void Detach(); // Scope @@ -83,16 +87,16 @@ class Executor { void WeightsToPaddle(); private: - // not own + // Not own const Scope *scope_ = nullptr; const IpuStrategy *ipu_strategy_ = nullptr; CompilerResources *compiler_resources_ = nullptr; - // deviceinfo for popart session + // DeviceInfo for popart session std::shared_ptr<popart::DeviceInfo> device_; - // popart session, where graph running + // Popart session, where the graph runs std::unique_ptr<popart::Session> session_; - // one OneSession means a graph + // An ExecutorResources corresponds to a graph std::unique_ptr<ExecutorResources> executor_resources_; }; diff --git a/paddle/fluid/platform/device/ipu/ipu_info.cc b/paddle/fluid/platform/device/ipu/ipu_info.cc index 9e6951c37139d..749628ffac452 100644 --- a/paddle/fluid/platform/device/ipu/ipu_info.cc +++ b/paddle/fluid/platform/device/ipu/ipu_info.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. 
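For context on the GetOptPrePostfix table introduced above: each entry pairs a popart-side name prefix with a paddle-side name postfix for one optimizer state, and both are applied to the same base weight name when weights are synced. A minimal, self-contained sketch of that expansion; the weight name "fc_0.w_0" and the main() driver are illustrative only, not part of this patch:

#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main() {
  // The adam subset of the table from GetOptPrePostfix("adam").
  std::vector<std::pair<std::string, std::string>> pre_post_fix = {
      {"", ""},                         // the weight itself
      {"Accl1___", "_moment1_0"},       // adam first moment
      {"Accl2___", "_moment2_0"},       // adam second moment
      {"Step___", "_beta1_pow_acc_0"},  // adam step accumulator
  };
  const std::string paddle_weight = "fc_0.w_0";
  for (const auto &pair : pre_post_fix) {
    // popart tensor id on the left, paddle variable name on the right.
    std::cout << pair.first + paddle_weight << " <-> "
              << paddle_weight + pair.second << "\n";
  }
  return 0;
}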
*/ #include "paddle/fluid/platform/device/ipu/ipu_info.h" + #include "paddle/fluid/platform/device/ipu/ipu_device.h" namespace paddle { diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index f52499a8d8fda..aff5498243000 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -64,7 +64,6 @@ IpuStrategy::IpuStrategy() { ADD_BOOL_OPTION(is_training); ADD_BOOL_OPTION(need_avg_shard); ADD_BOOL_OPTION(enable_fp16); - ADD_BOOL_OPTION(transfer_cast_op); ADD_BOOL_OPTION(use_no_bias_optimizer); ADD_BOOL_OPTION(enable_distribution); ADD_BOOL_OPTION(scaled_optimizer_state); @@ -316,8 +315,10 @@ IpuStrategy::IpuStrategy() { RegisterSetter(bool_options, "enable_half_partial", [&](bool value) { if (value) { popart_options.partialsTypeMatMuls = "half"; + popart_options.convolutionOptions.insert({{"partialsType", "half"}}); } else { popart_options.partialsTypeMatMuls = "float"; + popart_options.convolutionOptions.insert({{"partialsType", "float"}}); } }); @@ -412,6 +413,15 @@ IpuStrategy::IpuStrategy() { RegisterGetter(map_options_getter, options_type, "gcl_options", "map", [&]() { return popart_options.gclOptions; }); + + // Default options + + // Can also be set as a custom logger in python, like using tqdm + popart_options.compilationProgressLogger = [](int progress, int total) { + if (progress % 10 == 0) { + VLOG(1) << "compile progress: " << progress << "%"; + } + }; } void IpuStrategy::AddBoolOption(const std::string& option, bool value) { @@ -513,6 +523,11 @@ void IpuStrategy::AddCustomOp(const std::string& paddle_op, IpuCustomOpIdentifier(paddle_op, popart_op, domain, version)); } +void IpuStrategy::SetCompilationProgressLogger( + const std::function& logger) { + popart_options.compilationProgressLogger = logger; +} + std::string IpuStrategy::GetOption(const std::string& option) { return get(option, options_getter); } diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index 1802eb16e5895..fa57dcd676d81 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -43,9 +43,6 @@ class IpuStrategy { // Flag for fp16, true for pure fp16 bool enable_fp16 = false; - // Enable transfer cast Op target from fp32 to fp16 in fp16 mode - bool transfer_cast_op = true; - // The mode of Adam/Lamb optimizer // false: The standard Adam/Lamb optimizer // true: The Adam_No_Bias/Lamb_No_Bias optimizer from PopART @@ -125,6 +122,8 @@ class IpuStrategy { const std::vector &values); void AddCustomOp(const std::string &paddle_op, const std::string &popart_op, const std::string &domain, int version); + void SetCompilationProgressLogger( + const std::function &logger); std::string GetOption(const std::string &); std::vector GetVectorOption(const std::string &); diff --git a/paddle/fluid/platform/device/ipu/ipu_utils.cc b/paddle/fluid/platform/device/ipu/ipu_utils.cc index 720de822608b6..43e4a6820c813 100644 --- a/paddle/fluid/platform/device/ipu/ipu_utils.cc +++ b/paddle/fluid/platform/device/ipu/ipu_utils.cc @@ -13,133 +13,111 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/platform/device/ipu/ipu_utils.h" + #include namespace paddle { namespace platform { namespace ipu { -void* PaddleIArray::data() { return tensor_.data(); } - -popart::DataType PaddleIArray::dataType() const { - return PdDataType2PopartType(tensor_.dtype()); -} - -std::size_t PaddleIArray::rank() const { return tensor_.dims().size(); } - -int64_t PaddleIArray::dim(size_t index) const { - return tensor_.dims().at(index); -} - -std::size_t PaddleIArray::nelms() const { - return std::accumulate(shape_.begin(), shape_.end(), static_cast(1), - std::multiplies()); -} - -const popart::Shape PaddleIArray::shape() const { return shape_; } - -popart::DataType VarType2PopartType( - const framework::proto::VarType::Type type) { +const popart::DataType VarType2PopartDType(const VarType::Type type) { switch (type) { - case framework::proto::VarType::UINT8: + case VarType::UINT8: return popart::DataType::UINT8; - case framework::proto::VarType::INT8: + case VarType::INT8: return popart::DataType::INT8; - case framework::proto::VarType::INT16: + case VarType::INT16: return popart::DataType::INT16; - case framework::proto::VarType::INT32: + case VarType::INT32: return popart::DataType::INT32; - case framework::proto::VarType::INT64: + case VarType::INT64: return popart::DataType::INT64; - case framework::proto::VarType::BOOL: + case VarType::BOOL: return popart::DataType::BOOL; - case framework::proto::VarType::FP64: + case VarType::FP64: return popart::DataType::DOUBLE; - case framework::proto::VarType::FP32: + case VarType::FP32: return popart::DataType::FLOAT; - case framework::proto::VarType::FP16: + case VarType::FP16: return popart::DataType::FLOAT16; - case framework::proto::VarType::BF16: + case VarType::BF16: return popart::DataType::BFLOAT16; - case framework::proto::VarType::COMPLEX64: + case VarType::COMPLEX64: return popart::DataType::COMPLEX64; - case framework::proto::VarType::COMPLEX128: + case VarType::COMPLEX128: return popart::DataType::COMPLEX128; default: - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "Unsupported Paddle var type.")); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported VarType::Type when converting to popart data type.")); } } -popart::DataType PdDataType2PopartType( - const paddle::experimental::DataType type) { +const popart::DataType PhiDType2PopartDType(const phi::DataType type) { switch (type) { - case paddle::experimental::DataType::UINT8: + case phi::DataType::UINT8: return popart::DataType::UINT8; - case paddle::experimental::DataType::INT8: + case phi::DataType::INT8: return popart::DataType::INT8; - case paddle::experimental::DataType::INT16: + case phi::DataType::INT16: return popart::DataType::INT16; - case paddle::experimental::DataType::INT32: + case phi::DataType::INT32: return popart::DataType::INT32; - case paddle::experimental::DataType::INT64: + case phi::DataType::INT64: return popart::DataType::INT64; - case paddle::experimental::DataType::BOOL: + case phi::DataType::BOOL: return popart::DataType::BOOL; - case paddle::experimental::DataType::FLOAT64: + case phi::DataType::FLOAT64: return popart::DataType::DOUBLE; - case paddle::experimental::DataType::FLOAT32: + case phi::DataType::FLOAT32: return popart::DataType::FLOAT; - case paddle::experimental::DataType::FLOAT16: + case phi::DataType::FLOAT16: return popart::DataType::FLOAT16; - case paddle::experimental::DataType::BFLOAT16: + case phi::DataType::BFLOAT16: return popart::DataType::BFLOAT16; - case paddle::experimental::DataType::COMPLEX64: + case 
phi::DataType::COMPLEX64: return popart::DataType::COMPLEX64; - case paddle::experimental::DataType::COMPLEX128: + case phi::DataType::COMPLEX128: return popart::DataType::COMPLEX128; default: - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "Unsupported Paddle data type.")); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported phi::DataType when converting to popart data type.")); } } -framework::proto::VarType::Type PopartType2VarType( - const popart::DataType type) { +const VarType::Type PopartDType2VarType(const popart::DataType type) { switch (type) { case popart::DataType::UINT8: - return framework::proto::VarType::UINT8; + return VarType::UINT8; case popart::DataType::INT8: - return framework::proto::VarType::INT8; + return VarType::INT8; case popart::DataType::INT16: - return framework::proto::VarType::INT16; + return VarType::INT16; case popart::DataType::INT32: - return framework::proto::VarType::INT32; + return VarType::INT32; case popart::DataType::INT64: - return framework::proto::VarType::INT64; + return VarType::INT64; case popart::DataType::BOOL: - return framework::proto::VarType::BOOL; + return VarType::BOOL; case popart::DataType::DOUBLE: - return framework::proto::VarType::FP64; + return VarType::FP64; case popart::DataType::FLOAT: - return framework::proto::VarType::FP32; + return VarType::FP32; case popart::DataType::FLOAT16: - return framework::proto::VarType::FP16; + return VarType::FP16; case popart::DataType::BFLOAT16: - return framework::proto::VarType::BF16; + return VarType::BF16; case popart::DataType::COMPLEX64: - return framework::proto::VarType::COMPLEX64; + return VarType::COMPLEX64; case popart::DataType::COMPLEX128: - return framework::proto::VarType::COMPLEX128; + return VarType::COMPLEX128; default: - PADDLE_THROW(paddle::platform::errors::Unavailable( - "Unsupported Paddle var type.")); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported popart::DataType when converting to var type.")); } } -popart::DataType OnnxDtype2PopartType(const int type) { - auto dtype = static_cast(type); - switch (dtype) { +const popart::DataType OnnxDType2PopartType(const ONNXDataType type) { + switch (type) { case ONNXDataType::BOOL: return popart::DataType::BOOL; case ONNXDataType::INT16: @@ -166,12 +144,69 @@ popart::DataType OnnxDtype2PopartType(const int type) { return popart::DataType::COMPLEX128; default: PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported ONNX data type: %d.", dtype)); + "Unsupported ONNXDataType when converting to popart data type.")); } } -// count num should > 0 -bool GetBoolEnv(std::string str) { +const ONNXDataType VarType2OnnxDType(const VarType::Type type) { + switch (type) { + case VarType::BOOL: + return ONNXDataType::BOOL; + case VarType::INT16: + return ONNXDataType::INT16; + case VarType::INT32: + return ONNXDataType::INT32; + case VarType::INT64: + return ONNXDataType::INT64; + case VarType::FP16: + return ONNXDataType::FLOAT16; + case VarType::FP32: + return ONNXDataType::FLOAT; + case VarType::FP64: + return ONNXDataType::DOUBLE; + case VarType::UINT8: + return ONNXDataType::UINT8; + case VarType::INT8: + return ONNXDataType::INT8; + case VarType::BF16: + return ONNXDataType::BFLOAT16; + case VarType::COMPLEX64: + return ONNXDataType::COMPLEX64; + case VarType::COMPLEX128: + return ONNXDataType::COMPLEX128; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported VarType::Type when converting to onnx data type.")); + } +} + +const std::string VarType2PopartStr(const VarType::Type 
type) { + switch (type) { + case VarType::UINT8: + return "UINT8"; + case VarType::INT8: + return "INT8"; + case VarType::INT16: + return "INT16"; + case VarType::INT32: + return "INT32"; + case VarType::INT64: + return "INT64"; + case VarType::BOOL: + return "BOOL"; + case VarType::FP64: + return "DOUBLE"; + case VarType::FP32: + return "FLOAT"; + case VarType::FP16: + return "FLOAT16"; + default: + PADDLE_THROW(platform::errors::Unavailable( + "Unsupported VarType::Type when converting to popart type string.")); + } +} + +const bool GetBoolEnv(const std::string& str) { char* str_val = getenv(str.c_str()); if (str_val == NULL) { return false; @@ -184,29 +219,7 @@ bool GetBoolEnv(std::string str) { } } -std::vector> GetOptPrePostfix( - const std::string& opt_type) { - // format: {popart_tensor_id, paddle_tensor_id}, ... - std::vector> pre_post_fix; - - if (opt_type == "adam" || opt_type == "lamb") { - pre_post_fix.push_back(std::make_pair("", "")); - pre_post_fix.push_back(std::make_pair("Accl1___", "_moment1_0")); - pre_post_fix.push_back(std::make_pair("Accl2___", "_moment2_0")); - pre_post_fix.push_back(std::make_pair("Step___", "_beta1_pow_acc_0")); - } else if (opt_type == "sgd" || opt_type == "momentum") { - // sgd - pre_post_fix.push_back(std::make_pair("", "")); - } else { - pre_post_fix.push_back(std::make_pair("", "")); - // - } - - return pre_post_fix; -} - -int RequestIpus(const int num_ipus) { - // num_ipus must be pow(2, n); +const int RequestIpus(const int num_ipus) { return std::pow(2, ceil(log2(num_ipus))); } diff --git a/paddle/fluid/platform/device/ipu/ipu_utils.h b/paddle/fluid/platform/device/ipu/ipu_utils.h index 7644513cc0207..2737f40295390 100644 --- a/paddle/fluid/platform/device/ipu/ipu_utils.h +++ b/paddle/fluid/platform/device/ipu/ipu_utils.h @@ -19,155 +19,32 @@ limitations under the License. 
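RequestIpus above rounds a request up to the next power of two, since IPU devices are acquired in power-of-two groups. A standalone sketch of exactly that rounding; RequestIpusSketch is an illustrative copy, not the patched function:

#include <cassert>
#include <cmath>

// Round num_ipus up to the next power of two, as RequestIpus does.
int RequestIpusSketch(const int num_ipus) {
  return static_cast<int>(std::pow(2, std::ceil(std::log2(num_ipus))));
}

int main() {
  assert(RequestIpusSketch(1) == 1);
  assert(RequestIpusSketch(3) == 4);  // rounded up to the next power of two
  assert(RequestIpusSketch(4) == 4);
  assert(RequestIpusSketch(5) == 8);
  return 0;
}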
*/ #include #include -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/float16.h" +using float16 = paddle::platform::float16; +using Tensor = paddle::framework::Tensor; +using LoDTensor = paddle::framework::LoDTensor; +using Scope = paddle::framework::Scope; +using OpDesc = paddle::framework::OpDesc; +using Graph = paddle::framework::ir::Graph; +using Node = paddle::framework::ir::Node; +using BlockDesc = paddle::framework::BlockDesc; +using VarType = paddle::framework::proto::VarType; + namespace paddle { namespace platform { namespace ipu { -using float16 = platform::float16; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using Scope = framework::Scope; -using OpDesc = framework::OpDesc; -using Graph = framework::ir::Graph; -using Node = framework::ir::Node; -using BlockDesc = framework::BlockDesc; - -// onnx dtype -// https://github.com/onnx/onnx/blob/master/onnx/onnx-ml.proto3 -enum ONNXDataType : int { - UNDEFINED = 0, - FLOAT = 1, - UINT8 = 2, - INT8 = 3, - UINT16 = 4, - INT16 = 5, - INT32 = 6, - INT64 = 7, - STRING = 8, - BOOL = 9, - FLOAT16 = 10, - DOUBLE = 11, - UINT32 = 12, - UINT64 = 13, - COMPLEX64 = 14, - COMPLEX128 = 15, - BFLOAT16 = 16 -}; - -class PaddleIArray final : public popart::IArray { - public: - explicit PaddleIArray(const Tensor* tensor) { - tensor_.ShareDataWith(*tensor); - for (int i = 0; i < tensor->dims().size(); ++i) { - shape_.push_back(tensor->dims().at(i)); - } - } - - public: - void* data(); - popart::DataType dataType() const; - std::size_t rank() const; - int64_t dim(size_t index) const; - std::size_t nelms() const; - const popart::Shape shape() const; - - private: - Tensor tensor_; - std::vector shape_; -}; - -popart::DataType VarType2PopartType(const framework::proto::VarType::Type type); -popart::DataType PdDataType2PopartType( - const paddle::experimental::DataType type); -framework::proto::VarType::Type PopartType2VarType(const popart::DataType type); -popart::DataType OnnxDtype2PopartType(const int type); -bool GetBoolEnv(std::string str); - -template -std::unique_ptr> Tensor2IArray(const Tensor& tensor) { - auto dtype = PdDataType2PopartType(tensor.dtype()); - auto shape = std::vector(); - for (size_t i = 0; i < tensor.dims().size(); ++i) { - shape.push_back(tensor.dims().at(i)); - } - popart::TensorInfo tensor_info(dtype, shape); - - return std::make_unique>( - reinterpret_cast(tensor.data()), tensor_info); -} - -template -std::unique_ptr> LoDTensor2IArray( - LoDTensor const& lod_tensor) { - if (lod_tensor.lod().size() == 0) { - return Tensor2IArray(lod_tensor); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("LoDTensor2IArray is Unimplemented")); - } -} - template T GetSingleVarFromScope(const Scope* scope, const std::string& var_name) { auto var = scope->GetVar(var_name); auto tensor = var->Get(); - // check dtype is ? 
return tensor.data()[0]; } -struct CustomOpAttrVisitor : public boost::static_visitor { - explicit CustomOpAttrVisitor(std::map* attr, - const std::string& attr_name) - : attrs_(attr), attr_name_(attr_name) {} - mutable std::map* attrs_; - std::string attr_name_; - - void operator()(int v) const { attrs_->emplace(attr_name_, v); } - void operator()(float v) const { attrs_->emplace(attr_name_, v); } - void operator()(const std::string& v) const { - attrs_->emplace(attr_name_, v); - } - void operator()(const std::vector& v) const { - attrs_->emplace(attr_name_, v); - } - void operator()(const std::vector& v) const { - attrs_->emplace(attr_name_, v); - } - void operator()(const std::vector& v) const { - attrs_->emplace(attr_name_, v); - } - void operator()(bool v) const { attrs_->emplace(attr_name_, v); } - void operator()(const std::vector& v) const { - attrs_->emplace(attr_name_, v); - } - void operator()(BlockDesc* desc) const { - PADDLE_THROW(platform::errors::Unavailable( - "Unsupported calling method for `BlockDesc` type.")); - } - void operator()(const std::vector& v) const { - PADDLE_THROW(platform::errors::Unavailable( - "Unsupported calling method for `BlockDesc` type.")); - } - void operator()(int64_t v) const { attrs_->emplace(attr_name_, v); } - void operator()(const std::vector& v) const { - attrs_->emplace(attr_name_, v); - } - void operator()(const std::vector& v) const { - attrs_->emplace(attr_name_, v); - } - void operator()(boost::blank) const { - PADDLE_THROW(platform::errors::Unavailable( - "Unsupported calling method for `boost::blank` type.")); - } -}; - struct IpuCustomOpIdentifier { IpuCustomOpIdentifier(const std::string& _paddle_op, const std::string& _popart_op, @@ -185,54 +62,44 @@ struct IpuCustomOpIdentifier { popart::OperatorIdentifier popart_op; }; -struct ConstantOpAttrVisitor : public boost::static_visitor { - explicit ConstantOpAttrVisitor(framework::LoDTensor* tensor, - framework::proto::VarType::Type dtype) - : tensor_(tensor), dtype_(dtype) {} - framework::LoDTensor* tensor_; - framework::proto::VarType::Type dtype_; - - void operator()(const std::vector& vec) const { - framework::TensorFromVector(vec, tensor_); - } - void operator()(const std::vector& vec) const { - if (dtype_ == framework::proto::VarType::FP16) { - std::vector vec_fp16; - std::transform(vec.begin(), vec.end(), std::back_inserter(vec_fp16), - [](float f) -> float16 { return float16(f); }); - framework::TensorFromVector(vec_fp16, tensor_); - } else { - framework::TensorFromVector(vec, tensor_); - } - } - void operator()(const std::vector& vec) const { - framework::TensorFromVector(vec, tensor_); - } - void operator()(const std::vector& vec) const { - framework::TensorFromVector(vec, tensor_); - } - void operator()(const std::vector& vec) const { - framework::TensorFromVector(vec, tensor_); - } - void RaiseError() const { - PADDLE_THROW( - platform::errors::InvalidArgument("Constant value must be a vector")); - } - void operator()(int v) const { RaiseError(); } - void operator()(float v) const { RaiseError(); } - void operator()(const std::string& v) const { RaiseError(); } - void operator()(const std::vector& v) const { RaiseError(); } - void operator()(bool v) const { RaiseError(); } - void operator()(BlockDesc* desc) const { RaiseError(); } - void operator()(const std::vector& v) const { RaiseError(); } - void operator()(int64_t v) const { RaiseError(); } - void operator()(boost::blank) const { RaiseError(); } +// Onnx dtype +// 
https://github.com/onnx/onnx/blob/master/onnx/onnx-ml.proto3 +enum ONNXDataType : int { + UNDEFINED = 0, + FLOAT = 1, + UINT8 = 2, + INT8 = 3, + UINT16 = 4, + INT16 = 5, + INT32 = 6, + INT64 = 7, + STRING = 8, + BOOL = 9, + FLOAT16 = 10, + DOUBLE = 11, + UINT32 = 12, + UINT64 = 13, + COMPLEX64 = 14, + COMPLEX128 = 15, + BFLOAT16 = 16 }; -std::vector<std::pair<std::string, std::string>> GetOptPrePostfix( - const std::string& opt_type); - -int RequestIpus(const int num_ipus); +// VarType::Type to popart::DataType +const popart::DataType VarType2PopartDType(const VarType::Type type); +// phi::DataType to popart::DataType +const popart::DataType PhiDType2PopartDType(const phi::DataType type); +// popart::DataType to VarType::Type +const VarType::Type PopartDType2VarType(const popart::DataType type); +// ONNXDataType to popart::DataType +const popart::DataType OnnxDType2PopartType(const ONNXDataType type); +// VarType::Type to ONNXDataType +const ONNXDataType VarType2OnnxDType(const VarType::Type type); +// VarType::Type to String in Popart +const std::string VarType2PopartStr(const VarType::Type type); +// Get bool from environment variable +const bool GetBoolEnv(const std::string& str); +// Requested number of IPUs must be pow(2, n) +const int RequestIpus(const int num_ipus); } // namespace ipu } // namespace platform diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc index ab9ddfde21873..254e566567424 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc @@ -56,15 +56,15 @@ Node *gelu_handler(Graph *graph, Node *node) { auto sqrt2 = CreateConst(graph, node, {}, {}, {{"value", std::vector<float>{1.4142135623730951}}, {"dims", std::vector<int64_t>{1}}, - {"dtype", GetOutputVarDtype(node)}}); + {"dtype", GetOutputVarDType(node)}}); auto zero_point_five = CreateConst(graph, node, {}, {}, {{"value", std::vector<float>{0.5}}, {"dims", std::vector<int64_t>{1}}, - {"dtype", GetOutputVarDtype(node)}}); + {"dtype", GetOutputVarDType(node)}}); auto one = CreateConst(graph, node, {}, {}, {{"value", std::vector<float>{1}}, {"dims", std::vector<int64_t>{1}}, - {"dtype", GetOutputVarDtype(node)}}); + {"dtype", GetOutputVarDType(node)}}); auto div = CreateBaseOp(graph, node, "popart_div", {GetInputVarNode("X", node), sqrt2->outputs[0]}, {}, {}); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc index 3d22f75d345d6..7a14d23698def 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc @@ -18,7 +18,6 @@ namespace paddle { namespace platform { namespace ipu { -// This avoids the static initialisation order fiasco, std::unordered_map<std::string, SymbolHandler> &SymbolHandlers() { static std::unordered_map<std::string, SymbolHandler> symbol_handlers; return symbol_handlers; } @@ -34,8 +33,6 @@ bool RegisterHandler(const std::string &symbol, const SymbolHandler &handler) { return new_handler; } -// Return a pointer to a handler if one is registered for this kind of node or -// an empty std::function otherwise. 
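For reference, this is the registry the canonicalization files feed: REGISTER_HANDLER adds an entry keyed by the paddle op type, and GetHandler (shown just below) returns an empty std::function when no entry exists, so the result must be tested before it is called. A hedged sketch of a typical call site, assuming the surrounding paddle::platform::ipu namespace and its SymbolHandler alias; CanonicalizeOneNode is illustrative:

// Sketch only: look up and invoke the handler registered for this op kind.
void CanonicalizeOneNode(Graph *graph, Node *node) {
  SymbolHandler handler = GetHandler(node->Op()->Type());
  if (handler) {
    // e.g. gelu_handler, not_equal_handler, ... returns the rewritten node.
    Node *new_node = handler(graph, node);
    (void)new_node;
  } else {
    VLOG(1) << "no popart handler registered for " << node->Op()->Type();
  }
}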
SymbolHandler GetHandler(const std::string &kind) { auto it = SymbolHandlers().find(kind); if (it != SymbolHandlers().end()) { @@ -84,66 +81,6 @@ void CopyOpAttr(const std::string &attr_name, OpDesc *op, OpDesc *new_op, } } -const int VarType2OnnxDtype(const int type) { - auto dtype = static_cast(type); - switch (dtype) { - case framework::proto::VarType::BOOL: - return static_cast(ONNXDataType::BOOL); - case framework::proto::VarType::INT16: - return static_cast(ONNXDataType::INT16); - case framework::proto::VarType::INT32: - return static_cast(ONNXDataType::INT32); - case framework::proto::VarType::INT64: - return static_cast(ONNXDataType::INT64); - case framework::proto::VarType::FP16: - return static_cast(ONNXDataType::FLOAT16); - case framework::proto::VarType::FP32: - return static_cast(ONNXDataType::FLOAT); - case framework::proto::VarType::FP64: - return static_cast(ONNXDataType::DOUBLE); - case framework::proto::VarType::UINT8: - return static_cast(ONNXDataType::UINT8); - case framework::proto::VarType::INT8: - return static_cast(ONNXDataType::INT8); - case framework::proto::VarType::BF16: - return static_cast(ONNXDataType::BFLOAT16); - case framework::proto::VarType::COMPLEX64: - return static_cast(ONNXDataType::COMPLEX64); - case framework::proto::VarType::COMPLEX128: - return static_cast(ONNXDataType::COMPLEX128); - default: - PADDLE_THROW( - platform::errors::Unimplemented("Unsupported data type: %d.", dtype)); - } -} - -const std::string VarType2PopStr(const int type) { - auto dtype = static_cast(type); - switch (dtype) { - case framework::proto::VarType::UINT8: - return "UINT8"; - case framework::proto::VarType::INT8: - return "INT8"; - case framework::proto::VarType::INT16: - return "INT16"; - case framework::proto::VarType::INT32: - return "INT32"; - case framework::proto::VarType::INT64: - return "INT64"; - case framework::proto::VarType::BOOL: - return "BOOL"; - case framework::proto::VarType::FP64: - return "DOUBLE"; - case framework::proto::VarType::FP32: - return "FLOAT"; - case framework::proto::VarType::FP16: - return "FLOAT16"; - default: - PADDLE_THROW( - paddle::platform::errors::Unavailable("Unsupported data type.")); - } -} - Node *GetInputVarNode(const std::string &input_name, const Node *op_node, const int id) { auto var_name = op_node->Op()->Input(input_name).at(id); @@ -180,7 +117,7 @@ const bool is_float_equal(float a, float b, float eps) { return std::fabs(a - b) <= eps; } -const int GetOutputVarDtype(const Node *node, const std::string &output_name) { +const int GetOutputVarDType(const Node *node, const std::string &output_name) { auto out_node = GetOutputVarNode(output_name, node); PADDLE_ENFORCE_NOT_NULL(out_node, platform::errors::Unavailable( "Node's out node does not exist.")); @@ -188,7 +125,7 @@ const int GetOutputVarDtype(const Node *node, const std::string &output_name) { PADDLE_ENFORCE_NOT_NULL( var, platform::errors::Unavailable("Node is not a variable.")); auto proto_var_type = var->GetDataType(); - return VarType2OnnxDtype(proto_var_type); + return static_cast(VarType2OnnxDType(proto_var_type)); } } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h index 32133e128c588..7ac6097e0cc14 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h @@ -68,9 +68,6 @@ void 
ClearNode(Node *node); void CopyOpAttr(const std::string &attr_name, OpDesc *op, OpDesc *new_op, bool override = false); -const int VarType2OnnxDtype(const int type); -const std::string VarType2PopStr(const int type); - Node *GetInputVarNode(const std::string &input_name, const Node *op_node, const int id = 0); Node *GetOutputVarNode(const std::string &output_name, const Node *op_node, @@ -81,7 +78,7 @@ Node *GetOutputVarNodeByVarName(const std::string &var_name, const Node *op_node); const bool is_float_equal(float a, float b, float eps = 1e-8); -const int GetOutputVarDtype(const Node *node, +const int GetOutputVarDType(const Node *node, const std::string &output_name = "Out"); } // namespace ipu diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc index 6f82acb5b7db3..99fb76c950681 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc @@ -28,6 +28,14 @@ Node *equal_handler(Graph *graph, Node *node) { return new_node; } +Node *not_equal_handler(Graph *graph, Node *node) { + auto equal_node = CreateBaseOp( + graph, node, "popart_equal", + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, {}); + return CreateBaseOp(graph, node, "popart_logical_not", + {equal_node->outputs[0]}, node->outputs, {}); +} + Node *logical_not_handler(Graph *graph, Node *node) { return CreateBaseOp(graph, node, "popart_logical_not", {GetInputVarNode("X", node)}, @@ -64,6 +72,7 @@ Node *less_than_handler(Graph *graph, Node *node) { } // namespace paddle REGISTER_HANDLER(equal, equal_handler); +REGISTER_HANDLER(not_equal, not_equal_handler); REGISTER_HANDLER(logical_not, logical_not_handler); REGISTER_HANDLER(logical_or, logical_or_handler); REGISTER_HANDLER(logical_and, logical_and_handler); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc index 444b55959cf22..af72f84c9d771 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc @@ -41,7 +41,7 @@ Node *pow_handler(Graph *graph, Node *node) { // Op(pow) -> Op(Constant)->Var(const_out)->Op(Pow) auto value_ = BOOST_GET_CONST(float, op->GetAttr("factor")); auto attrs = - MakeConstAttrMapFromValue(value_, {1}, GetOutputVarDtype(node)); + MakeConstAttrMapFromValue(value_, {1}, GetOutputVarDType(node)); auto new_node_const = CreateConst(graph, node, {}, {}, attrs); return CreateBaseOp(graph, node, "popart_pow", {GetInputVarNode("X", node), @@ -134,7 +134,7 @@ Node *matmul_handler(Graph *graph, Node *node) { } else { auto o_node = CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node}, {}); - auto attr = MakeConstAttrMapFromValue(alpha, {1}, GetOutputVarDtype(node)); + auto attr = MakeConstAttrMapFromValue(alpha, {1}, GetOutputVarDType(node)); auto const_node = CreateConst(graph, node, {}, {}, attr); return CreateBaseOp(graph, node, "popart_mul", {o_node->outputs[0], const_node->outputs[0]}, @@ -299,6 +299,80 @@ Node *cross_entropy2_handler(Graph *graph, Node *node) { } } +Node *softmax_with_cross_entropy_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto ignoreIndex = BOOST_GET_CONST(int, op->GetAttr("ignore_index")); + auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); + auto soft_label = BOOST_GET_CONST(bool, 
op->GetAttr("soft_label")); + if (soft_label) { + PADDLE_THROW(platform::errors::InvalidArgument( + "soft_label is not supported yet in IPU")); + } + Node *new_cast = nullptr; + if (GetInputVarNode("Label", node)->Var()->GetDataType() == + framework::proto::VarType::INT32) { + new_cast = GetInputVarNode("Label", node); + } else { + auto new_cast = CreateCast(graph, node, {GetInputVarNode("Label", node)}, + {}, framework::proto::VarType::INT32); + new_cast = new_cast->outputs[0]; + } + auto softmax_node = CreateSoftmaxOpset11( + graph, node, {GetInputVarNode("Logits", node)}, {}, axis); + + auto label_shape_ = GetInputVarNode("Label", node)->Var()->GetShape(); + if (label_shape_[label_shape_.size() - 1] != 1) { + auto log = CreateBaseOp(graph, node, "popart_log", + {softmax_node->outputs[0]}, {}, {}); + // softmax_with_cross_entropy is split to several ops in python. + // reduction is not needed here. + return CreateBaseOp( + graph, node, "popart_nllloss_v2", {log->outputs[0], new_cast}, + {GetOutputVarNode("Loss", node)}, + { + {"reduction", 2}, // popart::ReductionType::NoReduction + {"ignoreIndex", ignoreIndex}, + {"inputIsLogProbability", true}, + }); + } else { + std::vector new_shape_{label_shape_[0]}; + auto const_before_loss = CreateBaseOp( + graph, node, "popart_constant", {}, {}, + {{"value", new_shape_}, + {"dims", + std::vector{static_cast(new_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}); + + auto reshape_before_loss = + CreateBaseOp(graph, node, "popart_reshape", + {new_cast, const_before_loss->outputs[0]}, {}, {}); + + auto log = CreateBaseOp(graph, node, "popart_log", + {softmax_node->outputs[0]}, {}, {}); + auto nllloss = CreateBaseOp( + graph, node, "popart_nllloss_v2", + {log->outputs[0], reshape_before_loss->outputs[0]}, {}, + { + {"reduction", 2}, // popart::ReductionType::NoReduction + {"ignoreIndex", ignoreIndex}, + {"inputIsLogProbability", true}, + }); + + auto const_after_loss = CreateBaseOp( + graph, node, "popart_constant", {}, {}, + {{"value", label_shape_}, + {"dims", + std::vector{static_cast(label_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}); + + auto reshape_after_loss = + CreateBaseOp(graph, node, "popart_reshape", + {nllloss->outputs[0], const_after_loss->outputs[0]}, + {GetOutputVarNode("Loss", node)}, {}); + return reshape_after_loss; + } +} + Node *cumsum_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto exclusive = BOOST_GET_CONST(bool, op->GetAttr("exclusive")); @@ -378,6 +452,8 @@ REGISTER_HANDLER(matmul, matmul_handler); REGISTER_HANDLER(sum, sum_handler); REGISTER_HANDLER(softmax, softmax_handler); REGISTER_HANDLER(scale, scale_handler); +REGISTER_HANDLER(softmax_with_cross_entropy, + softmax_with_cross_entropy_handler); REGISTER_HANDLER(cross_entropy2, cross_entropy2_handler); REGISTER_HANDLER(cumsum, cumsum_handler); REGISTER_HANDLER(matmul_v2, matmul_v2_handler); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc index a08fbaa26d9ed..2e9913f58efbb 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc @@ -299,7 +299,7 @@ Node *dropout_handler(Graph *graph, Node *node) { CreateConst(graph, node, {}, {}, {{"value", std::vector{1 - dropout_prob_}}, {"dims", std::vector{1}}, - {"dtype", GetOutputVarDtype(node)}}); + {"dtype", GetOutputVarDType(node)}}); return CreateBaseOp(graph, node, "popart_mul", {GetInputVarNode("X", node), 
scale->outputs[0]}, {GetOutputVarNode("Out", node)}, {}); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc index 0339097d58790..0525bb66f1618 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc @@ -124,7 +124,7 @@ Node *CreateConst(Graph *graph, Node *node, const std::vector &inputs, Node *CreateCast(Graph *graph, Node *node, const std::vector &inputs, const std::vector &outputs, const int otype) { - auto to = VarType2PopStr(otype); + auto to = VarType2PopartStr(static_cast(otype)); return CreateBaseOp(graph, node, "popart_cast", inputs, outputs, {{"to", to}}); } diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h index de3788e437a42..f096beb9c4d77 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h @@ -17,8 +17,8 @@ #include "paddle/fluid/platform/device/ipu/ipu_names.h" #include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" -using paddle::framework::AttributeMap; -using paddle::framework::Attribute; +using AttributeMap = paddle::framework::AttributeMap; +using Attribute = paddle::framework::Attribute; namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc index 55c25bce15931..00926ee7a0b25 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc @@ -23,12 +23,14 @@ namespace { Node *fill_constant_handler(Graph *graph, Node *node) { auto *op = node->Op(); - if (!op->Input("ShapeTensor").empty()) { + auto op_inputs = op->Inputs(); + if (op_inputs.find("ShapeTensor") != op_inputs.end() && + !op->Input("ShapeTensor").empty()) { PADDLE_THROW( platform::errors::Unimplemented("op fill_constant with ShapeTensor")); } auto dtype_ = BOOST_GET_CONST(int, op->GetAttr("dtype")); - auto dtype = VarType2OnnxDtype(dtype_); + auto dtype = VarType2OnnxDType(static_cast(dtype_)); auto dims = BOOST_GET_CONST(std::vector, op->GetAttr("shape")); auto value_ = BOOST_GET_CONST(float, op->GetAttr("value")); size_t size = 1; @@ -37,19 +39,20 @@ Node *fill_constant_handler(Graph *graph, Node *node) { } Attribute value; switch (dtype_) { - case framework::proto::VarType::FP32: + case VarType::FP16: + case VarType::FP32: value = std::vector(size, value_); break; - case framework::proto::VarType::FP64: + case VarType::FP64: value = std::vector(size, value_); break; - case framework::proto::VarType::INT32: + case VarType::INT32: value = std::vector(size, value_); break; - case framework::proto::VarType::INT64: + case VarType::INT64: value = std::vector(size, value_); break; - case framework::proto::VarType::BOOL: + case VarType::BOOL: value = std::vector(size, value_); break; default: @@ -66,7 +69,7 @@ Node *gaussian_random_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto shape = BOOST_GET_CONST(std::vector, op->GetAttr("shape")); auto dtype_ = BOOST_GET_CONST(int, op->GetAttr("dtype")); - auto dtype = VarType2OnnxDtype(dtype_); + auto dtype = VarType2OnnxDType(static_cast(dtype_)); auto mean = 
BOOST_GET_CONST(float, op->GetAttr("mean")); auto scale = BOOST_GET_CONST(float, op->GetAttr("std")); // seed not work @@ -86,7 +89,7 @@ Node *uniform_random_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto shape = BOOST_GET_CONST(std::vector<int64_t>, op->GetAttr("shape")); auto dtype_ = BOOST_GET_CONST(int, op->GetAttr("dtype")); - auto dtype = VarType2OnnxDtype(dtype_); + auto dtype = VarType2OnnxDType(static_cast<VarType::Type>(dtype_)); auto high = BOOST_GET_CONST(float, op->GetAttr("max")); auto low = BOOST_GET_CONST(float, op->GetAttr("min")); // seed not work @@ -172,9 +175,21 @@ Node *squeeze_handler(Graph *graph, Node *node) { Node *cast_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto otype = BOOST_GET_CONST(int, op->GetAttr("out_dtype")); - auto new_node_cast = - CreateCast(graph, node, node->inputs, node->outputs, otype); - return new_node_cast; + auto new_node = CreateCast(graph, node, node->inputs, node->outputs, otype); + // Cast op created in mixed-precision has no pipeline attrs + auto &prev_nodes = node->inputs.front()->inputs; + if (!prev_nodes.empty()) { + auto *prev_op = prev_nodes.front()->Op(); + if (!new_node->Op()->HasAttr(sIpuIndexAttr) && + prev_op->HasAttr(sIpuIndexAttr)) { + CopyOpAttr(sIpuIndexAttr, prev_op, new_node->Op()); + } + if (!new_node->Op()->HasAttr(sIpuStageAttr) && + prev_op->HasAttr(sIpuStageAttr)) { + CopyOpAttr(sIpuStageAttr, prev_op, new_node->Op()); + } + } + return new_node; } Node *lookup_table_op_handler(Graph *graph, Node *node, @@ -192,7 +207,7 @@ Node *lookup_table_op_handler(Graph *graph, Node *node, auto concat_const = CreateConst(graph, node, {}, {}, {{"value", const_value_}, {"dims", const_shape_}, - {"dtype", GetOutputVarDtype(node)}}); + {"dtype", GetOutputVarDType(node)}}); auto axes = CreateConst(graph, node, {}, {}, {{"value", std::vector<int64_t>{0}}, {"dims", std::vector<int64_t>{1}}, @@ -397,7 +412,7 @@ Node *expand_handler(Graph *graph, Node *node) { // cast to int64 expand_times = CreateCast(graph, node, {GetInputVarNode("ExpandTimes", node)}, {}, - framework::proto::VarType::INT64); + VarType::INT64); } else { auto expand_times_i32 = BOOST_GET_CONST(std::vector<int>, op->GetAttr("expand_times")); @@ -423,27 +438,28 @@ Node *assign_handler(Graph *graph, Node *node) { Node *assign_value_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto dtype_ = BOOST_GET_CONST(int, op->GetAttr("dtype")); - auto dtype = VarType2OnnxDtype(dtype_); + auto dtype = VarType2OnnxDType(static_cast<VarType::Type>(dtype_)); auto dims_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("shape")); std::vector<int64_t> dims(dims_.begin(), dims_.end()); Attribute values; std::string value_name; switch (dtype_) { - case framework::proto::VarType::BOOL: { + case VarType::BOOL: { value_name = "bool_values"; auto vec_int = BOOST_GET_CONST(std::vector<int>, op->GetAttr(value_name)); std::vector<bool> vec_bool(vec_int.begin(), vec_int.end()); values = vec_bool; } break; - case framework::proto::VarType::INT32: + case VarType::INT32: value_name = "int32_values"; values = BOOST_GET_CONST(std::vector<int>, op->GetAttr(value_name)); break; - case framework::proto::VarType::FP32: + case VarType::FP16: + case VarType::FP32: value_name = "fp32_values"; values = BOOST_GET_CONST(std::vector<float>, op->GetAttr(value_name)); break; - case framework::proto::VarType::INT64: + case VarType::INT64: value_name = "int64_values"; values = BOOST_GET_CONST(std::vector<int64_t>, op->GetAttr(value_name)); break; @@ -463,39 +479,40 @@ Node *fill_any_like_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto value = BOOST_GET_CONST(float, 
op->GetAttr("value")); auto x_shape = GetInputVarNode("X", node)->Var()->GetShape(); - auto dtype = BOOST_GET_CONST(int, op->GetAttr("dtype")); - auto x_dtype = static_cast(dtype); + auto dtype_ = BOOST_GET_CONST(int, op->GetAttr("dtype")); + auto dtype = static_cast(dtype_); size_t size = 1; for (auto &dim : x_shape) { size *= dim; } Attribute out_value; - switch (x_dtype) { - case framework::proto::VarType::FP32: + switch (dtype) { + case VarType::FP16: + case VarType::FP32: out_value = std::vector(size, value); break; - case framework::proto::VarType::FP64: + case VarType::FP64: out_value = std::vector(size, value); break; - case framework::proto::VarType::INT32: + case VarType::INT32: out_value = std::vector(size, value); break; - case framework::proto::VarType::INT64: + case VarType::INT64: out_value = std::vector(size, value); break; - case framework::proto::VarType::BOOL: + case VarType::BOOL: out_value = std::vector(size, value); break; default: PADDLE_THROW( - platform::errors::Unimplemented("fill_any_like dtype: %d", x_dtype)); + platform::errors::Unimplemented("fill_any_like dtype: %d", dtype)); } return CreateConst(graph, node, node->inputs, node->outputs, AttributeMap{ {"value", out_value}, {"dims", x_shape}, - {"dtype", VarType2OnnxDtype(dtype)}, + {"dtype", VarType2OnnxDType(dtype)}, }); } @@ -538,8 +555,7 @@ Node *one_hot_v2_handler(Graph *graph, Node *node) { {"dims", std::vector{1}}, {"dtype", ONNXDataType::INT32}}); Node *value_tensor = nullptr; - if (GetOutputVarNode("Out", node)->Var()->GetDataType() == - framework::proto::VarType::FP16) { + if (GetOutputVarNode("Out", node)->Var()->GetDataType() == VarType::FP16) { value_tensor = CreateConst(graph, node, {}, {}, {{"value", std::vector{0, 1}}, {"dims", std::vector{2}}, diff --git a/paddle/fluid/platform/device/xpu/xpu_info.cc b/paddle/fluid/platform/device/xpu/xpu_info.cc index 6a58f7890f9fa..2e960c1c0dd9c 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.cc +++ b/paddle/fluid/platform/device/xpu/xpu_info.cc @@ -54,7 +54,10 @@ std::vector GetXPUSelectedDevices() { void MemcpySyncH2D(void* dst, const void* src, size_t count, const platform::XPUPlace& dst_place) { - phi::backends::xpu::MemcpySyncH2D(dst, src, count, dst_place); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.GetByPlace(dst_place); + dev_ctx->Wait(); + phi::backends::xpu::MemcpySyncH2D(dst, src, count, dst_place, *dev_ctx); } void MemcpySyncD2H(void* dst, const void* src, size_t count, diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index 99a1eb97de50a..43c9e63ac194b 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -42,6 +42,8 @@ XPUOpMap& get_kp_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"elementwise_floordiv", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})}, + {"elementwise_pow", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, // activation op {"exp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"hard_swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, @@ -105,6 +107,8 @@ XPUOpMap& get_kp_ops() { {"reduce_prod", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_all", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace())})}, {"reduce_any", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace())})}, + {"reduce_amax", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_amin", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, }; return s_xpu_kp_kernels; diff --git a/paddle/fluid/platform/dynload/cusparse.cc b/paddle/fluid/platform/dynload/cusparse.cc index ea7c502e3e681..998437997547b 100644 --- a/paddle/fluid/platform/dynload/cusparse.cc +++ b/paddle/fluid/platform/dynload/cusparse.cc @@ -24,13 +24,14 @@ namespace dynload { CUSPARSE_ROUTINE_EACH(DEFINE_WRAP); #endif -#ifdef CUBLAS_BLAS_ROUTINE_EACH_R2 -CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP); -#endif - #ifdef CUSPARSE_ROUTINE_EACH_11020 CUSPARSE_ROUTINE_EACH_11020(DEFINE_WRAP); #endif + +#ifdef CUSPARSE_ROUTINE_EACH_R2 +CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP); +#endif + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index c7a6bdc3cefae..772a7750fe90d 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -106,9 +106,6 @@ namespace phi { class ErrorSummary; } // namespace phi -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -DECLARE_int64(gpu_allocator_retry_time); -#endif DECLARE_int32(call_stack_level); namespace paddle { @@ -539,7 +536,7 @@ inline void retry_sleep(unsigned milliseconds) { ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ - paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \ + paddle::platform::retry_sleep(10000); \ __cond__ = (COND); \ ++retry_count; \ } \ @@ -727,7 +724,7 @@ inline void retry_sleep(unsigned millisecond) { ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ - ::paddle::platform::retry_sleep(FLAGS_gpu_allocator_retry_time); \ + ::paddle::platform::retry_sleep(10000); \ __cond__ = (COND); \ ++retry_count; \ } \ diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index ab8bf0529dcfc..6636fc8aca51d 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -258,13 +258,13 @@ void BindDistributed(py::module *m) { #else const platform::CUDAPlace &, #endif - int, int, int, int, int, bool, std::string>(), + int, int, int, int, int, bool, std::string, int, int>(), py::arg("store"), py::arg("rank"), py::arg("world_size"), py::arg("place"), py::arg("gid") = 0, py::arg("local_rank") = 0, py::arg("local_size") = 1, py::arg("gloo_rank") = 0, py::arg("gloo_size") = 1, py::arg("with_switch") = false, - py::arg("switch_endpoint") = "", - py::call_guard()); + py::arg("switch_endpoint") = "", py::arg("src_rank") = "", + py::arg("dst_rank") = "", py::call_guard()); #endif #if defined(PADDLE_WITH_ASCEND_CL) diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 4d7b50943d084..ac33eb2359c8c 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -119,8 +119,7 @@ static PyObject* eager_api_run_backward(PyObject* self, PyObject* args, auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1); egr::Backward(tensors, grad_tensors, CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2)); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -159,8 +158,7 @@ static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args, 
egr::EagerUtils::autograd_meta(&(src))->StopGradient()); egr::EagerUtils::autograd_meta(&dst)->SetPersistable( egr::EagerUtils::autograd_meta(&(src))->Persistable()); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -406,12 +404,9 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, if (slot_map[0].find(i) != slot_map[0].end()) { grad_node->SetGradOutMeta(in_tensors, slot_map[0][i]); - grad_node->AddEdges(&ins_auto_grad_metas[i], slot_map[0][i]); } else { grad_node->SetGradOutMeta(in_tensors, ins_auto_grad_metas.size() - 1 - no_grad_cnt); - grad_node->AddEdges(&ins_auto_grad_metas[i], - ins_auto_grad_metas.size() - 1 - no_grad_cnt); no_grad_cnt++; } } @@ -458,8 +453,7 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, } grad_node->SetAttrs(attrs); } - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -691,8 +685,7 @@ static PyObject* eager_api_async_read(PyObject* self, PyObject* args, cudaMemcpyAsync(dst_data + (numel * size), buffer_tensor->data(), index_tensor.numel() * size * sizeof(float), cudaMemcpyHostToDevice, stream); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -774,8 +767,7 @@ static PyObject* eager_api_async_write(PyObject* self, PyObject* args, cudaMemcpyDeviceToHost, stream); src_offset += c; } - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE EAGER_CATCH_AND_THROW_RETURN_NULL } diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index e6bd1c0b52682..d3393b7cb57ac 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -267,8 +267,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, } else { PADDLE_THROW(platform::errors::InvalidArgument( "Tensor.numpy() only support cpu tensor.")); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE } return array; @@ -335,8 +334,7 @@ static PyObject* tensor_method_numpy_for_string_tensor(TensorObject* self, } else { PADDLE_THROW(platform::errors::InvalidArgument( "StringTensor.numpy() only support cpu tensor.")); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE } EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -405,8 +403,8 @@ static PyObject* tensor_method_reconstruct_from_(TensorObject* self, VLOG(6) << "Finished Reconstructing Tensor from" << src_tensor.name() << " to " << self->tensor.name(); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -436,8 +434,8 @@ static PyObject* tensor_method_copy_(TensorObject* self, PyObject* args, VLOG(6) << "Finish Copy Tensor " << src_tensor.name() << " to " << self->tensor.name(); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -453,8 +451,8 @@ static PyObject* tensor_retain_grads(TensorObject* self, PyObject* args, } egr::egr_utils_api::RetainGradForTensor(self->tensor); } - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -505,8 +503,8 @@ static PyObject* tensor_clear_gradient(TensorObject* self, PyObject* args, } } - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -535,8 +533,8 @@ static PyObject* tensor__zero_grads(TensorObject* self, PyObject* args, } } - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -559,8 +557,8 @@ static PyObject* 
tensor__share_buffer_to(TensorObject* self, PyObject* args, static_cast(dst_ptr->impl().get()); dst_tensor->ShareBufferWith(*src_tensor); dst_tensor->ShareDataTypeWith(*src_tensor); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -600,8 +598,8 @@ static PyObject* tensor__share_underline_tensor_to(TensorObject* self, "src tensor before share_buffer_with to other.", self->tensor.name())); src_ptr->set_impl(self->tensor.impl()); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -656,8 +654,7 @@ static PyObject* tensor_method_get_underline_tensor(TensorObject* self, PyObject* kwargs) { EAGER_TRY if (!self->tensor.defined()) { - Py_IncRef(Py_None); - return Py_None; + RETURN_PY_NONE } if (self->tensor.is_dense_tensor()) { auto* tensor = @@ -665,8 +662,7 @@ static PyObject* tensor_method_get_underline_tensor(TensorObject* self, VLOG(6) << "tensor: " << tensor->IsInitialized(); return ToPyObject(tensor); } else { - Py_IncRef(Py_None); - return Py_None; + RETURN_PY_NONE } EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -676,16 +672,14 @@ static PyObject* tensor_method_get_underline_selected_rows(TensorObject* self, PyObject* kwargs) { EAGER_TRY if (!self->tensor.defined()) { - Py_IncRef(Py_None); - return Py_None; + RETURN_PY_NONE } if (self->tensor.is_selected_rows()) { auto* selected_rows = static_cast(self->tensor.impl().get()); return ToPyObject(selected_rows); } else { - Py_IncRef(Py_None); - return Py_None; + RETURN_PY_NONE } EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -1110,8 +1104,8 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, false); } } - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -1202,8 +1196,8 @@ static PyObject* tensor_register_reduce_hook(TensorObject* self, PyObject* args, accumulation_grad_node->RegisterReduceHook( std::make_shared(hook_func)); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -1218,7 +1212,8 @@ static PyObject* tensor__set_grad_type(TensorObject* self, PyObject* args, } else if (var_type == framework::proto::VarType::SELECTED_ROWS) { grad_tensor->set_impl(std::make_shared()); } - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -1226,7 +1221,8 @@ static PyObject* tensor__clear(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY self->tensor.reset(); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -1254,8 +1250,8 @@ static PyObject* tensor__copy_gradient_from(TensorObject* self, PyObject* args, "Tensor %s has not been initialized", src.name())); p_grad->set_impl(src.impl()); } - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } static PyObject* tensor_method_get_non_zero_indices(TensorObject* self, @@ -1396,7 +1392,7 @@ static PyObject* tensor__bump_inplace_version(TensorObject* self, PyObject* kwargs) { EAGER_TRY self->tensor.bump_inplace_version(); - return Py_None; + RETURN_PY_NONE EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -1446,8 +1442,8 @@ static PyObject* tensor__reset_grad_inplace_version(TensorObject* self, grad->initialized()) { grad->reset_inplace_version(set_to_zero); } - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -1479,8 +1475,8 @@ static PyObject* tensor_method__share_memory(TensorObject* self, PyObject* args, #else PADDLE_THROW(platform::errors::PermissionDenied( "Sharing memory in 
Windows OS is not supported currently")); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + #endif EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -1522,8 +1518,7 @@ static PyObject* tensor__grad_value(TensorObject* self, PyObject* args, "cleared the grad inside autograd_meta")); if (!grad->defined()) { - Py_IncRef(Py_None); - return Py_None; + RETURN_PY_NONE } if (grad->is_dense_tensor()) { auto* grad_tensor = @@ -1532,8 +1527,7 @@ } else { PADDLE_THROW(paddle::platform::errors::Fatal( "this method is only supported for DenseTensor")); - Py_IncRef(Py_None); - return Py_None; + RETURN_PY_NONE } EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -1556,8 +1550,8 @@ static PyObject* tensor_method__uva(TensorObject* self, PyObject* args, static_cast(self->tensor.impl().get()); tensor_uva(self_tensor, device_id); - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL } #endif diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index 2ac12165c1a66..b546aa2d76bcd 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -36,6 +36,11 @@ // phi #include "paddle/phi/kernels/declarations.h" +static std::string LegalizeVarName(const std::string& var_name) { + std::string ret = var_name; + std::replace(ret.begin(), ret.end(), '@', '_'); // replace all '@' with '_' + return ret; +} // clang-format off const char* OUT_INITIALIZER_TEMPLATE = R"({"%s", {std::shared_ptr<imperative::VarBase>(new imperative::VarBase("auto_"+std::to_string(VarBaseUniqueNameID++)+"_"))}})"; @@ -185,18 +190,19 @@ std::string GenerateOpFunctionsBody( continue; } const auto in_type = input.duplicable() ? IN_VAR_LIST_TYPE : IN_VAR_TYPE; - auto input_arg = - paddle::string::Sprintf(ARG_TEMPLATE, in_type, TempName(in_name)); + auto input_arg = paddle::string::Sprintf( + ARG_TEMPLATE, in_type, TempName(LegalizeVarName(in_name))); input_args += input_arg; input_args += ","; input_args_num++; const auto in_cast_type = input.duplicable() ? CAST_VAR_LIST_TEMPLATE : CAST_VAR_TEMPLATE; auto dispensable = input.dispensable() ? "true" : "false"; - ins_cast_str += paddle::string::Sprintf(in_cast_type, in_name, op_type, - in_name, arg_idx++, dispensable); + ins_cast_str += + paddle::string::Sprintf(in_cast_type, LegalizeVarName(in_name), op_type, + in_name, arg_idx++, dispensable); - call_api_str += in_name + ", "; + call_api_str += LegalizeVarName(in_name) + ", "; } if (!input_args.empty() && input_args.back() == ',') { @@ -224,7 +230,7 @@ std::string GenerateOpFunctionsBody( input_args += ","; } input_args += out_type; - input_args += out_name; + input_args += LegalizeVarName(out_name); input_args_num++; if (output.dispensable()) { @@ -237,18 +243,19 @@ std::string GenerateOpFunctionsBody( const auto out_template = output.duplicable() ? INPUT_LIST_INITIALIZER_TEMPLATE : INPUT_INITIALIZER_TEMPLATE; - outs_initializer += - paddle::string::Sprintf(out_template, out_name, out_name); + outs_initializer += paddle::string::Sprintf(out_template, out_name, + LegalizeVarName(out_name)); outs_initializer += ","; } const auto in_cast_type = output.duplicable() ? CAST_VAR_PTR_LIST_TEMPLATE : CAST_VAR_PTR_TEMPLATE; auto dispensable = output.dispensable() ? 
"true" : "false"; - ins_cast_str += paddle::string::Sprintf(in_cast_type, out_name, op_type, - out_name, arg_idx++, dispensable); + ins_cast_str += + paddle::string::Sprintf(in_cast_type, LegalizeVarName(out_name), + op_type, out_name, arg_idx++, dispensable); - call_api_str += out_name + ", "; + call_api_str += LegalizeVarName(out_name) + ", "; } else { // There are few Operators that have duplicable output, like `Out` in // split op. We need to specify the number of variables for the @@ -257,7 +264,8 @@ std::string GenerateOpFunctionsBody( if (input_args != "") { input_args += ","; } - auto out_num_str = paddle::string::Sprintf(ARG_OUT_NUM, out_name); + auto out_num_str = + paddle::string::Sprintf(ARG_OUT_NUM, LegalizeVarName(out_name)); input_args += ARG_OUT_NUM_TYPE; input_args += out_num_str; input_args_num++; diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index de66308a7baf6..7af221b9ac82e 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -52,8 +52,7 @@ PyObject* tensor_properties_get_type(TensorObject* self, void* closure) { } else if (self->tensor.is_selected_rows()) { return ToPyObject(paddle::framework::proto::VarType::SELECTED_ROWS); } else { - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE } EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -87,8 +86,7 @@ PyObject* tensor_properties_get_grad(TensorObject* self, void* closure) { if (meta && meta->Grad().initialized()) { return ToPyObject(meta->Grad()); } else { - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE } EAGER_CATCH_AND_THROW_RETURN_NULL } diff --git a/paddle/fluid/pybind/eager_py_layer.cc b/paddle/fluid/pybind/eager_py_layer.cc index 605056e7af2b5..47a5309d691f5 100644 --- a/paddle/fluid/pybind/eager_py_layer.cc +++ b/paddle/fluid/pybind/eager_py_layer.cc @@ -346,10 +346,8 @@ PyObject* pylayer_method_apply(PyObject* cls, PyObject* args, for (auto t : inputs_tensor[i]) { grad_node->SetGradOutMeta(*t, i); } - grad_node->AddEdges(&inputs_autograd_meta[i], i); } else { grad_node->SetGradOutMeta(*inputs_tensor[i][0], i); - grad_node->AddEdges(inputs_autograd_meta[i][0], i); } } @@ -392,8 +390,7 @@ PyObject* pylayer_method_register_hook(PyObject* _self, PyObject* hook) { PyObject* tensor_properties_get_container(PyLayerObject* self, void* closure) { EAGER_TRY if (self->container == nullptr) { - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE; } Py_INCREF(self->container); return self->container; @@ -414,8 +411,7 @@ PyObject* tensor_properties_get_non_differentiable(PyLayerObject* self, void* closure) { EAGER_TRY if (self->non_differentiable == nullptr) { - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE; } Py_INCREF(self->non_differentiable); return self->non_differentiable; @@ -436,8 +432,7 @@ PyObject* tensor_properties_get_dirty_tensors(PyLayerObject* self, void* closure) { EAGER_TRY if (self->dirty_tensors == nullptr) { - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE; } Py_INCREF(self->dirty_tensors); return self->dirty_tensors; diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index d07cbd5ee21a2..90d7024f7a746 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -516,8 +516,7 @@ PyObject* ToPyObject(const std::string& value) { PyObject* ToPyObject(const paddle::experimental::Tensor& value, bool return_py_none_if_not_initialize) { if (return_py_none_if_not_initialize && !value.initialized()) { - Py_INCREF(Py_None); - return 
Py_None; + RETURN_PY_NONE } PyObject* obj = nullptr; if (value.initialized() && value.is_string_tensor()) { @@ -679,8 +678,7 @@ PyObject* ToPyObject(const phi::SelectedRows* value) { PyObject* ToPyObject(const void* value) { if (value == nullptr) { - Py_INCREF(Py_None); - return Py_None; + RETURN_PY_NONE } PADDLE_THROW( platform::errors::Fatal("ToPyObject do not support void* with value.")); diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index c4ddb34763228..5273433208d11 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -31,6 +31,10 @@ class Scope; } namespace pybind { +#define RETURN_PY_NONE \ + Py_INCREF(Py_None); \ + return Py_None; + int TensorDtype2NumpyDtype(phi::DataType dtype); bool IsEagerTensor(PyObject* obj); diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index 4df43dc1a3a52..bcf55e46edb76 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -325,14 +325,18 @@ void BindNeighborSampleResult(py::module* m) { py::class_(*m, "NeighborSampleResult") .def(py::init<>()) .def("initialize", &NeighborSampleResult::initialize) + .def("get_len", &NeighborSampleResult::get_len) + .def("get_val", &NeighborSampleResult::get_actual_val) .def("display", &NeighborSampleResult::display); } void BindGraphGpuWrapper(py::module* m) { py::class_(*m, "GraphGpuWrapper") - .def(py::init<>()) + // nit<>()) //.def("test", &GraphGpuWrapper::test) - .def("initialize", &GraphGpuWrapper::initialize) + //.def(py::init([]() { return framework::GraphGpuWrapper::GetInstance(); + //})) + .def(py::init<>()) .def("neighbor_sample", &GraphGpuWrapper::graph_neighbor_sample_v3) .def("graph_neighbor_sample", &GraphGpuWrapper::graph_neighbor_sample) .def("set_device", &GraphGpuWrapper::set_device) @@ -342,6 +346,15 @@ void BindGraphGpuWrapper(py::module* m) { .def("add_table_feat_conf", &GraphGpuWrapper::add_table_feat_conf) .def("load_edge_file", &GraphGpuWrapper::load_edge_file) .def("upload_batch", &GraphGpuWrapper::upload_batch) + .def("get_all_id", &GraphGpuWrapper::get_all_id) + .def("load_next_partition", &GraphGpuWrapper::load_next_partition) + .def("make_partitions", &GraphGpuWrapper::make_partitions) + .def("make_complementary_graph", + &GraphGpuWrapper::make_complementary_graph) + .def("set_search_level", &GraphGpuWrapper::set_search_level) + .def("init_search_level", &GraphGpuWrapper::init_search_level) + .def("get_partition_num", &GraphGpuWrapper::get_partition_num) + .def("get_partition", &GraphGpuWrapper::get_partition) .def("load_node_file", &GraphGpuWrapper::load_node_file); } #endif diff --git a/paddle/fluid/pybind/op_function.h b/paddle/fluid/pybind/op_function.h index 7b9379df6be2c..5a5650e75665c 100644 --- a/paddle/fluid/pybind/op_function.h +++ b/paddle/fluid/pybind/op_function.h @@ -177,7 +177,7 @@ static inline void HandleViewBetweenInputAndOutput( } } -PyObject* MakeReturnPyObject( +static inline PyObject* MakeReturnPyObject( const std::shared_ptr& out) { return ::pybind11::detail::type_caster_base::cast_holder( ::pybind11::detail::holder_helper< @@ -186,7 +186,7 @@ PyObject* MakeReturnPyObject( .ptr(); } -PyObject* MakeReturnPyObject( +static inline PyObject* MakeReturnPyObject( const std::vector>& out) { PyObject* result = PyList_New((Py_ssize_t)out.size()); diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc index 5eed63d0800b3..0e9c08cff2859 100644 --- a/paddle/fluid/pybind/op_function_common.cc +++ 
b/paddle/fluid/pybind/op_function_common.cc
@@ -282,6 +282,7 @@ std::vector CastPyArg2Ints(PyObject* obj, const std::string& op_type,
  std::vector value;
  if (PyList_Check(obj)) {
    Py_ssize_t len = PyList_Size(obj);
+    value.reserve(len);
    PyObject* item = nullptr;
    for (Py_ssize_t i = 0; i < len; i++) {
      item = PyList_GetItem(obj, i);
@@ -298,6 +299,7 @@ std::vector CastPyArg2Ints(PyObject* obj, const std::string& op_type,
    }
  } else if (PyTuple_Check(obj)) {
    Py_ssize_t len = PyTuple_Size(obj);
+    value.reserve(len);
    PyObject* item = nullptr;
    for (Py_ssize_t i = 0; i < len; i++) {
      item = PyTuple_GetItem(obj, i);
@@ -314,6 +316,7 @@ std::vector CastPyArg2Ints(PyObject* obj, const std::string& op_type,
    }
  } else if (PySequence_Check(obj)) {
    Py_ssize_t len = PySequence_Size(obj);
+    value.reserve(len);
    PyObject* item = nullptr;
    for (Py_ssize_t i = 0; i < len; i++) {
      item = PySequence_GetItem(obj, i);
diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index 9d5bcfac494cb..a905c5befc2b0 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -35,6 +35,12 @@
// phi
#include "paddle/phi/kernels/declarations.h"
+static std::string LegalizeVarName(const std::string& var_name) {
+  std::string ret = var_name;
+  std::replace(ret.begin(), ret.end(), '@', '_');  // replace all '@' with '_'
+  return ret;
+}
+
// NOTE(pangyoki): Inplace OP with duplicable input.
// The set includes inplace ops that have duplicable input.
// The first Varbase in input needs to be specified for the inplace strategy
@@ -81,13 +87,13 @@ const char* OUT_VAR_TYPE = R"(std::shared_ptr)";
const char* OUT_VAR_LIST_TYPE = R"(std::vector>)";
const char* CAST_VAR_TEMPLATE = R"(
-    auto %s = GetVarBaseFromArgs("%s", "%s", args, %d, %s);)";
+    auto %s = GetVarBaseFromArgs(op_type, "%s", args, %d, %s);)";
const char* CAST_VAR_LIST_TEMPLATE = R"(
-    auto %s = GetVarBaseListFromArgs("%s", "%s", args, %d, %s);)";
+    auto %s = GetVarBaseListFromArgs(op_type, "%s", args, %d, %s);)";
const char* CAST_SIZE_T_TEMPLATE = R"(
-    auto %s = GetUnsignedLongFromArgs("%s", "%s", args, %d, %s);)";
+    auto %s = GetUnsignedLongFromArgs(op_type, "%s", args, %d, %s);)";
const char* ARG_TEMPLATE = R"(const %s& %s)";
@@ -126,16 +132,17 @@ static PyObject * %s(PyObject *self, PyObject *args, PyObject *kwargs)
  PyThreadState *tstate = nullptr;
  try {
+    std::string op_type = "%s";
    platform::RecordEvent op_type_record_event("%s pybind_imperative_func");
    %s
    framework::AttributeMap attrs;
-    ConstructAttrMapFromPyArgs("%s", args, %d, PyTuple_GET_SIZE(args) , attrs);
+    ConstructAttrMapFromPyArgs(op_type, args, %d, PyTuple_GET_SIZE(args) , attrs);
    tstate = PyEval_SaveThread();
    %s
    imperative::NameVarBaseMap outs = %s;
    imperative::NameVarBaseMap ins = %s;
    %s
-    imperative::GetCurrentTracer()->TraceOp("%s", ins, outs, attrs, {%s});
+    imperative::GetCurrentTracer()->TraceOp(op_type, ins, outs, attrs, {%s});
    PyEval_RestoreThread(tstate);
    tstate = nullptr;
    %s
@@ -200,28 +207,31 @@ std::string GenerateOpFunctionsBody(
      continue;
    }
    const auto in_type = input.duplicable() ? IN_VAR_LIST_TYPE : IN_VAR_TYPE;
-    auto input_arg =
-        paddle::string::Sprintf(ARG_TEMPLATE, in_type, TempName(in_name));
+    auto input_arg = paddle::string::Sprintf(
+        ARG_TEMPLATE, in_type, LegalizeVarName(TempName(in_name)));
    input_args += input_arg;
    input_args += ",";
    input_args_num++;
    const auto in_cast_type =
        input.duplicable() ? CAST_VAR_LIST_TEMPLATE : CAST_VAR_TEMPLATE;
    auto dispensable = input.dispensable() ?
"true" : "false"; - ins_cast_str += paddle::string::Sprintf(in_cast_type, in_name, op_type, - in_name, arg_idx++, dispensable); + ins_cast_str += + paddle::string::Sprintf(in_cast_type, LegalizeVarName(in_name), in_name, + arg_idx++, dispensable); if (input.dispensable()) { const auto in_template = input.duplicable() ? INPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST : INPUT_INITIALIZER_TEMPLATE_WITH_NULL; ins_initializer_with_null += - paddle::string::Sprintf(in_template, in_name, in_name, in_name); + paddle::string::Sprintf(in_template, LegalizeVarName(in_name), + in_name, LegalizeVarName(in_name)); } else { const auto in_template = input.duplicable() ? INPUT_LIST_INITIALIZER_TEMPLATE : INPUT_INITIALIZER_TEMPLATE; - ins_initializer += paddle::string::Sprintf(in_template, in_name, in_name); + ins_initializer += paddle::string::Sprintf(in_template, in_name, + LegalizeVarName(in_name)); ins_initializer += ","; } } @@ -258,7 +268,7 @@ std::string GenerateOpFunctionsBody( input_args += ","; } input_args += out_type; - input_args += out_name; + input_args += LegalizeVarName(out_name); input_args_num++; if (output.dispensable()) { @@ -271,16 +281,17 @@ std::string GenerateOpFunctionsBody( const auto out_template = output.duplicable() ? INPUT_LIST_INITIALIZER_TEMPLATE : INPUT_INITIALIZER_TEMPLATE; - outs_initializer += - paddle::string::Sprintf(out_template, out_name, out_name); + outs_initializer += paddle::string::Sprintf(out_template, out_name, + LegalizeVarName(out_name)); outs_initializer += ","; } const auto in_cast_type = output.duplicable() ? CAST_VAR_LIST_TEMPLATE : CAST_VAR_TEMPLATE; auto dispensable = output.dispensable() ? "true" : "false"; - ins_cast_str += paddle::string::Sprintf(in_cast_type, out_name, op_type, - out_name, arg_idx++, dispensable); + ins_cast_str += + paddle::string::Sprintf(in_cast_type, LegalizeVarName(out_name), + out_name, arg_idx++, dispensable); } else if (use_inplace_strategy && inplace_map.count(out_name)) { PADDLE_ENFORCE_NE( inplace_map[out_name], "", @@ -306,11 +317,13 @@ std::string GenerateOpFunctionsBody( // Leaf Var that doesn't stop gradient can't use inplace strategy. // Increase inplace_version. inplace_strategy_str += paddle::string::Sprintf( - INPLACE_STRATEGY_TEMPLATE, inplace_input_name, inplace_input_name, - INPLACE_LEAF_ERROR_MESSAGE, inplace_input_name, inplace_input_name, - inplace_input_name); - outs_initializer += - paddle::string::Sprintf(out_template, out_name, inplace_input_name); + INPLACE_STRATEGY_TEMPLATE, LegalizeVarName(inplace_input_name), + LegalizeVarName(inplace_input_name), INPLACE_LEAF_ERROR_MESSAGE, + LegalizeVarName(inplace_input_name), + LegalizeVarName(inplace_input_name), + LegalizeVarName(inplace_input_name)); + outs_initializer += paddle::string::Sprintf( + out_template, out_name, LegalizeVarName(inplace_input_name)); outs_initializer += ","; } else { // There are few Operators that have duplicable output, like `Out` in @@ -320,7 +333,8 @@ std::string GenerateOpFunctionsBody( if (input_args != "") { input_args += ","; } - auto out_num_str = paddle::string::Sprintf(ARG_OUT_NUM, out_name); + auto out_num_str = + paddle::string::Sprintf(ARG_OUT_NUM, LegalizeVarName(out_name)); input_args += ARG_OUT_NUM_TYPE; input_args += out_num_str; input_args_num++; @@ -329,7 +343,7 @@ std::string GenerateOpFunctionsBody( auto dispensable = output.dispensable() ? 
"true" : "false"; ins_cast_str += - paddle::string::Sprintf(CAST_SIZE_T_TEMPLATE, out_num_str, op_type, + paddle::string::Sprintf(CAST_SIZE_T_TEMPLATE, out_num_str, out_num_str, arg_idx++, dispensable); } else { outs_initializer += @@ -358,7 +372,7 @@ std::string GenerateOpFunctionsBody( viwe_input_name, viwe_output_name); } if (outs_num == 0) { - return_str = "Py_INCREF(Py_None);\n return Py_None;"; + return_str = "RETURN_PY_NONE"; } else if (outs_num == 1) { return_str = "return MakeReturnPyObject(" + return_str + ");"; } else { @@ -375,11 +389,11 @@ std::string GenerateOpFunctionsBody( // generate op funtcion body auto op_function_str = paddle::string::Sprintf( - OP_FUNCTION_TEMPLATE, func_name, op_type, ins_cast_str, op_type, + OP_FUNCTION_TEMPLATE, func_name, op_type, op_type, ins_cast_str, input_args_num, inplace_strategy_str, outs_initializer, ins_initializer, ins_initializer_with_null + outs_initializer_with_null + view_strategy_str, - op_type, inplace_mapping_str, return_str); + inplace_mapping_str, return_str); return op_function_str; } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3a242fe2582a5..602a0345b04fe 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -3022,6 +3022,10 @@ All parameter, weight, gradient are variables in Paddle. // Only GPUs with Compute Capability >= 53 support float16 return platform::GetGPUComputeCapability(place.device) >= 53; }); + m.def("is_bfloat16_supported", [](const platform::CUDAPlace &place) -> bool { + // Only GPUs with Compute Capability >= 80 support bfloat16 + return platform::GetGPUComputeCapability(place.device) >= 80; + }); #endif m.def("set_feed_variable", @@ -4353,7 +4357,10 @@ All parameter, weight, gradient are variables in Paddle. 
for (auto element : opt) { auto option_name = element.first.cast(); VLOG(10) << "Set option: " << option_name; - if (py::isinstance(element.second)) { + if (option_name == "compilation_progress_logger") { + self.SetCompilationProgressLogger( + element.second.cast()); + } else if (py::isinstance(element.second)) { self.AddBoolOption(option_name, element.second.cast()); } else if (py::isinstance(element.second)) { self.AddDoubleOption(option_name, diff --git a/paddle/phi/api/lib/kernel_dispatch.h b/paddle/phi/api/lib/kernel_dispatch.h index be545ac9ce2f7..9f2ad6c62c7cf 100644 --- a/paddle/phi/api/lib/kernel_dispatch.h +++ b/paddle/phi/api/lib/kernel_dispatch.h @@ -96,8 +96,7 @@ struct KernelKeyParser : ArgsIterator { // TODO(chenweihang): deal with multiple diff input Tensors // TODO(chenweihang): add global device guard method to set backend - void operator()(const Tensor& x) { - const phi::TensorBase& tensor = *x.impl(); + inline void AssignKernelKeySet(const phi::TensorBase& tensor) { key_set.backend_set = key_set.backend_set | detail::GetTensorBackendSet(tensor); // TODO(chenweihang): select multi layout and dtype @@ -110,6 +109,8 @@ struct KernelKeyParser : ArgsIterator { } } + void operator()(const Tensor& x) { AssignKernelKeySet(*x.impl()); } + void operator()(const std::vector& x) { const phi::TensorBase& tensor = *x.at(0).impl(); key_set.backend_set = @@ -119,6 +120,13 @@ struct KernelKeyParser : ArgsIterator { key_set.dtype = tensor.dtype(); } + void operator()(const paddle::optional x) { + if (x.get_ptr() != nullptr) { + const phi::TensorBase& tensor = *(x.get_ptr()->impl()); + AssignKernelKeySet(tensor); + } + } + // skip other type args, these args don't used in kernel selection template void operator()(const T& x) { diff --git a/paddle/phi/backends/dynload/cusparse.cc b/paddle/phi/backends/dynload/cusparse.cc index a37fbf35a26e8..326645726bbed 100644 --- a/paddle/phi/backends/dynload/cusparse.cc +++ b/paddle/phi/backends/dynload/cusparse.cc @@ -26,12 +26,13 @@ void *cusparse_dso_handle; CUSPARSE_ROUTINE_EACH(DEFINE_WRAP); #endif -#ifdef CUBLAS_BLAS_ROUTINE_EACH_R2 -CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP); -#endif - #ifdef CUSPARSE_ROUTINE_EACH_11020 CUSPARSE_ROUTINE_EACH_11020(DEFINE_WRAP); #endif + +#ifdef CUSPARSE_ROUTINE_EACH_R2 +CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP); +#endif + } // namespace dynload } // namespace phi diff --git a/paddle/phi/backends/xpu/xpu_info.cc b/paddle/phi/backends/xpu/xpu_info.cc index d454fc0734c66..4dba0ab94ff20 100644 --- a/paddle/phi/backends/xpu/xpu_info.cc +++ b/paddle/phi/backends/xpu/xpu_info.cc @@ -140,8 +140,10 @@ std::vector GetXPUSelectedDevices() { void MemcpySyncH2D(void* dst, const void* src, size_t count, - const phi::XPUPlace& dst_place) { + const phi::XPUPlace& dst_place, + const phi::XPUContext& dev_ctx) { XPUDeviceGuard guard(dst_place.device); + dev_ctx.Wait(); PADDLE_ENFORCE_XPU_SUCCESS( xpu_memcpy(dst, src, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE)); } diff --git a/paddle/phi/backends/xpu/xpu_info.h b/paddle/phi/backends/xpu/xpu_info.h index fa7d1b5c18a7d..b1056cdc4b14b 100644 --- a/paddle/phi/backends/xpu/xpu_info.h +++ b/paddle/phi/backends/xpu/xpu_info.h @@ -49,7 +49,8 @@ std::vector GetXPUSelectedDevices(); void MemcpySyncH2D(void *dst, const void *src, size_t count, - const phi::XPUPlace &dst_place); + const phi::XPUPlace &dst_place, + const phi::XPUContext &dev_ctx); void MemcpySyncD2H(void *dst, const void *src, size_t count, diff --git a/paddle/phi/core/compat/arg_map_context.h b/paddle/phi/core/compat/arg_map_context.h 
index 0c6fdcb13912f..f47e8d550e693 100644
--- a/paddle/phi/core/compat/arg_map_context.h
+++ b/paddle/phi/core/compat/arg_map_context.h
@@ -80,9 +80,9 @@ struct KernelSignature {
  KernelSignature& operator=(KernelSignature&& other) noexcept {
    name = other.name;
-    input_names.swap(other.input_names);
-    attr_names.swap(other.attr_names);
-    output_names.swap(other.output_names);
+    input_names = std::move(other.input_names);
+    attr_names = std::move(other.attr_names);
+    output_names = std::move(other.output_names);
    return *this;
  }
};
diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc
index 46c45837a5372..3c030cac2e7c9 100644
--- a/paddle/phi/core/dense_tensor_impl.cc
+++ b/paddle/phi/core/dense_tensor_impl.cc
@@ -371,12 +371,20 @@ dnnl::memory::format_tag DenseTensor::format() const {
}
#endif
+// NOTE: For historical reasons, this interface has a special behavior:
+// it shares the other tensor members, but not lod
DenseTensor& DenseTensor::ShareDataWith(const DenseTensor& src) {
  src.check_memory_size();
-  // Preserve LoD
-  auto lod = meta_.lod;
-  *this = src;
-  meta_.lod = lod;
+  holder_ = src.holder_;
+  meta_.is_scalar = src.meta_.is_scalar;
+  meta_.dims = src.meta_.dims;
+  meta_.dtype = src.meta_.dtype;
+  meta_.layout = src.meta_.layout;
+  meta_.offset = src.meta_.offset;
+#ifdef PADDLE_WITH_MKLDNN
+  format_ = src.format_;
+  mem_desc_ = src.mem_desc_;
+#endif
  return *this;
}
diff --git a/paddle/phi/core/sparse_coo_tensor.cc b/paddle/phi/core/sparse_coo_tensor.cc
index 7d4261ef82972..bf4d601c0b566 100644
--- a/paddle/phi/core/sparse_coo_tensor.cc
+++ b/paddle/phi/core/sparse_coo_tensor.cc
@@ -115,4 +115,12 @@ void SparseCooTensor::SetMember(const DenseTensor& non_zero_indices,
  this->coalesced_ = coalesced;
}
+int32_t SparseCooTensor::sparse_dim() const {
+  return non_zero_indices_.dims()[0];
+}
+
+int32_t SparseCooTensor::dense_dim() const {
+  return dims_.size() - sparse_dim();
+}
+
} // namespace phi
diff --git a/paddle/phi/core/sparse_coo_tensor.h b/paddle/phi/core/sparse_coo_tensor.h
index ec43c5d62179b..c65b5ce57430b 100644
--- a/paddle/phi/core/sparse_coo_tensor.h
+++ b/paddle/phi/core/sparse_coo_tensor.h
@@ -150,6 +150,12 @@ class SparseCooTensor : public TensorBase,
  /// \brief set the dims of original dense tensor
  void set_dims(const DDim& dims) { this->dims_ = dims; }
+  /// \brief get the sparse dim
+  int32_t sparse_dim() const;
+
+  /// \brief get the dense dim
+  int32_t dense_dim() const;
+
 private:
  // save the indices of non zero elements in original dense tensor
  DenseTensor non_zero_indices_;
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index cff14308c7fe9..6d37a31f54562 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -28,6 +28,7 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/strided_slice.h"
#include "paddle/phi/kernels/funcs/unfold_functor.h"
#include "paddle/phi/kernels/funcs/unsqueeze.h"
+#include "paddle/phi/kernels/impl/einsum_impl.h"
namespace phi {
@@ -398,6 +399,47 @@ void EighInferMeta(const MetaTensor& x,
  out_v->set_dims(input_dim);
}
+void EinsumInferMeta(const std::vector& inputs,
+                     const std::string& equation,
+                     MetaTensor* out) {
+  // collect the following information to prepare einsum.
+ LabelMap labelshape(0); + LabelMap labeltype(LabelType::Reduction); + std::vector label2perms(inputs.size(), LabelMap(-1)); + std::vector all_labels; + std::vector broadcast_dims; + std::vector output_dims; + std::vector> ellipsis_dims(2); + + std::vector input_dims; + for (auto& i : inputs) { + input_dims.push_back(i->dims()); + } + std::string right; + ParseEinsumEquation(equation, + input_dims, + &labelshape, + &labeltype, + &all_labels, + &label2perms, + &ellipsis_dims, + &broadcast_dims, + &output_dims, + &right); + + VLOG(3) << "Einsum Infershape: input dims:" + << paddle::string::join_strings(input_dims, "\n"); + VLOG(3) << "Einsum Infershape: equation:" << equation; + VLOG(3) << "Einsum Infershape: all_labels:" + << paddle::string::join_strings(all_labels, ","); + VLOG(3) << "Einsum Infershape: output dims:" + << paddle::string::join_strings(output_dims, ","); + VLOG(3) << "Label Type is : " << label_to_string(all_labels, labeltype); + VLOG(3) << "Label Shape is : " << label_to_string(all_labels, labelshape); + out->set_dims(make_ddim(output_dims)); + out->set_dtype(inputs[0]->dtype()); +} + void ExpandInferMeta(const MetaTensor& x, const IntArray& shape, MetaTensor* out) { @@ -3011,7 +3053,7 @@ void UnStackInferMeta(const MetaTensor& x, } void OneHotRawInferMeta(const MetaTensor& x, - int32_t depth, + const Scalar& depth, DataType dtype, bool allow_out_of_range, MetaTensor* out) { @@ -3021,7 +3063,7 @@ void OneHotRawInferMeta(const MetaTensor& x, 1, phi::errors::InvalidArgument("Rank of Input(X) should be at least 1.")); auto out_dims_vec = phi::vectorize(x_dims); - out_dims_vec.push_back(depth); + out_dims_vec.push_back(depth.to()); auto out_dims = phi::make_ddim(out_dims_vec); out->set_dims(out_dims); out->share_lod(x); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index eef750b852f06..559857bd6ce9b 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -80,6 +80,10 @@ void EighInferMeta(const MetaTensor& x, MetaTensor* out_w, MetaTensor* out_v); +void EinsumInferMeta(const std::vector& inputs, + const std::string& equation, + MetaTensor* out); + void ExpandInferMeta(const MetaTensor& x, const IntArray& shape, MetaTensor* out); @@ -431,7 +435,7 @@ void UnStackInferMeta(const MetaTensor& x, std::vector outs); void OneHotRawInferMeta(const MetaTensor& x, - int32_t depth, + const Scalar& depth, DataType dtype, bool allow_out_of_range, MetaTensor* out); diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index e298b5ff4e718..084843c31cf52 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -82,18 +82,18 @@ void ReluDoubleGradKernel(const Context& dev_ctx, template void TanhDoubleGradKernel(const Context& dev_ctx, const DenseTensor& out, - const DenseTensor& ddx, const DenseTensor& dout, + const DenseTensor& ddx, DenseTensor* dout_new, DenseTensor* ddout); template void TanhTripleGradKernel(const Context& dev_ctx, const DenseTensor& out, - const DenseTensor& ddx, const DenseTensor& dout, - const DenseTensor& d_ddout, + const DenseTensor& ddx, const DenseTensor& d_dout_new, + const DenseTensor& d_ddout, DenseTensor* d_out_new, DenseTensor* d_dout, DenseTensor* d_ddx); diff --git a/paddle/phi/kernels/batch_norm_grad_kernel.h b/paddle/phi/kernels/batch_norm_grad_kernel.h index 73752f015ca3a..2cb3b16a022b1 100644 --- a/paddle/phi/kernels/batch_norm_grad_kernel.h +++ b/paddle/phi/kernels/batch_norm_grad_kernel.h @@ -66,16 +66,16 
@@ void BatchNormGradKernel(const Context& dev_ctx, template void BatchNormDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& x_grad_grad, - const DenseTensor& scale_grad_grad, - const DenseTensor& bias_grad_grad, - const DenseTensor& y_grad, const DenseTensor& x, const DenseTensor& scale, - const DenseTensor& saved_mean, - const DenseTensor& saved_variance, paddle::optional mean, paddle::optional variance, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const DenseTensor& y_grad, + const DenseTensor& x_grad_grad, + const DenseTensor& scale_grad_grad, + const DenseTensor& bias_grad_grad, float momentum, float epsilon, const std::string& data_layout, diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc index ae87886b89bff..bf01c24f4ffa3 100644 --- a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc @@ -341,16 +341,16 @@ void BatchNormGradKernel(const Context& dev_ctx, template void BatchNormDoubleGradKernel(const Context& ctx, - const DenseTensor& x_grad_grad, - const DenseTensor& scale_grad_grad, - const DenseTensor& bias_grad_grad, - const DenseTensor& y_grad, const DenseTensor& x, const DenseTensor& scale, - const DenseTensor& saved_mean, - const DenseTensor& saved_variance, paddle::optional mean, paddle::optional variance, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const DenseTensor& y_grad, + const DenseTensor& x_grad_grad, + const DenseTensor& scale_grad_grad, + const DenseTensor& bias_grad_grad, float momentum, float epsilon, const std::string& data_layout_str, diff --git a/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc index c785eacb9a8bc..b86ead04dbc5f 100644 --- a/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc @@ -38,9 +38,9 @@ void SubtractGradKernel(const Context& dev_ctx, template void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, + const DenseTensor& dout, paddle::optional ddx, paddle::optional ddy, - const DenseTensor& dout, int axis, DenseTensor* ddout) { phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); diff --git a/paddle/phi/kernels/cpu/one_hot_kernel.cc b/paddle/phi/kernels/cpu/one_hot_kernel.cc index 04f7c6a1f606d..fc7979e41d938 100644 --- a/paddle/phi/kernels/cpu/one_hot_kernel.cc +++ b/paddle/phi/kernels/cpu/one_hot_kernel.cc @@ -64,18 +64,19 @@ struct OneHotV2OpFunctor { template void OneHotRawKernel(const Context& dev_ctx, const DenseTensor& x, - int32_t depth, + const Scalar& depth, DataType dtype, bool allow_out_of_range, DenseTensor* out) { + auto depth_v = depth.to(); auto out_dims = out->dims(); if (out_dims[out_dims.size() - 1] == -1) { - out_dims[out_dims.size() - 1] = depth; + out_dims[out_dims.size() - 1] = depth_v; out->Resize(out_dims); } phi::VisitDataType(dtype, - OneHotV2OpFunctor(&x, out, depth, dev_ctx)); + OneHotV2OpFunctor(&x, out, depth_v, dev_ctx)); } } // namespace phi diff --git a/paddle/phi/kernels/elementwise_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc index 4cee24d2f8069..9d608cd86a6f7 100644 --- a/paddle/phi/kernels/elementwise_kernel.cc +++ b/paddle/phi/kernels/elementwise_kernel.cc @@ -103,7 +103,7 @@ PD_REGISTER_KERNEL(elementwise_pow, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(maximum, - GPU, + KPS, ALL_LAYOUT, 
phi::MaximumKernel,
    float,
@@ -113,7 +113,7 @@ PD_REGISTER_KERNEL(maximum,
    phi::dtype::float16,
    phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(minimum,
-    GPU,
+    KPS,
    ALL_LAYOUT,
    phi::MinimumKernel,
    float,
@@ -125,9 +125,9 @@ PD_REGISTER_KERNEL(minimum,
PD_REGISTER_KERNEL(
    modulo, GPU, ALL_LAYOUT, phi::ModuloKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(
-    floor_divide, GPU, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {}
+    floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {}
PD_REGISTER_KERNEL(elementwise_pow,
-    GPU,
+    KPS,
    ALL_LAYOUT,
    phi::ElementwisePowKernel,
    float,
diff --git a/paddle/phi/kernels/elementwise_subtract_grad_kernel.h b/paddle/phi/kernels/elementwise_subtract_grad_kernel.h
index 7be91b4b9f4cd..97df769f4d046 100644
--- a/paddle/phi/kernels/elementwise_subtract_grad_kernel.h
+++ b/paddle/phi/kernels/elementwise_subtract_grad_kernel.h
@@ -30,9 +30,9 @@ void SubtractGradKernel(const Context& dev_ctx,
template
void SubtractDoubleGradKernel(const Context& dev_ctx,
                              const DenseTensor& y,
+                             const DenseTensor& dout,
                              paddle::optional ddx,
                              paddle::optional ddy,
-                             const DenseTensor& dout,
                              int axis,
                              DenseTensor* ddout);
diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h
index 8d9dd65786705..4c2b6ef896e71 100644
--- a/paddle/phi/kernels/funcs/elementwise_functor.h
+++ b/paddle/phi/kernels/funcs/elementwise_functor.h
@@ -18,6 +18,10 @@ limitations under the License. */
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/hostdevice.h"
+#if defined(__xpu__)
+#include
+#include "xpu/kernel/math_xpu2.h" //pow()
+#endif
namespace phi {
namespace funcs {
@@ -573,6 +577,9 @@ struct ElementwisePowFunctor {
      return std::llrint(
          std::pow(static_cast(a), static_cast(b)));
    }
+#endif
+#ifdef PADDLE_WITH_XPU_KP
+    return pow(a, b);
#endif
    return std::pow(a, b);
  }
}
diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h
index 1021b510b26cd..7508d8ee8cdc8 100644
--- a/paddle/phi/kernels/funcs/elementwise_grad_base.h
+++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h
@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
#include "paddle/phi/kernels/funcs/elementwise_utils.h"
@@ -978,7 +979,7 @@ static void ElemwiseGradBroadcast1CUDA(gpuStream_t stream,
  // suppose performance improves as h increases.
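For context on the integral branch of ElementwisePowFunctor shown above: pow is evaluated in floating point, so an exact integer power can come back a hair below the true value, and a plain cast would truncate it; std::llrint rounds to the nearest integer instead. A self-contained sketch of the failure mode it guards against (whether a given libm actually undershoots is platform-dependent, hence the hedged comments):

    #include <cmath>
    #include <iostream>

    int main() {
      // std::pow works in floating point, so an exact power such as 3^2 may
      // come back as 8.999999... on some platforms.
      double raw = std::pow(3.0, 2.0);
      std::cout << "truncated: " << static_cast<long long>(raw) << "\n";  // could print 8
      std::cout << "rounded:   " << std::llrint(raw) << "\n";             // prints 9
      return 0;
    }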
dim3 block_size = dim3(BLOCK_X, BLOCK_Y); dim3 grid_size = dim3((w + BLOCK_X - 1) / BLOCK_X); - auto gplace = phi::GPUPlace(); + auto gplace = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId()); auto *ctx = static_cast( paddle::platform::DeviceContextPool::Instance().Get(gplace)); paddle::platform::LimitGridDim(*ctx, &grid_size); @@ -1003,7 +1004,7 @@ static void ElemwiseGradBroadcast2CUDA(gpuStream_t stream, T *dy) { int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post); dim3 grid_size = dim3(n); - auto gplace = phi::GPUPlace(); + auto gplace = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId()); auto *ctx = static_cast( paddle::platform::DeviceContextPool::Instance().Get(gplace)); paddle::platform::LimitGridDim(*ctx, &grid_size); diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index e15b4cc10d97e..ad3b8579ddf67 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -908,16 +908,16 @@ void BatchNormGradKernel(const Context &dev_ctx, template void BatchNormDoubleGradKernel(const Context &ctx, - const DenseTensor &x_grad_grad, - const DenseTensor &scale_grad_grad, - const DenseTensor &bias_grad_grad, - const DenseTensor &y_grad, const DenseTensor &x, const DenseTensor &scale, - const DenseTensor &saved_mean, - const DenseTensor &saved_variance, paddle::optional mean, paddle::optional variance, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + const DenseTensor &y_grad, + const DenseTensor &x_grad_grad, + const DenseTensor &scale_grad_grad, + const DenseTensor &bias_grad_grad, float momentum, float epsilon, const std::string &data_layout_str, @@ -988,10 +988,9 @@ PD_REGISTER_KERNEL(batch_norm_grad, double, phi::dtype::float16) { if (kernel_key.dtype() == phi::DataType::FLOAT16) { - kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); // x_grad + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // scale_grad + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad } } @@ -1003,10 +1002,9 @@ PD_REGISTER_KERNEL(batch_norm_grad_raw, double, phi::dtype::float16) { if (kernel_key.dtype() == phi::DataType::FLOAT16) { - kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); // x_grad + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // scale_grad + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad } } @@ -1019,7 +1017,6 @@ PD_REGISTER_KERNEL(batch_norm_grad_grad, phi::BatchNormDoubleGradKernel, float, double) {} - #else PD_REGISTER_KERNEL(batch_norm_grad_grad, GPU, diff --git a/paddle/phi/kernels/gpu/cumsum_kernel.cu b/paddle/phi/kernels/gpu/cumsum_kernel.cu index e04f2b5f87658..13975ddd3ef89 100644 --- a/paddle/phi/kernels/gpu/cumsum_kernel.cu +++ b/paddle/phi/kernels/gpu/cumsum_kernel.cu @@ -39,14 +39,12 @@ __device__ void BlockReverse( int tx = threadIdx.x; int offset = tx; - int in_index = src_base + offset; - if (offset >= valid_item) { - sh_mem[offset] = 0; - } else { - int sh_mem_index = BLOCK_SIZE - offset 
- 1; - T data = idata[in_index]; - sh_mem[sh_mem_index] = data; + T src_data = 0; + int src_offset = BLOCK_SIZE - offset - 1; + if (src_offset < valid_item) { + src_data = idata[src_base + src_offset]; } + sh_mem[offset] = src_data; __syncthreads(); int out_index = dst_base - offset; diff --git a/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu index 20f3b73e4094f..017616df2782c 100644 --- a/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu @@ -46,9 +46,9 @@ void SubtractGradKernel(const Context& dev_ctx, template void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& y, + const DenseTensor& dout, paddle::optional ddx, paddle::optional ddy, - const DenseTensor& dout, int axis, DenseTensor* ddout) { phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); diff --git a/paddle/phi/kernels/gpu/one_hot_kernel.cu b/paddle/phi/kernels/gpu/one_hot_kernel.cu index c5884884231a8..2ae9e9333ecb5 100644 --- a/paddle/phi/kernels/gpu/one_hot_kernel.cu +++ b/paddle/phi/kernels/gpu/one_hot_kernel.cu @@ -73,18 +73,19 @@ struct OneHotV2OpCUDAFunctor { template void OneHotRawKernel(const Context& dev_ctx, const DenseTensor& x, - int32_t depth, + const Scalar& depth, DataType dtype, bool allow_out_of_range, DenseTensor* out) { + auto depth_v = depth.to(); auto out_dims = out->dims(); if (out_dims[out_dims.size() - 1] == -1) { - out_dims[out_dims.size() - 1] = depth; + out_dims[out_dims.size() - 1] = depth_v; out->Resize(out_dims); } phi::VisitDataType( - dtype, OneHotV2OpCUDAFunctor(&x, out, depth, dev_ctx)); + dtype, OneHotV2OpCUDAFunctor(&x, out, depth_v, dev_ctx)); } } // namespace phi diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h index bf9b7cdf559d3..2f35acc095085 100644 --- a/paddle/phi/kernels/impl/activation_grad_impl.h +++ b/paddle/phi/kernels/impl/activation_grad_impl.h @@ -152,8 +152,8 @@ void LeakyReluDoubleGradKernel(const Context& dev_ctx, template void TanhDoubleGradKernel(const Context& dev_ctx, const DenseTensor& out, - const DenseTensor& ddx, const DenseTensor& dout, + const DenseTensor& ddx, DenseTensor* dout_new, DenseTensor* ddout) { if (dout_new) { @@ -171,10 +171,10 @@ void TanhDoubleGradKernel(const Context& dev_ctx, template void TanhTripleGradKernel(const Context& dev_ctx, const DenseTensor& out, - const DenseTensor& ddx, const DenseTensor& dout, - const DenseTensor& d_ddout, + const DenseTensor& ddx, const DenseTensor& d_dout_new, + const DenseTensor& d_ddout, DenseTensor* d_out_new, DenseTensor* d_dout, DenseTensor* d_ddx) { diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h index d4be007a07fc0..73940a45cbde2 100644 --- a/paddle/phi/kernels/impl/einsum_impl.h +++ b/paddle/phi/kernels/impl/einsum_impl.h @@ -13,7 +13,6 @@ // limitations under the License. #pragma once -#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/reduce_sum_kernel.h" @@ -21,6 +20,7 @@ #include "paddle/utils/string/string_helper.h" namespace phi { + // check the validation of the Einsum equation. // 1. the label must between 'a' - 'z'. // 2. the dim of the same label must be same. 
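The two validation rules above feed a per-label bookkeeping pass: each letter of the equation is classified by where it occurs, and that classification decides whether an axis survives to the output or is summed away. A toy classification for the matrix-multiply equation "ij,jk->ik", written in the spirit of the LabelType machinery here but not the actual Paddle parser:

    #include <iostream>
    #include <set>
    #include <string>

    int main() {
      // "ij,jk->ik": i and k survive to the output (free axes); j appears in
      // both operands but not in the output, so it is contracted away.
      std::string lhs = "ij", rhs = "jk", out = "ik";
      std::set<char> labels(lhs.begin(), lhs.end());
      labels.insert(rhs.begin(), rhs.end());
      for (char c : labels) {
        bool in_both = lhs.find(c) != std::string::npos &&
                       rhs.find(c) != std::string::npos;
        bool in_out = out.find(c) != std::string::npos;
        std::cout << c << ": "
                  << (in_out ? "free" : (in_both ? "contraction" : "reduction"))
                  << "\n";
      }
      return 0;
    }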
@@ -302,45 +302,6 @@ inline static void ParseEinsumEquation( } } -inline void EinsumInferShape(const std::vector& inputs, - const std::string& equation, - MetaTensor* out) { - // collect the following informations to prepare einsum. - LabelMap labelshape(0); - LabelMap labeltype(LabelType::Reduction); - std::vector label2perms(inputs.size(), LabelMap(-1)); - std::vector all_labels; - std::vector broadcast_dims; - std::vector output_dims; - std::vector> ellipsis_dims(2); - - std::vector input_dims; - for (auto& i : inputs) { - input_dims.push_back(i->dims()); - } - std::string right; - ParseEinsumEquation(equation, - input_dims, - &labelshape, - &labeltype, - &all_labels, - &label2perms, - &ellipsis_dims, - &broadcast_dims, - &output_dims, - &right); - - VLOG(3) << "Einsum Infershape: input dims:" - << paddle::string::join_strings(input_dims, "\n"); - VLOG(3) << "Einsum Infershape: equation:" << equation; - VLOG(3) << "Einsum Infershape: all_labels:" - << paddle::string::join_strings(all_labels, ","); - VLOG(3) << "Einsum Infershape: output dims:" - << paddle::string::join_strings(output_dims, ","); - VLOG(3) << "Label Type is : " << label_to_string(all_labels, labeltype); - VLOG(3) << "Label Shape is : " << label_to_string(all_labels, labelshape); -} - template std::vector GetLabelIndexByType(const std::vector& all_labels, const LabelMap& type, @@ -394,6 +355,13 @@ DenseTensor PerformReduction(const Context& dev_ctx, return Sum(dev_ctx, tensor, indices, tensor.dtype(), true); } +inline bool is_no_need_transpose(const std::vector& axis) { + for (size_t i = 0; i < axis.size(); ++i) { + if (i != static_cast(axis[i])) return false; + } + return true; +} + template DenseTensor PerformTranspose(const Context& dev_ctx, const DenseTensor& tensor, @@ -401,12 +369,6 @@ DenseTensor PerformTranspose(const Context& dev_ctx, const std::vector& all_labels, const std::vector& ellipsis, const LabelMap& label2type) { - auto is_no_need_transpose = [](std::vector& axis) { - for (size_t i = 0; i < axis.size(); ++i) { - if (i != size_t(axis[i])) return false; - } - return true; - }; auto axis = GetLabelIndexByType( all_labels, label2type, label2perm, ellipsis, LabelType::ALL_TYPE); VLOG(5) << "PerformTranspose: " << paddle::string::join_strings(axis, ","); @@ -496,9 +458,9 @@ void TransposeToOutput(const Context& dev_ctx, axis.push_back(it - all_labels.begin() + offset); } } + if (is_no_need_transpose(axis)) return output->ShareBufferWith(to_trans); VLOG(5) << "call TransposeToOutput: with axis: " << paddle::string::join_strings(axis, ","); - if (axis.size() == 0) return output->ShareBufferWith(to_trans); return TransposeKernel(dev_ctx, to_trans, axis, output); } diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index aba4a5f5fbd43..fa1f15672b903 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -360,6 +360,14 @@ struct MulGradDX { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; } }; +// avoid [-Wint-in-bool-context] warning +template <> +struct MulGradDX { + HOSTDEVICE bool operator()(bool x, bool y, bool out, bool dout) const { + return dout && y; + } +}; + template struct MulGradDX> { HOSTDEVICE phi::dtype::complex operator()( @@ -383,6 +391,14 @@ struct MulGradDY { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } }; +// avoid [-Wint-in-bool-context] warning +template <> +struct MulGradDY { + 
HOSTDEVICE bool operator()(bool x, bool y, bool out, bool dout) const { + return dout && x; + } +}; + template struct MulGradDY> { HOSTDEVICE phi::dtype::complex operator()( diff --git a/paddle/phi/kernels/kps/elementwise_add_kernel.cu b/paddle/phi/kernels/kps/elementwise_add_kernel.cu index b5532c614314f..8f7d45771d9d0 100644 --- a/paddle/phi/kernels/kps/elementwise_add_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_add_kernel.cu @@ -36,6 +36,7 @@ void AddKernel(const Context& dev_ctx, } // namespace phi #ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(add, KPS, ALL_LAYOUT, phi::AddKernel, float) {} PD_REGISTER_KERNEL(add_raw, KPS, ALL_LAYOUT, phi::AddRawKernel, float) {} #else diff --git a/paddle/phi/kernels/kps/elementwise_divide_kernel.cu b/paddle/phi/kernels/kps/elementwise_divide_kernel.cu index 852babe29dbf7..827c478de9775 100644 --- a/paddle/phi/kernels/kps/elementwise_divide_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_divide_kernel.cu @@ -37,6 +37,7 @@ void DivideKernel(const Context& dev_ctx, } // namespace phi #ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(divide, KPS, ALL_LAYOUT, phi::DivideKernel, float) {} PD_REGISTER_KERNEL(divide_raw, KPS, ALL_LAYOUT, phi::DivideRawKernel, float) {} #else diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu index 5ccd3b1a48210..821fda52ab102 100644 --- a/paddle/phi/kernels/kps/elementwise_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -24,24 +24,65 @@ namespace phi { // Create the definition of Maximum DEFINE_CUDA_ELEMENTWISE_OP(Maximum) +template +void MaximumKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + MaximumRawKernel(dev_ctx, x, y, axis, out); +} // Create the definition of Minimum DEFINE_CUDA_ELEMENTWISE_OP(Minimum) +template +void MinimumKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + MinimumRawKernel(dev_ctx, x, y, axis, out); +} // Create the definition of Modulo DEFINE_CUDA_ELEMENTWISE_OP(Modulo) // Create the definition of FloorDivide DEFINE_CUDA_ELEMENTWISE_OP(FloorDivide) +template +void FloorDivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + FloorDivideRawKernel(dev_ctx, x, y, axis, out); +} // Create the definition of Pow DEFINE_CUDA_ELEMENTWISE_OP(ElementwisePow) +template +void ElementwisePowKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + ElementwisePowRawKernel(dev_ctx, x, y, axis, out); +} } // namespace phi #ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(maximum, KPS, ALL_LAYOUT, phi::MaximumKernel, float) {} PD_REGISTER_KERNEL(maximum_raw, KPS, ALL_LAYOUT, phi::MaximumRawKernel, float) { } +PD_REGISTER_KERNEL(minimum, KPS, ALL_LAYOUT, phi::MinimumKernel, float) {} PD_REGISTER_KERNEL(minimum_raw, KPS, ALL_LAYOUT, phi::MinimumRawKernel, float) { } +PD_REGISTER_KERNEL(floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int) { +} PD_REGISTER_KERNEL( floor_divide_raw, KPS, ALL_LAYOUT, phi::FloorDivideRawKernel, int) {} +PD_REGISTER_KERNEL( + elementwise_pow, KPS, ALL_LAYOUT, phi::ElementwisePowKernel, float) {} +PD_REGISTER_KERNEL( + elementwise_pow_raw, KPS, ALL_LAYOUT, phi::ElementwisePowRawKernel, float) { +} #else using float16 = phi::dtype::float16; diff --git a/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu 
b/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu index 8bede0198c2fa..99408ff214268 100644 --- a/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu @@ -37,6 +37,7 @@ void MultiplyKernel(const Context& dev_ctx, } // namespace phi #ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(multiply, KPS, ALL_LAYOUT, phi::MultiplyKernel, float) {} PD_REGISTER_KERNEL( multiply_raw, KPS, ALL_LAYOUT, phi::MultiplyRawKernel, float) {} #else diff --git a/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu b/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu index 757dedb99c931..b99f687b59f4e 100644 --- a/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu @@ -37,6 +37,7 @@ void SubtractKernel(const Context& dev_ctx, } // namespace phi #ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(subtract, KPS, ALL_LAYOUT, phi::SubtractKernel, float) {} PD_REGISTER_KERNEL( subtract_raw, KPS, ALL_LAYOUT, phi::SubtractRawKernel, float) {} #else diff --git a/paddle/phi/kernels/kps/logical_kernel.cu b/paddle/phi/kernels/kps/logical_kernel.cu index b732d371ad1ef..815675953953d 100644 --- a/paddle/phi/kernels/kps/logical_kernel.cu +++ b/paddle/phi/kernels/kps/logical_kernel.cu @@ -65,9 +65,9 @@ void LogicalNotKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_XPU_KP PD_REGISTER_KERNEL(logical_and, KPS, ALL_LAYOUT, phi::LogicalAndKernel, int) {} -PD_REGISTER_KERNEL(logical_Or, KPS, ALL_LAYOUT, phi::LogicalOrKernel, int) {} -PD_REGISTER_KERNEL(logical_Not, KPS, ALL_LAYOUT, phi::LogicalNotKernel, int) {} -PD_REGISTER_KERNEL(logical_Xor, KPS, ALL_LAYOUT, phi::LogicalXorKernel, int) {} +PD_REGISTER_KERNEL(logical_or, KPS, ALL_LAYOUT, phi::LogicalOrKernel, int) {} +PD_REGISTER_KERNEL(logical_not, KPS, ALL_LAYOUT, phi::LogicalNotKernel, int) {} +PD_REGISTER_KERNEL(logical_xor, KPS, ALL_LAYOUT, phi::LogicalXorKernel, int) {} #else #define REGISTER_LOGICAL_CUDA_KERNEL(logical_and, func_type) \ PD_REGISTER_KERNEL(logical_and, \ diff --git a/paddle/phi/kernels/one_hot_kernel.cc b/paddle/phi/kernels/one_hot_kernel.cc index 633f48cbb62ac..755e06752509a 100644 --- a/paddle/phi/kernels/one_hot_kernel.cc +++ b/paddle/phi/kernels/one_hot_kernel.cc @@ -24,9 +24,8 @@ void OneHotKernel(const Context& dev_ctx, const DenseTensor& x, const Scalar& num_classes_s, DenseTensor* out) { - int num_classes = num_classes_s.to(); OneHotRawKernel( - dev_ctx, x, num_classes, phi::DataType::FLOAT32, false, out); + dev_ctx, x, num_classes_s, phi::DataType::FLOAT32, false, out); } } // namespace phi diff --git a/paddle/phi/kernels/one_hot_kernel.h b/paddle/phi/kernels/one_hot_kernel.h index 9f89609ea6336..79af88473b278 100644 --- a/paddle/phi/kernels/one_hot_kernel.h +++ b/paddle/phi/kernels/one_hot_kernel.h @@ -28,7 +28,7 @@ void OneHotKernel(const Context& dev_ctx, template void OneHotRawKernel(const Context& dev_ctx, const DenseTensor& x, - int32_t depth, + const Scalar& depth, DataType dtype, bool allow_out_of_range, DenseTensor* out); diff --git a/paddle/phi/kernels/primitive/functor_primitives_xpu2.h b/paddle/phi/kernels/primitive/functor_primitives_xpu2.h old mode 100755 new mode 100644 index b01e0474f2d02..fdcbb5ec9cc8d --- a/paddle/phi/kernels/primitive/functor_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/functor_primitives_xpu2.h @@ -124,7 +124,8 @@ struct MaxFunctor { */ template struct AddFunctor { - inline T initial() { return static_cast(0.0f); } + inline T initial() { /*return static_cast(0.0f);*/ + } __device__ T 
operator()(const T a, const T b) const { return b + a; } }; @@ -134,7 +135,8 @@ struct AddFunctor { */ template struct MulFunctor { - inline T initial() { return static_cast(1.0f); } + inline T initial() { /*return static_cast(1.0f);*/ + } __device__ T operator()(const T& a, const T& b) const { return b * a; } }; @@ -144,7 +146,8 @@ struct MulFunctor { */ template struct LogicalOrFunctor { - inline T initial() { return static_cast(false); } + inline T initial() { /*return static_cast(false);*/ + } __device__ T operator()(const T& a, const T& b) const { return b || a; } }; diff --git a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc index 0ec8b808ba838..0e5714b174361 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc @@ -39,7 +39,7 @@ void SparseMaskCPUKernel(const CPUContext& dev_ctx, phi::errors::InvalidArgument("the input x and mask must have the shape")); const DenseTensor& indices = mask.non_zero_indices(); const DenseTensor& values = mask.non_zero_elements(); - int sparse_dim = indices.dims().size(); + const int sparse_dim = mask.sparse_dim(); DenseTensor out_indices = phi::EmptyLike(dev_ctx, indices); DenseTensor out_values = phi::EmptyLike(dev_ctx, values); @@ -95,7 +95,7 @@ void SparseMaskHelperCPUKernel(const CPUContext& dev_ctx, 2, phi::errors::InvalidArgument("the mask_indices must be 2-D tensor")); - const int64_t sparse_dim = x.non_zero_indices().dims()[0]; + const int32_t sparse_dim = x.sparse_dim(); std::vector sparse_offsets(sparse_dim), x_indexs(x.nnz()), mask_indexs(mask_indices.dims()[1]); diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc index 78b6354f44f9e..71a0095395552 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc @@ -50,7 +50,7 @@ void MaxPoolGradCPUKernel(const CPUContext& dev_ctx, DenseTensor x_grad_values = phi::EmptyLike(dev_ctx, x.non_zero_elements()); x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true); T* x_grad_ptr = x_grad_values.data(); - memset(x_grad_ptr, 0, sizeof(T) * x_grad->numel()); + memset(x_grad_ptr, 0, sizeof(T) * x_grad_values.numel()); phi::Copy(dev_ctx, x.non_zero_indices(), dev_ctx.GetPlace(), diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc index 685aa6b30bdc1..69ac0417f763d 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc @@ -254,7 +254,7 @@ void SparseCooToDenseKernel(const Context& dev_ctx, if (indices_dims.size() == 1) { sparse_dim = 1; } - const int64_t dense_dim = values.dims().size() - 1; + const int64_t dense_dim = x.dense_dim(); const T* x_data = values.data(); *out = phi::Empty( diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index 4253845956ea7..81c63c48ebff2 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -42,7 +42,7 @@ __global__ void MaskKernel(const T* x_ptr, int64_t col_i = i - out_i * cols; int64_t index = 0; for (int j = 0; j < sparse_dim; j++) { - index += indices_ptr[j * non_zero_num + i] * sparse_offsets[j]; + index += indices_ptr[j * non_zero_num + out_i] * sparse_offsets[j]; } out_values_ptr[out_i * cols + col_i] = 
x_ptr[index * cols + col_i]; } @@ -60,16 +60,13 @@ void SparseMaskGPUKernel(const GPUContext& dev_ctx, phi::errors::InvalidArgument("the input x and mask must have the shape")); const DenseTensor& indices = mask.non_zero_indices(); const DenseTensor& values = mask.non_zero_elements(); - int sparse_dim = indices.dims().size(); + const int sparse_dim = mask.sparse_dim(); DenseTensor sparse_offsets = phi::Empty( dev_ctx, DenseTensorMeta(DataType::INT64, {sparse_dim}, DataLayout::NCHW)); std::vector h_sparse_offsets(sparse_dim); - int64_t offset = 1; - for (int i = sparse_dim - 1; i >= 0; i--) { - h_sparse_offsets[i] = offset; - offset *= dims[i]; - } + phi::funcs::sparse::CalcOffsetsPerDim( + dims, sparse_dim, h_sparse_offsets.data()); phi::backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), &h_sparse_offsets[0], @@ -151,7 +148,7 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, 2, phi::errors::InvalidArgument("the mask_indices must be 2-D tensor")); - const int64_t sparse_dim = x.non_zero_indices().dims()[0]; + const int32_t sparse_dim = x.sparse_dim(); auto indices_dtype = paddle::experimental::CppTypeToDataType::Type(); std::vector sparse_offsets(sparse_dim); diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu index bd862a44afeeb..c22e67eef6712 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu @@ -64,7 +64,7 @@ void MaxPoolGradGPUKernel(const GPUContext& dev_ctx, int rulebook_len = rulebook.dims()[1]; const IntT* rulebook_ptr = rulebook.data(); std::vector offsets(kernel_size + 1), counter(kernel_size, 0), - h_counter(kernel_size); + h_counter(rulebook_len, 0); phi::backends::gpu::GpuMemcpyAsync(&h_counter[0], rulebook_ptr, rulebook_len * sizeof(IntT), diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc index 5900b49946623..157eaa279debb 100644 --- a/paddle/phi/ops/compat/activation_sig.cc +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -121,13 +121,13 @@ KernelSignature ReluDoubleGradOpArgumentMapping( KernelSignature TanhDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( - "tanh_double_grad", {"Out", "DDX", "DOut"}, {}, {"DOutNew", "DDOut"}); + "tanh_double_grad", {"Out", "DOut", "DDX"}, {}, {"DOutNew", "DDOut"}); } KernelSignature TanhTripleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("tanh_triple_grad", - {"Out", "DDX", "DOut", "D_DDOut", "D_DOut_New"}, + {"Out", "DOut", "DDX", "D_DOut_New", "D_DDOut"}, {}, {"D_OutNew", "D_DOut", "D_DDx"}); } diff --git a/paddle/phi/ops/compat/batch_norm_sig.cc b/paddle/phi/ops/compat/batch_norm_sig.cc index 14affe60b9d55..1c6b63d70c705 100644 --- a/paddle/phi/ops/compat/batch_norm_sig.cc +++ b/paddle/phi/ops/compat/batch_norm_sig.cc @@ -82,16 +82,16 @@ KernelSignature BatchNormGradOpArgumentMapping( KernelSignature BatchNormGradGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("batch_norm_grad_grad", - {"DDX", - "DDScale", - "DDBias", - "DY", - "X", + {"X", "Scale", + "Mean", + "Variance", "SavedMean", "SavedVariance", - "Mean", - "Variance"}, + "DY", + "DDX", + "DDScale", + "DDBias"}, {"momentum", "epsilon", "data_layout", diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc index 19110eb0e0ab8..13a5a6fd4a449 100644 --- a/paddle/phi/ops/compat/elementwise_sig.cc +++ 
b/paddle/phi/ops/compat/elementwise_sig.cc @@ -133,7 +133,7 @@ KernelSignature ElementwiseSubGradOpArgumentMapping( KernelSignature ElementwiseSubDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( - "subtract_double_grad", {"Y", "DDX", "DDY", "DOut"}, {"axis"}, {"DDOut"}); + "subtract_double_grad", {"Y", "DOut", "DDX", "DDY"}, {"axis"}, {"DDOut"}); } KernelSignature ElementwiseDivGradOpArgumentMapping( diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 9c5eef6292581..0e1d0660322bd 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -752,6 +752,8 @@ function run_linux_cpu_test() { pip install hypothesis pip install ${PADDLE_ROOT}/build/python/dist/*whl cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/op_test.py ${PADDLE_ROOT}/build/python + cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/testsuite.py ${PADDLE_ROOT}/build/python + cp -r ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/white_list ${PADDLE_ROOT}/build/python ut_total_startTime_s=`date +%s` if [ ${WITH_TESTING:-ON} == "ON" ] ; then cat < $tmpfile1 2>&1 & + gpu_memory_pid=$! env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I 0,,1 -R "($testcases)" --timeout 500 --output-on-failure -V -j 1 > $tmpfile + kill ${gpu_memory_pid} + cat $tmpfile1 | tr -d ' MiB' | awk 'BEGIN {max = 0} {if(NR>1){if ($1 > max) max=$1}} END {print "MAX_GPU_MEMORY_USE=", max}' >> $tmpfile + cat $tmpfile1 | tr -d ' MiB' | awk 'BEGIN {sum = 0} {if(NR>1){sum = sum + $1 }} END {print "AVG_GPU_MEMORY_USE=", sum / (NR-2)}' >> $tmpfile + rm -rf $tmpfile1 set +m } @@ -1907,8 +1916,11 @@ set -x python ${PADDLE_ROOT}/tools/pyCov_multithreading.py ${PADDLE_ROOT} wait; - #generate ut map + #generate ut file map python ${PADDLE_ROOT}/tools/get_ut_file_map.py 'get_ut_map' ${PADDLE_ROOT} + + #generate ut mem map + python ${PADDLE_ROOT}/tools/get_ut_mem_map.py $tmp_dir } function get_failedUts_precise_map_file { @@ -2404,6 +2416,8 @@ function parallel_test() { pip install hypothesis pip install ${PADDLE_ROOT}/build/python/dist/*whl cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/op_test.py ${PADDLE_ROOT}/build/python + cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/testsuite.py ${PADDLE_ROOT}/build/python + cp -r ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/white_list ${PADDLE_ROOT}/build/python ut_total_startTime_s=`date +%s` if [ "$WITH_CINN" == "ON" ];then parallel_test_base_cinn @@ -2839,7 +2853,7 @@ function test_op_benchmark() { # The PR will pass quickly when get approval from specific person. 
# Xreki 12538138, luotao1 6836917, ZzSean 32410583 set +x - approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000) + approval_line=$(curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000) if [ "${approval_line}" != "" ]; then APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32410583 12538138 6836917) echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" @@ -2891,11 +2905,11 @@ function summary_check_problems() { function reuse_so_cache() { get_html="https://api.github.com/repos/PaddlePaddle/Paddle" - curl -X GET ${get_html}/commits -H "authorization: token ${GITHUB_API_TOKEN}" >tmp.txt + curl -X GET ${get_html}/commits >tmp.txt merge_commit=`grep "sha" tmp.txt| awk -F \" 'NR==1{print $(NF-1)}'| sed 's# ##g'` - curl -X GET ${get_html}/commits/${merge_commit} -H "authorization: token ${GITHUB_API_TOKEN}" >tmp.txt + curl -X GET ${get_html}/commits/${merge_commit} >tmp.txt merge_pr=`grep -oP -m 1 '(#[0-9]*)' tmp.txt| sed 's/#//g'` - curl -X GET ${get_html}/pulls/${merge_pr}/commits -H "authorization: token ${GITHUB_API_TOKEN}" >tmp.txt + curl -X GET ${get_html}/pulls/${merge_pr}/commits >tmp.txt pr_commit=`grep "sha" tmp.txt |tail -3|head -1|awk -F : '{print $NF}'|sed 's#"##g'|sed 's#,##g'| sed 's# ##g'` set +e wget -q https://xly-devops.bj.bcebos.com/PR/Paddle/${merge_pr}/${pr_commit}/workspace/Paddle/build/proto_so.tar.gz @@ -2989,7 +3003,7 @@ function check_coverage_build() { set +x if [ ${diff_coverage_build_size} -gt 3 ]; then - approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` + approval_line=`curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 29832297 6836917 43953930` echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "FALSE" ]; then diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index 2c977e923b5b1..f5cfd14e6b84c 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -1,5 +1,11 @@ # for paddle test case if(WITH_TESTING) - cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init device_context memory gtest gflags proto_desc phi_utils) + set(paddle_gtest_main_deps device_context gtest gflags init memory phi_utils proto_desc) + + if (WITH_GPU OR WITH_ROCM) + list(APPEND paddle_gtest_main_deps gpu_info) + endif() + + cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS ${paddle_gtest_main_deps}) endif() diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index bb919f0e9110c..16c683e39fa8c 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -20,6 +20,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/init.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +DECLARE_bool(enable_gpu_memory_usage_log); +#endif + int main(int argc, char** argv) { paddle::memory::allocation::UseAllocatorStrategyGFlag(); testing::InitGoogleTest(&argc, argv); @@ -81,6 +85,13 @@ int main(int argc, char** argv) { VLOG(1) << "gtest undefok_string:" << undefok_string; } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (strstr(undefok_str, "enable_gpu_memory_usage_log")) { + VLOG(1) << "Set FLAGS_enable_gpu_memory_usage_log to true"; + FLAGS_enable_gpu_memory_usage_log = true; + } +#endif + int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); ::GFLAGS_NAMESPACE::ParseCommandLineFlags( diff --git a/paddle/utils/variant.h b/paddle/utils/variant.h index 7b11ae1bee88c..50bdc4287e21a 100644 --- a/paddle/utils/variant.h +++ b/paddle/utils/variant.h @@ -13,6 +13,10 @@ #pragma once +// gcc >= 9 has a bug that creates a false positive warning. +// Reference: +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92145 +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89381 #if defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 9 #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-copy" diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index e303ce1216822..408a1fdaafeef 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -648,6 +648,9 @@ def complete_forward_annotation(self, serial_main_program): self._dist_context.copy_dist_attr_from_graph_to_program() self._dist_context.clear_dist_info_for_graph() + # NOTE:[HighOrderGrad] update vars and ops distributed attribute in high order gradient + self.complete_high_order_grad_annotation(serial_main_program) + # Do the validation check and amend some completion self._dist_context.amend_dist_attr_for_program() @@ -655,6 +658,164 @@ def complete_forward_annotation(self, serial_main_program): return serial_main_program + def complete_high_order_grad_annotation(self, serial_main_program): + """ + NOTE: + [HighOrderGrad] Complete the annotation of vars and ops only for high order gradient. + This function is temporary to support high order gradient, and will be removed in the future. 
+ """ + + def _is_grad_var_name(name): + if "@GRAD" in name: + return True + return False + + def _get_op_by_id(ops, id): + for op in ops: + if op.desc.id() == id: + return op + return None + + ops = list(serial_main_program.global_block().ops) + vars = serial_main_program.global_block().vars + dist_op_context = self._dist_context.dist_op_context + grad_var_to_var = dist_op_context.grad_var_to_var + + appended_grad_times = 0 + for idx in range(0, len(ops)): + op = ops[idx] + if int(op.attr('op_role')) == int( + core.op_proto_and_checker_maker.OpRole.Forward): + continue + + if int(op.attr('op_role')) == int( + core.op_proto_and_checker_maker.OpRole.Backward) and int( + ops[idx - 1].attr('op_role')) == int( + core.op_proto_and_checker_maker.OpRole.Forward): + appended_grad_times += 1 + + # complete the annotation of grad op (xxx_grad op or sum op) + # xxx_grad op will have a corresponding forward op in grad_op_id_to_op_id + grad_op = ops[idx] + if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id: + # TODO support the case where one forward op corresponding to multiple xxx_grad op + forward_op = _get_op_by_id( + ops, dist_op_context.grad_op_id_to_op_id[grad_op.desc.id()]) + assert forward_op is not None + + fwd_op_dist_attr = self._dist_context.get_op_dist_attr_for_program( + forward_op) + fwd_op_process_mesh = fwd_op_dist_attr.process_mesh + grad_op_dist_attr = OperatorDistributedAttribute() + grad_op_dist_attr.process_mesh = fwd_op_process_mesh + + for input_name in grad_op.input_arg_names: + if input_name not in forward_op.input_arg_names and input_name not in forward_op.output_arg_names: + if input_name in grad_var_to_var[appended_grad_times]: + fwd_name = grad_var_to_var[appended_grad_times][ + input_name] + ref_dims_mapping = fwd_op_dist_attr.get_output_dims_mapping( + fwd_name) + else: + input_var = vars[input_name] + ref_dims_mapping = self._dist_context.get_tensor_dist_attr_for_program( + input_var).dims_mapping + else: + if fwd_op_dist_attr.get_input_dims_mapping(input_name): + ref_dims_mapping = fwd_op_dist_attr.get_input_dims_mapping( + input_name) + else: + ref_dims_mapping = fwd_op_dist_attr.get_output_dims_mapping( + input_name) + assert ref_dims_mapping is not None, "[{}] 's dims mapping is NONE".format( + input_name) + grad_op_dist_attr.set_input_dims_mapping(input_name, + ref_dims_mapping) + + for output_name in grad_op.output_arg_names: + assert output_name in grad_var_to_var[appended_grad_times] + fwd_name = grad_var_to_var[appended_grad_times][output_name] + ref_dims_mapping = fwd_op_dist_attr.get_input_dims_mapping( + fwd_name) + # var + output_var = vars[output_name] + tensor_dist_attr = TensorDistributedAttribute() + tensor_dist_attr.dims_mapping = ref_dims_mapping + tensor_dist_attr.process_mesh = fwd_op_process_mesh + self._dist_context.set_tensor_dist_attr_for_program( + output_var, tensor_dist_attr) + # op + grad_op_dist_attr.set_output_dims_mapping(output_name, + ref_dims_mapping) + + self._dist_context.set_op_dist_attr_for_program( + grad_op, grad_op_dist_attr) + + # grad ops that have not a corresponding mapping in grad_op_id_to_op_id + else: + + if grad_op.type == 'sum': + assert all(map(_is_grad_var_name, grad_op.input_arg_names)) + output_name = grad_op.output_arg_names[0] + assert output_name in grad_var_to_var[appended_grad_times], \ + "sum op's output '{}' has no corresponding var".format( + output_name) + ref_fwd_var_name = grad_var_to_var[appended_grad_times][ + output_name] + ref_fwd_var = vars[ref_fwd_var_name] + ref_fwd_dist_attr = 
self._dist_context.get_tensor_dist_attr_for_program( + ref_fwd_var) + ref_fwd_dims_mapping = ref_fwd_dist_attr.dims_mapping + ref_fwd_process_mesh = ref_fwd_dist_attr.process_mesh + # output + tensor_dist_attr = TensorDistributedAttribute() + tensor_dist_attr.dims_mapping = ref_fwd_dims_mapping + tensor_dist_attr.process_mesh = ref_fwd_process_mesh + output_var = vars[output_name] + self._dist_context.set_tensor_dist_attr_for_program( + output_var, tensor_dist_attr) + # op + grad_op_dist_attr = OperatorDistributedAttribute() + grad_op_dist_attr.process_mesh = ref_fwd_process_mesh + for var_name in grad_op.input_arg_names: + grad_op_dist_attr.set_input_dims_mapping( + var_name, ref_fwd_dims_mapping) + grad_op_dist_attr.set_output_dims_mapping( + output_name, ref_fwd_dims_mapping) + + elif grad_op.type == 'fill_zeros_like': + ref_var_name = grad_op.input_arg_names[0] + ref_var = vars[ref_var_name] + ref_dist_attr = self._dist_context.get_tensor_dist_attr_for_program( + ref_var) + ref_dims_mapping = ref_dist_attr.dims_mapping + ref_process_mesh = ref_dist_attr.process_mesh + # output + tensor_dist_attr = TensorDistributedAttribute() + tensor_dist_attr.dims_mapping = ref_dims_mapping + tensor_dist_attr.process_mesh = ref_process_mesh + output_var_name = grad_op.output_arg_names[0] + output_var = vars[output_var_name] + self._dist_context.set_tensor_dist_attr_for_program( + output_var, tensor_dist_attr) + # op + grad_op_dist_attr = OperatorDistributedAttribute() + grad_op_dist_attr.process_mesh = ref_process_mesh + grad_op_dist_attr.set_input_dims_mapping(ref_var_name, + ref_dims_mapping) + grad_op_dist_attr.set_output_dims_mapping(output_var_name, + ref_dims_mapping) + + elif grad_op.type in ['shape', 'fill_constant']: + continue + + else: + raise ValueError("got unexpect op [{}]".format( + str(grad_op.type))) + + self._dist_context.set_op_dist_attr_for_program( + grad_op, grad_op_dist_attr) + def complete_backward_annotation(self, serial_main_program): """Complete the annotation of vars and ops in the backward phase for parallel program.""" @@ -689,6 +850,8 @@ def _get_op_by_id(ops, id): ops = list(serial_main_program.global_block().ops) vars = serial_main_program.global_block().vars dist_op_context = self._dist_context.dist_op_context + grad_var_to_var = dist_op_context.grad_var_to_var[len( + dist_op_context.grad_var_to_var)] for idx in range(first_backward_op_idx, len(ops)): @@ -765,102 +928,111 @@ def _get_op_by_id(ops, id): grad_op, grad_op_dist_attr) continue - # op dist attr - forward_op_dist_attr = self._dist_context.get_op_dist_attr_for_program( + fwd_op_dist_attr = self._dist_context.get_op_dist_attr_for_program( forward_op) - forward_op_process_mesh = forward_op_dist_attr.process_mesh + fwd_op_process_mesh = fwd_op_dist_attr.process_mesh grad_op_dist_attr = OperatorDistributedAttribute() - grad_op_dist_attr.process_mesh = forward_op_process_mesh + grad_op_dist_attr.process_mesh = fwd_op_process_mesh - # var for input_name in grad_op.input_arg_names: - input_var = vars[input_name] - ref_dims_mapping = None - if "@GRAD" in input_name: - forward_name = _get_forward_varname_from_grad_varname( - input_name) - ref_dims_mapping = forward_op_dist_attr.get_output_dims_mapping( - forward_name) + if input_name not in forward_op.input_arg_names and input_name not in forward_op.output_arg_names: + if input_name in grad_var_to_var: + fwd_name = grad_var_to_var[input_name] + ref_dims_mapping = fwd_op_dist_attr.get_output_dims_mapping( + fwd_name) + else: + input_var = vars[input_name] + 
ref_dims_mapping = self._dist_context.get_tensor_dist_attr_for_program( + input_var).dims_mapping else: - if forward_op_dist_attr.get_input_dims_mapping( - input_name): - ref_dims_mapping = forward_op_dist_attr.get_input_dims_mapping( + if fwd_op_dist_attr.get_input_dims_mapping(input_name): + ref_dims_mapping = fwd_op_dist_attr.get_input_dims_mapping( input_name) else: - ref_dims_mapping = forward_op_dist_attr.get_output_dims_mapping( + ref_dims_mapping = fwd_op_dist_attr.get_output_dims_mapping( input_name) - assert ref_dims_mapping is not None, "[{}] 's dims mapping is NONE".format( - input_var.name) + input_name) grad_op_dist_attr.set_input_dims_mapping(input_name, ref_dims_mapping) - for output_name in grad_op.desc.output_names(): - assert len(grad_op.desc.output(output_name)) in [0, 1] - if _is_grad_var_name(output_name): - input_name = _get_forward_varname_from_grad_varname( - output_name) - else: - assert grad_op.type in [ - "cast", "c_identity", "c_allreduce_sum" - ] - input_name = "X" - assert input_name in forward_op.desc.input_names( - ), "var [{}] in op [{}]'s output but could not find [{}] in its forward op".format( - output_name, grad_op.type, input_name) - if len(grad_op.desc.output(output_name)) == 1: - # tensor dist attr - output_var = vars[grad_op.desc.output(output_name)[0]] - forward_name = _get_forward_varname_from_grad_varname( - output_var.name) - ref_dims_mapping = forward_op_dist_attr.get_input_dims_mapping( - forward_name) - - output_var_dist_attr = TensorDistributedAttribute() - output_var_dist_attr.dims_mapping = ref_dims_mapping - output_var_dist_attr.process_mesh = forward_op_process_mesh - self._dist_context.set_tensor_dist_attr_for_program( - output_var, output_var_dist_attr) - - grad_op_dist_attr.set_output_dims_mapping( - output_var.name, ref_dims_mapping) + for output_name in grad_op.output_arg_names: + assert output_name in grad_var_to_var + fwd_name = grad_var_to_var[output_name] + ref_dims_mapping = fwd_op_dist_attr.get_input_dims_mapping( + fwd_name) + # var + output_var = vars[output_name] + tensor_dist_attr = TensorDistributedAttribute() + tensor_dist_attr.dims_mapping = ref_dims_mapping + tensor_dist_attr.process_mesh = fwd_op_process_mesh + self._dist_context.set_tensor_dist_attr_for_program( + output_var, tensor_dist_attr) + # op + grad_op_dist_attr.set_output_dims_mapping(output_name, + ref_dims_mapping) self._dist_context.set_op_dist_attr_for_program( grad_op, grad_op_dist_attr) - # only sum op for merge mutiple version grad has no a corresponding mapping in grad_op_id_to_op_id + # grad ops that have not a corresponding mapping in grad_op_id_to_op_id else: - assert grad_op.type == "sum", "got unexpect op [{}]".format( - str(grad_op.type)) - assert all(map(_is_grad_var_name, grad_op.input_arg_names)) - assert len(grad_op.output_arg_names) == 1 - - ref_forward_var_name = _get_forward_varname_from_grad_varname( - grad_op.output_arg_names[0]) - forward_var = vars[ref_forward_var_name] - ref_forward_var_dims_mapping = self._dist_context.get_tensor_dist_attr_for_program( - forward_var).dims_mapping - ref_forward_var_process_mesh = self._dist_context.get_tensor_dist_attr_for_program( - forward_var).process_mesh + if grad_op.type == 'sum': + assert all(map(_is_grad_var_name, grad_op.input_arg_names)) + output_name = grad_op.output_arg_names[0] + assert output_name in grad_var_to_var, "sum op's output '{}' has no corresponding var".format( + output_name) + ref_fwd_var_name = grad_var_to_var[output_name] + ref_fwd_var = vars[ref_fwd_var_name] + 
ref_fwd_dist_attr = self._dist_context.get_tensor_dist_attr_for_program( + ref_fwd_var) + ref_fwd_dims_mapping = ref_fwd_dist_attr.dims_mapping + ref_fwd_process_mesh = ref_fwd_dist_attr.process_mesh + + # output + tensor_dist_attr = TensorDistributedAttribute() + tensor_dist_attr.dims_mapping = ref_fwd_dims_mapping + tensor_dist_attr.process_mesh = ref_fwd_process_mesh + output_var = vars[output_name] + self._dist_context.set_tensor_dist_attr_for_program( + output_var, tensor_dist_attr) - # output - tensor_dist_attr = TensorDistributedAttribute() - tensor_dist_attr.dims_mapping = ref_forward_var_dims_mapping - tensor_dist_attr.process_mesh = ref_forward_var_process_mesh - self._dist_context.set_tensor_dist_attr_for_program( - vars[grad_op.output_arg_names[0]], tensor_dist_attr) + # op + grad_op_dist_attr = OperatorDistributedAttribute() + grad_op_dist_attr.process_mesh = ref_fwd_process_mesh + for var_name in grad_op.input_arg_names: + grad_op_dist_attr.set_input_dims_mapping( + var_name, ref_fwd_dims_mapping) + grad_op_dist_attr.set_output_dims_mapping( + output_name, ref_fwd_dims_mapping) + + elif grad_op.type == 'fill_zeros_like': + ref_var_name = grad_op.input_arg_names[0] + ref_var = vars[ref_var_name] + ref_dist_attr = self._dist_context.get_tensor_dist_attr_for_program( + ref_var) + ref_dims_mapping = ref_dist_attr.dims_mapping + ref_process_mesh = ref_dist_attr.process_mesh + # output + tensor_dist_attr = TensorDistributedAttribute() + tensor_dist_attr.dims_mapping = ref_dims_mapping + tensor_dist_attr.process_mesh = ref_process_mesh + output_var_name = grad_op.output_arg_names[0] + output_var = vars[output_var_name] + self._dist_context.set_tensor_dist_attr_for_program( + output_var, tensor_dist_attr) + # op + grad_op_dist_attr = OperatorDistributedAttribute() + grad_op_dist_attr.process_mesh = ref_process_mesh + grad_op_dist_attr.set_input_dims_mapping(ref_var_name, + ref_dims_mapping) + grad_op_dist_attr.set_output_dims_mapping(output_var_name, + ref_dims_mapping) + + else: + raise ValueError("got unexpected op [{}]".format( + str(grad_op.type))) - # op - grad_op_dist_attr = OperatorDistributedAttribute() - grad_op_dist_attr.process_mesh = ref_forward_var_process_mesh - for var_name in grad_op.input_arg_names: - assert _get_forward_varname_from_grad_varname( - var_name) == ref_forward_var_name - grad_op_dist_attr.set_input_dims_mapping( - var_name, ref_forward_var_dims_mapping) - - grad_op_dist_attr.set_output_dims_mapping( - grad_op.output_arg_names[0], ref_forward_var_dims_mapping) self._dist_context.set_op_dist_attr_for_program( grad_op, grad_op_dist_attr) diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index 2807c46540ab1..7e245358d4bcc 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -120,6 +120,11 @@ def dist_main_programs(self): def dist_startup_programs(self): return self._dist_startup_programs + @property + def is_annotation(self): + return len(self._dist_tensors_for_program) or len( + self._dist_ops_for_program) + def add_process_mesh(self, process_mesh): assert isinstance(process_mesh, ProcessMesh), \ 'The type of dim_mapping must be ProcessMesh.'
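[Editor's sketch, not part of the patch] The `grad_var_to_var` mapping threaded through this patch (added to `DistributedOperatorContext` in the next hunk) is a two-level dict: the outer key is the `appending_grad_times` counter, bumped once per forward-to-backward transition, and each inner dict maps a gradient var name back to its forward var name. A minimal illustration in Python, with all concrete variable names hypothetical:

from collections import defaultdict

grad_var_to_var = defaultdict(dict)

# First-order backward pass (appending_grad_times == 1)
grad_var_to_var[1]["x@GRAD"] = "x"
# A duplicate renamed by _addup_repetitive_outputs_ still resolves to "x"
grad_var_to_var[1]["x@GRAD@RENAME@block0@0"] = "x"

# Completion code looks up the forward var for a given grad var:
assert grad_var_to_var[1]["x@GRAD@RENAME@block0@0"] == "x"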
@@ -577,6 +582,7 @@ def __init__(self): self._cur_src_op = None self._cur_dist_attr = None self.grad_op_id_to_op_id = {} + self.grad_var_to_var = defaultdict(dict) self._work_block = None self.already_init_sync_vars = set() self.varname_mapping = None diff --git a/python/paddle/distributed/auto_parallel/dist_loader.py b/python/paddle/distributed/auto_parallel/dist_loader.py index 9449b52952cd8..cc08bc1a901b7 100644 --- a/python/paddle/distributed/auto_parallel/dist_loader.py +++ b/python/paddle/distributed/auto_parallel/dist_loader.py @@ -16,6 +16,7 @@ import numpy as np import paddle from .utils import to_list +from paddle.fluid.layers.utils import flatten from paddle.io import DataLoader, DistributedBatchSampler @@ -56,16 +57,17 @@ def __init__(self, data_parallel_world_size=None, data_parallel_rank=None, drop_last=False, - inputs=[]): + sample_generator=True): self.feed_list = feed_list self.places = places self.steps_per_epoch = steps_per_epoch + self._sample_generator = sample_generator + super(NonIterableGeneratorLoader, self).__init__( dataset, batch_size, epochs, data_parallel_world_size, data_parallel_rank, drop_last) self._inner_dataloader = self._create_inner_dataloader() self._steps = self._infer_steps() - self._inputs = inputs def __iter__(self): self._cur_step = 0 @@ -91,27 +93,28 @@ def _infer_steps(self): return steps_per_epoch def _create_inner_dataloader(self): - def data_generator(): + def sample_data_generator(): batch_data = None for step, data in enumerate(self.dataset): - if not isinstance(data, list): - data = to_list(data) - - if self.batch_size == 1: - yield data + data = flatten(data) + if batch_data is None: + batch_data = [[] for i in range(len(data))] + for idx in range(len(data)): + batch_data[idx].append(data[idx]) + if (step + 1) % self.batch_size == 0: + yield batch_data batch_data = None - else: - if batch_data is None: - batch_data = [[] for i in range(len(data))] - - for idx in range(len(data)): - batch_data[idx].append(data[idx]) - if (step + 1) % self.batch_size == 0: - yield batch_data - batch_data = None + def batch_data_generator(): + for data in self.dataset: + data = flatten(data) + yield data dataloader = paddle.fluid.io.DataLoader.from_generator( feed_list=self.feed_list, capacity=70, iterable=False) - dataloader.set_batch_generator(data_generator, self.places) + if self._sample_generator: + dataloader.set_batch_generator(sample_data_generator, self.places) + else: + dataloader.set_batch_generator(batch_data_generator, self.places) + return dataloader diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index a5fec789dfb37..2cd841ef80979 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -17,18 +17,22 @@ from collections import defaultdict import paddle +import paddle.distributed.auto_parallel as auto + from paddle import fluid from paddle.io import Dataset from paddle.metric import Metric from paddle.static import InputSpec from paddle.fluid import core from paddle.fluid import program_guard +from paddle.fluid.layers.utils import flatten +from paddle.fluid.executor import global_scope from paddle.fluid.backward import append_backward from paddle.fluid.framework import Operator from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.dygraph.parallel import ParallelEnv -from paddle.distributed.passes import new_pass, PassContext from paddle.distributed.utils import get_logger 
+from paddle.distributed.passes import new_pass, PassContext from .mapper import mapping from .cluster import Cluster @@ -61,6 +65,12 @@ def __init__(self, self.strategy = strategy self._executor = None + self._cur_rank = paddle.distributed.get_rank() + self._nranks = paddle.distributed.get_world_size() + self._saver = DistributedSaver() + self._logger = get_logger(logging.INFO) + + self._default_strategy = None self._orig_main_prog = fluid.default_main_program() self._orig_startup_prog = fluid.default_startup_program() self._orig_dist_context = get_default_distributed_context() @@ -70,9 +80,6 @@ def __init__(self, self._dist_startup_progs = defaultdict(dict) # dist startup programs self._dist_contexts = {} self._pass_contexts = {} - self._cur_rank = paddle.distributed.get_rank() - self._logger = get_logger(logging.INFO) - self._saver = DistributedSaver() self._feed_vars = {} self._fetch_vars = {} @@ -86,13 +93,11 @@ def prepare(self, # TODO: check loss type self._loss = loss self._metrics = to_list(metrics) - for m in ['train', 'predict']: - self.mode = m - self._build(m) # build forward program - self._plan(m) # completion & planner - self._parallel(m, all_ranks) # parallel - self._initialize(m) # init comm and startup program - self.mode = mode + self._mode = mode + self._build(mode) # build forward program + self._plan(mode) # completion & planner + self._parallel(mode, all_ranks) # parallel + self._initialize(mode) # init comm and startup program def _build(self, mode): serial_main_prog = self._serial_main_progs.get(mode, None) @@ -112,10 +117,16 @@ def _build(self, mode): if mode != "predict" and self._loss: losses = to_list(self._loss(*(outputs + labels))) + default_ctx = get_default_distributed_context() + if not default_ctx.is_annotation or self._default_strategy: + inputs = [self._set_data_parallel(var) for var in inputs] + labels = [self._set_data_parallel(var) for var in labels] + + # print(serial_main_prog) self._feed_vars[mode] = {"inputs": inputs, "labels": labels} self._fetch_vars[mode] = { - "outputs": outputs, + "outputs": flatten(outputs), "loss": losses, "metrics": metrics } @@ -128,6 +139,12 @@ def _build(self, mode): self._pass_contexts[mode] = PassContext() def _plan(self, mode): + + # NOTE: [HighOrderGrad]. There are grad ops in forward phase, and it needs + # dependency of backward-forward ops in forward completion. + default_ctx = get_default_distributed_context() + self._dist_contexts[mode]._dist_op_context = default_ctx.dist_op_context + # Complete the distributed annotation serial_main_prog = self._serial_main_progs[mode] self._completer = Completer(self._dist_contexts[mode]) @@ -147,13 +164,14 @@ def _parallel(self, mode, all_ranks=False): self._parallel_program(mode, rank) def _initialize(self, mode): - # Traverse different rank programs and traverse each op of them, - # instantiate communication by process_mapping. - all_process_groups = get_all_process_groups() - for process_group in all_process_groups: - if self._cur_rank not in process_group.ranks: - continue - process_group.instantiate() + if self._nranks > 1: + # Traverse different rank programs and traverse each op of them, + # instantiate communication by process_mapping.
+ all_process_groups = get_all_process_groups() + for process_group in all_process_groups: + if self._cur_rank not in process_group.ranks: + continue + process_group.instantiate() # initialize self._place = _get_device() @@ -161,8 +179,16 @@ def _initialize(self, mode): self._place = fluid.CUDAPlace(ParallelEnv().dev_id) if self._executor is None: self._executor = paddle.static.Executor(self._place) - dist_startup_prog = self._dist_startup_progs[mode][self._cur_rank] - self._executor.run(dist_startup_prog) + uninitialized = [] + dist_startup_prog = self._dist_startup_progs[mode][self._cur_rank] + for var in dist_startup_prog.list_vars(): + scope_var = global_scope().find_var(var.name) + if scope_var and scope_var.get_tensor()._is_initialized(): + continue + uninitialized.append(var) + if uninitialized: + prune_startup_prog = dist_startup_prog._prune(uninitialized) + self._executor.run(prune_startup_prog) def _parallel_program(self, mode, rank): serial_main_program = self._serial_main_progs[mode] @@ -246,12 +272,13 @@ def _apply_pre_optimization(self, main_program, startup_program, loss, if config["use_pure_fp16"]: config["base_opt"] = self._optimizer auto_parallel_fp16_pass = new_pass("auto_parallel_fp16", config) - auto_parallel_fp16_pass.apply( - [main_program], [startup_program], self._pass_context) + auto_parallel_fp16_pass.apply([main_program], + [startup_program], + self._pass_contexts[self.mode]) else: auto_parallel_amp_pass = new_pass("auto_parallel_amp", config) auto_parallel_amp_pass.apply([main_program], [startup_program], - self._pass_context) + self._pass_contexts[self.mode]) # apply recompute pass if self.strategy.recompute: @@ -288,18 +315,26 @@ def _apply_post_optimization(self, main_program, startup_program, rank, [main_program], [startup_program], self._pass_contexts[self.mode]) - def fit(self, train_data, batch_size=1, epochs=1, steps_per_epoch=None): + def fit(self, + train_data, + batch_size=1, + epochs=1, + steps_per_epoch=None, + use_program_cache=False, + return_numpy=True, + sample_generator=True): # TODO: callbacks # TODO: evaluate after training self.mode = 'train' - assert isinstance(train_data, Dataset) - train_dataloader = self._create_dataloader(train_data, batch_size, - epochs, steps_per_epoch) + assert self.mode in self._dist_main_progs, "train model is not ready, please call `engine.prepare(mode='train')` first." + train_dataloader = self._create_dataloader( + train_data, batch_size, epochs, steps_per_epoch, sample_generator) outputs = [] for epoch in range(epochs): for step, data in enumerate(train_dataloader): - logs, loss = self._train_step(data) + logs, loss = self._train_step(data, use_program_cache, + return_numpy) outputs.append(loss) train_logs = { "train_" + name: val @@ -308,14 +343,35 @@ def fit(self, train_data, batch_size=1, epochs=1, steps_per_epoch=None): self._logger.info(train_logs) return outputs + def evaluate(self, + eval_data, + batch_size=1, + use_program_cache=False, + return_numpy=True, + sample_generator=True): + self.mode = 'eval' + assert self.mode in self._dist_main_progs, "eval model is not ready, please call `engine.prepare(mode='eval')` first." 
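# [Editor's sketch, not part of the patch] The pruned-startup logic in
# _initialize above boils down to: collect the vars whose tensors are not
# yet initialized in the scope, then run only their init ops, so preparing
# a second mode does not re-initialize shared parameters. `scope` stands in
# for paddle's global_scope(); `startup_prog` is any startup Program.
def collect_uninitialized(startup_prog, scope):
    uninitialized = []
    for var in startup_prog.list_vars():
        scope_var = scope.find_var(var.name)
        if scope_var and scope_var.get_tensor()._is_initialized():
            continue  # already materialized; skip its init op
        uninitialized.append(var)
    return uninitialized  # pass to startup_prog._prune(uninitialized)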
+ eval_dataloader = self._create_dataloader( + eval_data, batch_size, sample_generator=sample_generator) + + outputs = [] + for step, data in enumerate(eval_dataloader): + logs, outs = self._eval_step(data, use_program_cache, return_numpy) + outputs.append(outs) + predict_logs = {"eval_" + name: val for name, val in logs.items()} + self._logger.info(predict_logs) + return outputs + def predict(self, test_data, batch_size=1, use_program_cache=False, - return_numpy=True): + return_numpy=True, + sample_generator=True): self.mode = 'predict' - # TODO: need check dataset - test_dataloader = self._create_dataloader(test_data, batch_size) + assert self.mode in self._dist_main_progs, "predict model is not ready, please call `engine.prepare(mode='predict')` first." + test_dataloader = self._create_dataloader( + test_data, batch_size, sample_generator=sample_generator) outputs = [] for step, data in enumerate(test_dataloader): @@ -329,19 +385,39 @@ def predict(self, self._logger.info(predict_logs) return outputs - def _train_step(self, data): + def _train_step(self, data, use_program_cache=False, return_numpy=True): logs = {} dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank] fetch_var = self._fetch_vars[self.mode]["loss"][0] if fetch_var.name not in dist_main_prog.global_block().vars: - loss = self._executor.run(dist_main_prog) + loss = self._executor.run(dist_main_prog, + use_program_cache=use_program_cache) logs["loss"] = None else: loss = self._executor.run(dist_main_prog, - fetch_list=to_list(fetch_var)) + fetch_list=to_list(fetch_var), + use_program_cache=use_program_cache, + return_numpy=return_numpy) logs["loss"] = loss return logs, loss + def _eval_step(self, data, use_program_cache=False, return_numpy=True): + logs = {} + dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank] + fetch_var = self._fetch_vars[self.mode]["loss"][0] + + if fetch_var.name not in dist_main_prog.global_block().vars: + outs = self._executor.run(dist_main_prog, + use_program_cache=use_program_cache) + logs["loss"] = outs + else: + outs = self._executor.run(dist_main_prog, + fetch_list=fetch_var, + use_program_cache=use_program_cache, + return_numpy=return_numpy) + logs["loss"] = outs + return logs, outs + def _predict_step(self, data, use_program_cache=False, return_numpy=True): logs = {} dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank] @@ -366,7 +442,8 @@ def _create_dataloader(self, dataset, batch_size, epochs=1, - steps_per_epoch=None): + steps_per_epoch=None, + sample_generator=True): feed_list = self._feed_vars[self.mode]["inputs"] + self._feed_vars[ self.mode]["labels"] dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank] @@ -376,9 +453,12 @@ def _create_dataloader(self, serial_main_prog = self._serial_main_progs[self.mode] serial_main_block = serial_main_prog.global_block() op_size = len(dist_main_block.ops) + if dist_main_block.ops[0].type == 'create_py_reader': + op_size -= 3 + for _ in range(3): + dist_main_block._remove_op(0, sync=False) places = paddle.static.cuda_places() with fluid.program_guard(dist_main_prog, dist_startup_prog): - inputs = self._feed_vars[self.mode]["inputs"] dataloader = NonIterableGeneratorLoader( dataset, feed_list, @@ -386,7 +466,7 @@ def _create_dataloader(self, batch_size, epochs, steps_per_epoch, - inputs=inputs) + sample_generator=sample_generator) new_op_size = len(dist_main_block.ops) for _ in range(new_op_size - 1, op_size - 1, -1): op = dist_main_block.ops[new_op_size - 1] @@ -396,7 +476,7 @@ def 
_create_dataloader(self, dist_main_block, new_op_desc, type=new_op_desc.type()) dist_main_block.ops.insert(0, new_op) for in_name in new_op.input_arg_names: - if in_name == "lod_tensor_blocking_queue_0": + if "lod_tensor_blocking_queue" in in_name: continue if in_name not in dist_main_block.vars: in_var = serial_main_block._var_recursive(in_name) @@ -424,6 +504,27 @@ def _validate_spec(self, specs): .format(i, spec)) return specs + def _set_data_parallel(self, var): + if self._nranks == 1: + self._default_strategy = 'serial' + auto.shard_tensor( + var, + dist_attr={ + "process_mesh": [0], + "dims_mapping": [-1 for _ in range(len(var.shape))] + }) + else: + self._default_strategy = 'dp' + auto.shard_tensor( + var, + dist_attr={ + "process_mesh": list(range(self._nranks)), + "dims_mapping": + [0] + [-1 for _ in range(len(var.shape) - 1)] + }) + + return var + def save(self, path, training=True, mode=None): if not mode: mode = self.mode @@ -459,3 +560,35 @@ def load(self, path, strict=True, load_optimizer=True, mode=None): dist_context = self._dist_contexts[mode] self._saver.load(path, dist_main_prog, dist_context, strict, load_optimizer) + + @property + def mode(self): + return self._mode + + @mode.setter + def mode(self, mode): + self._mode = mode + + @property + def metrics(self): + return self._metrics + + @property + def main_program(self): + return self._dist_main_progs[self.mode][self._cur_rank] + + @property + def startup_program(self): + return self._dist_startup_progs[self.mode][self._cur_rank] + + @property + def dist_context(self): + return self._dist_contexts[self.mode] + + @property + def serial_main_program(self): + return self._serial_main_progs[self.mode] + + @property + def serial_startup_program(self): + return self._serial_startup_progs[self.mode] diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py index 9fb200f4d2db9..4795050d15dcc 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_default.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -53,6 +53,10 @@ def __init__(self, name): def is_input_compatible(self, dist_op): op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr + input_names = op_desc.input_names() + xshape_arg_names = [] + if "XShape" in input_names: + xshape_arg_names = op_desc.input("XShape") for arg_name in op_desc.input_arg_names(): serial_tensor = dist_op.get_serial_input(arg_name) dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) @@ -63,10 +67,18 @@ def is_input_compatible(self, dist_op): # continue # if len(dims_mapping) < 1: # continue - if len(dims_mapping) > 1: - for mapping in dims_mapping[1:]: - if mapping != -1: - return False + if arg_name not in xshape_arg_names: + if len(dims_mapping) > 1: + for mapping in dims_mapping[1:]: + if mapping != -1: + return False + else: + if dims_mapping[0] != -1: + return False + if len(dims_mapping) > 2: + for mapping in dims_mapping[2:]: + if mapping != -1: + return False return True def is_output_compatible(self, dist_op): @@ -105,17 +117,31 @@ def is_auto_compatible(self, dist_op): op_dist_attr = dist_op.dist_attr batch_dim_mappings = [] # Check input compatibility + input_names = op_desc.input_names() + xshape_arg_names = [] + if "XShape" in input_names: + xshape_arg_names = op_desc.input("XShape") for arg_name in op_desc.input_arg_names(): serial_tensor = dist_op.get_serial_input(arg_name) if serial_tensor.is_parameter: continue dims_mapping = 
op_dist_attr.get_input_dims_mapping(arg_name) - if len(dims_mapping) > 1: - for mapping in dims_mapping[1:]: - if mapping != -1: - return False - if len(dims_mapping) >= 1: - batch_dim_mappings.append(dims_mapping[0]) + if arg_name not in xshape_arg_names: + if len(dims_mapping) > 1: + for mapping in dims_mapping[1:]: + if mapping != -1: + return False + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) + else: + if dims_mapping[0] != -1: + return False + if len(dims_mapping) > 2: + for mapping in dims_mapping[2:]: + if mapping != -1: + return False + if len(dims_mapping) >= 2: + batch_dim_mappings.append(dims_mapping[1]) # Check output compatibility output_names = op_desc.output_names() @@ -160,24 +186,39 @@ def update_dims_mapping(self, dist_op): or op_desc.type() == "slice" \ or op_desc.type() == "while": return False + + input_names = op_desc.input_names() + input_xshape_arg_names = [] + if "XShape" in input_names: + input_xshape_arg_names = op_desc.input("XShape") + output_names = op_desc.output_names() - xshape_arg_names = [] + output_xshape_arg_names = [] if "XShape" in output_names: - xshape_arg_names = op_desc.output("XShape") + output_xshape_arg_names = op_desc.output("XShape") + batch_dim_mappings = [] for arg_name in op_desc.input_arg_names(): serial_tensor = dist_op.get_serial_input(arg_name) if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if len(dims_mapping) >= 1: - batch_dim_mappings.append(dims_mapping[0]) + if arg_name not in input_xshape_arg_names: + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) + else: + batch_dim_mappings.append(dims_mapping[1]) for arg_name in op_desc.output_arg_names(): + if op_desc.type() == "fill_zeros_like": + input_tensor = dist_op.get_serial_input(op_desc.input_arg_names( + )[0]) + if input_tensor.is_parameter: + continue serial_tensor = dist_op.get_serial_output(arg_name) if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) - if arg_name not in xshape_arg_names: + if arg_name not in output_xshape_arg_names: if len(dims_mapping) >= 1: batch_dim_mappings.append(dims_mapping[0]) else: @@ -194,16 +235,27 @@ def update_dims_mapping(self, dist_op): if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if len(dims_mapping - ) >= 1 and compatible_dim_mapping != dims_mapping[0]: - dims_mapping[0] = compatible_dim_mapping - changed = True + if arg_name not in input_xshape_arg_names: + if len(dims_mapping) >= 1 and \ + compatible_dim_mapping != dims_mapping[0]: + dims_mapping[0] = compatible_dim_mapping + changed = True + else: + if len(dims_mapping) >= 2 and \ + compatible_dim_mapping != dims_mapping[1]: + dims_mapping[1] = compatible_dim_mapping + changed = True for arg_name in op_desc.output_arg_names(): + if op_desc.type() == "fill_zeros_like": + input_tensor = dist_op.get_serial_input(op_desc.input_arg_names( + )[0]) + if input_tensor.is_parameter: + continue serial_tensor = dist_op.get_serial_output(arg_name) if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) - if arg_name not in xshape_arg_names: + if arg_name not in output_xshape_arg_names: if len(dims_mapping ) >= 1 and compatible_dim_mapping != dims_mapping[0]: dims_mapping[0] = compatible_dim_mapping @@ -371,30 +423,14 @@ def backward(ctx, *args, **kwargs): if need_gradient_allreduce: allreduce_vars = [] - for input_name in 
backward_op.desc.input_names(): - for varname in backward_op.desc.input(input_name): - if "@GRAD" not in varname and is_parameter_related( - varname, main_block): - # NOTE: When amp and recompute pass are effective at the same time, - # if a parameter is casted and recomputed, the 'parameter@GARD' can not - # be found in the grad_op's output. - if "subprog_" in varname: - varname = varname[:varname.index(".subprog_")] - - assert len( - backward_op.desc.input(input_name) - ) == 1, "parameter input to grad op should be length 1, but got [{}]".format( - backward_op.desc.input(input_name)) - - assert varname + "@GRAD" in backward_op.desc.output_arg_names( - ), "parameter's grad [{}] not found in the grad op's output".format( - varname + "@GRAD") - assert len( - backward_op.desc.output(input_name + "@GRAD") - ) == 1, "parameter grad of grad op should be length 1, but got [{}]".format( - backward_op.desc.output(input_name + "@GRAD")) - allreduce_vars.append( - backward_op.desc.output(input_name + "@GRAD")[0]) + for output_name in backward_op.desc.output_names(): + for varname in backward_op.desc.output(output_name): + if varname in kwargs["grad_var_to_var"]: + fwd_name = kwargs["grad_var_to_var"][varname] + if fwd_name not in main_block.vars: + continue + if is_parameter_related(fwd_name, main_block): + allreduce_vars.append(varname) if len(allreduce_vars) > 0: diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py index c03ef9c06d80f..fe091cd08b72b 100644 --- a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -25,7 +25,7 @@ from .dist_attribute import OperatorDistributedAttribute from .process_group import new_process_group from .utils import set_dist_op_desc_original_id -from .utils import print_program_with_dist_attr, is_forward_op, is_backward_op +from .utils import print_program_with_dist_attr, is_forward_op, is_backward_op, is_loss_op from .operators.common import BACKWARD_ONLY_DIST_OPS __varname_not_in_block__ = ["lod_tensor_blocking_queue_0"] @@ -198,15 +198,29 @@ def partition_block(self, ref_block, target_block): dist_op_context = self._dist_context.dist_op_context serial_ops = ref_block.ops + last_fwd_op_idx = -1 + for idx, op in enumerate(ref_block.ops): + if is_loss_op(op): + last_fwd_op_idx = idx + break + + if last_fwd_op_idx == -1: + last_fwd_op_idx = len(ref_block.ops) + # init mapping forward_op_id2forward_op = {} for idx in range(len(serial_ops)): - if is_forward_op(serial_ops[idx]): + if idx <= last_fwd_op_idx: forward_op_id2forward_op[serial_ops[idx].desc.id( )] = serial_ops[idx] + appended_grad_times = 0 # partiiton - for op in serial_ops: + for idx, op in enumerate(serial_ops): + + if is_backward_op(op) and (is_forward_op(serial_ops[idx - 1]) or + is_loss_op(serial_ops[idx - 1])): + appended_grad_times += 1 # partititon input variables for serial_input_varname in op.desc.input_arg_names(): @@ -244,8 +258,11 @@ def partition_block(self, ref_block, target_block): kinputs, koutputs = dist_op_context.prepare_context(op) dist_op_backward_impl = _get_dist_op_backward_implement( op, self._dist_context, forward_op_id2forward_op) - dist_op_backward_impl.backward(self._dist_context, **kinputs, - **koutputs) + grad_var_to_var = self._dist_context.dist_op_context.grad_var_to_var[ + appended_grad_times] + dist_op_backward_impl.backward( + self._dist_context, **kinputs, **koutputs, + **{"grad_var_to_var": grad_var_to_var}) else: raise 
NotImplementedError( "partitioner only support forward op and backward op, but got {}". diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index fc85cd04d4010..9c40034498dbc 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -996,69 +996,87 @@ def set_grad_var_shape(program, dist_context): block = program.global_block() vars = block.vars - for op in block.ops: + appended_grad_times = 0 + grad_var_to_var = dist_context.dist_op_context.grad_var_to_var + + for idx, op in enumerate(block.ops): + + if int(op.attr('op_role')) != int(OpRole.Backward): + continue + + if int(block.ops[idx-1].attr('op_role')) == int(OpRole.Forward) or \ + int(block.ops[idx-1].attr('op_role')) == 257: + appended_grad_times += 1 if op.type in ["check_finite_and_unscale", "update_loss_scaling"]: break - if op.type in ["sum", "concat"]: + if op.type in ["sum", "concat", "shape"]: continue - if int(op.attr('op_role')) == int(OpRole.Backward): - op_dist_attr = dist_context.get_op_dist_attr_for_program(op) - assert op_dist_attr is not None - for var_name in op.output_arg_names: - if "@GRAD" not in var_name: - continue + op_dist_attr = dist_context.get_op_dist_attr_for_program(op) + assert op_dist_attr is not None + + for var_name in op.output_arg_names: + + if "@GRAD" not in var_name: + continue + if var_name in grad_var_to_var[appended_grad_times]: + forward_var_name = grad_var_to_var[appended_grad_times][ + var_name] + else: forward_var_name = var_name[:var_name.find("@GRAD")] - if op.type in [ - "c_allreduce_sum", "c_identity", "scale", "cast" - ]: - forward_var_name = op.input_arg_names[0] - elif op.type == "matmul_v2_grad": - forward_var_name = None - for output_name in op.output_names: - if var_name in op.output(output_name): - assert "@GRAD" in output_name - input_name = output_name[:output_name.find("@GRAD")] - assert len(op.input(input_name)) == 1 - forward_var_name = op.input(input_name)[0] - assert forward_var_name is not None - - need_set_shape_list = [ - "reshape2_grad", "softmax_with_cross_entropy_grad", - "transpose2_grad", "softmax_grad", "cross_entropy_grad2", - "dropout_grad" - ] - forward_list = [ - "reshape2", "softmax_with_cross_entropy", "transpose2", - "softmax", "cross_entropy2", "dropout" - ] - if op.type in need_set_shape_list: - for forward_op in block.ops: - assert int(forward_op.attr('op_role')) != int( - OpRole.Backward) - idx = need_set_shape_list.index(op.type) - forward_op_name = forward_list[idx] - if forward_op.type == forward_op_name and forward_var_name in forward_op.input_arg_names: - op_dist_attr = dist_context.get_op_dist_attr_for_program( - forward_op) - break - - forward_input_dist_attr = op_dist_attr.get_input_dist_attr( - forward_var_name) - assert forward_input_dist_attr is not None, f"{forward_var_name, str(op)}" - forward_var = vars[forward_var_name] - forward_var_dist_attr = dist_context.get_tensor_dist_attr_for_program( - forward_var) - assert forward_var_dist_attr is not None - grad_var = vars[var_name] - ref_shape = infer_shape(block, forward_var, - forward_var_dist_attr, - forward_input_dist_attr) - - if list(grad_var.shape) != ref_shape: - grad_var.desc.set_shape(ref_shape) + + if op.type in [ + "c_allreduce_sum", "c_identity", "scale", "cast", + "fill_zeros_like" + ]: + forward_var_name = op.input_arg_names[0] + elif op.type == "matmul_v2_grad": + forward_var_name = None + for output_name in op.output_names: + if var_name in 
op.output(output_name): + assert "@GRAD" in output_name + input_name = output_name[:output_name.find("@GRAD")] + assert len(op.input(input_name)) == 1 + forward_var_name = op.input(input_name)[0] + assert forward_var_name is not None + + need_set_shape_list = [ + "reshape2_grad", "softmax_with_cross_entropy_grad", + "transpose2_grad", "softmax_grad", "cross_entropy_grad2", + "dropout_grad", "tanh_grad", "slice", "assign", + "matmul_v2_triple_grad", "elementwise_add_triple_grad", + "fill_constant", "sqrt_grad" + ] + forward_list = [ + "reshape2", "softmax_with_cross_entropy", "transpose2", + "softmax", "cross_entropy2", "dropout", "tanh", + ["slice_grad", "c_allgather"], "assign", "matmul_v2_grad_grad", + "elementwise_add_grad_grad", "shape", "sqrt" + ] + if op.type in need_set_shape_list: + for forward_op in block.ops: + idx = need_set_shape_list.index(op.type) + forward_op_name = forward_list[idx] + if forward_op.type in forward_op_name and forward_var_name in forward_op.input_arg_names: + op_dist_attr = dist_context.get_op_dist_attr_for_program( + forward_op) + break + + forward_input_dist_attr = op_dist_attr.get_input_dist_attr( + forward_var_name) + assert forward_input_dist_attr is not None, f"{forward_var_name, str(op)}" + forward_var = vars[forward_var_name] + forward_var_dist_attr = dist_context.get_tensor_dist_attr_for_program( + forward_var) + assert forward_var_dist_attr is not None + grad_var = vars[var_name] + ref_shape = infer_shape(block, forward_var, forward_var_dist_attr, + forward_input_dist_attr) + + if list(grad_var.shape) != ref_shape: + grad_var.desc.set_shape(ref_shape) OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index b2d146297de8a..e33a3dba669ab 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -263,7 +263,7 @@ def _new_process_group_impl(backend, rank=global_rank, world_size=global_world_size, place=place, - gid=0, + gid=group_id, local_rank=rank, local_size=world_size, gloo_rank=cluster_id, diff --git a/python/paddle/distributed/models/moe/utils.py b/python/paddle/distributed/models/moe/utils.py index 6fb6a5ca32b3c..ea3dc43d0c712 100644 --- a/python/paddle/distributed/models/moe/utils.py +++ b/python/paddle/distributed/models/moe/utils.py @@ -14,8 +14,9 @@ from paddle.fluid import core from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid.framework import _non_static_mode +from paddle.fluid.framework import _non_static_mode, _in_legacy_dygraph, in_dygraph_mode from paddle.fluid.data_feeder import check_variable_and_dtype +from paddle import _C_ops def _number_count(numbers, upper_range): @@ -40,7 +41,9 @@ def _number_count(numbers, upper_range): number_count = paddle.distributed.utils.number_count(numbers, upper_range) print(number_count) # the result: [2, 0, 2, 0, 0, 0] """ - if _non_static_mode(): + if in_dygraph_mode(): + return _C_ops.number_count(numbers, 'upper_range', upper_range) + elif _in_legacy_dygraph(): return core.ops.number_count(numbers, 'upper_range', upper_range) else: op_type = 'number_count' @@ -86,7 +89,9 @@ def _assign_pos(x, cum_count): pos = paddle.distributed.utils.assign_pos(x=numbers, cum_count=num_cum) print(pos) # the result: (2, 0, 3, 1) """ - if _non_static_mode(): + if in_dygraph_mode(): + return _C_ops.assign_pos(x, cum_count, cum_count[-1]) + elif _in_legacy_dygraph(): return core.ops.assign_pos(x, cum_count, cum_count[-1]) else: op_type = 
'assign_pos' @@ -120,7 +125,9 @@ def _random_routing(topk_idx, topk_value, prob, topk=2): prob: random prob, shape=(topk_idx.shape[0],) """ if topk == 2: - if _non_static_mode(): + if in_dygraph_mode(): + return _C_ops.random_routing(prob, topk_value, topk_idx) + elif _in_legacy_dygraph(): return core.ops.random_routing(prob, topk_value, topk_idx) else: raise RuntimeError("Not supporting static mode now") @@ -149,7 +156,10 @@ def _limit_by_capacity(expert_count, capacity, n_worker): out = paddle.distributed.utils.limit_by_capacity(expert_count, capacity, n_work) print(out) # the result: [1, 2, 2, 4, 3, 3] """ - if _non_static_mode(): + if in_dygraph_mode(): + return _C_ops.limit_by_capacity(expert_count, capacity, 'n_worker', + n_worker) + elif _in_legacy_dygraph(): return core.ops.limit_by_capacity(expert_count, capacity, 'n_worker', n_worker) else: @@ -192,8 +202,10 @@ def _prune_gate_by_capacity(gate_idx, expert_count, n_expert, n_worker): # Tensor(shape=[8], dtype=int32, place=CUDAPlace(0), stop_gradient=True, [1, 3, 3, 3, -1, 2, 1, 1]) """ - - if _non_static_mode(): + if in_dygraph_mode(): + return _C_ops.prune_gate_by_capacity(gate_idx, expert_count, "n_expert", + n_expert, "n_worker", n_worker) + elif _in_legacy_dygraph(): return core.ops.prune_gate_by_capacity( gate_idx, expert_count, "n_expert", n_expert, "n_worker", n_worker) check_variable_and_dtype(gate_idx, 'GateIdx', ['int32', 'int64'], diff --git a/python/paddle/distribution/multinomial.py b/python/paddle/distribution/multinomial.py index c4110040fd192..837eb53eab1ea 100644 --- a/python/paddle/distribution/multinomial.py +++ b/python/paddle/distribution/multinomial.py @@ -16,6 +16,10 @@ import paddle from paddle.distribution import categorical, distribution +try: + from collections.abc import Iterable +except: + from collections import Iterable class Multinomial(distribution.Distribution): @@ -138,7 +142,7 @@ def sample(self, shape=()): Args: sample_shape (tuple, optional): [description]. Defaults to (). """ - if not isinstance(shape, collections.Iterable): + if not isinstance(shape, Iterable): raise TypeError('sample shape must be Iterable object.') samples = self._categorical.sample([self.total_count, ] + list(shape)) diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index c7e69753b5335..bc53c130286aa 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -28,6 +28,10 @@ import paddle.fluid from .data_feeder import check_type import warnings +try: + from collections.abc import Sequence +except: + from collections import Sequence __all__ = [ 'append_backward', 'gradients', @@ -474,12 +478,16 @@ def _accumulate_gradients_by_add_ops_(var_name, renamed_vars[var_name] = [var_name] -def _addup_repetitive_outputs_(op_descs, block_idx): +def _addup_repetitive_outputs_(op_descs, block_idx, grad_var_to_var=None): """ In backward part, an variable may be the output of more than one ops. And one op may yield its multiple outputs to the same variable. In these cases, the variable should be the accumulation of all the outputs. `sum_op`s are added to implement the accumulate. + + Args: + grad_var_to_var(dict): used to build the mapping between grad var name and forward var name. + Only for auto parallel. 
""" _MAX_ADD_NUM_ = framework._global_flags()['FLAGS_max_inplace_grad_add'] #pending_sum_ops = [] @@ -527,6 +535,13 @@ def _addup_repetitive_outputs_(op_descs, block_idx): new_name = var_name + "@RENAME@block" + str(block_idx) + "@" + \ str(var_rename_count[var_name]) var_rename_count[var_name] += 1 + # Build the mapping between the new_name and var_name (Only for auto parallel) + if grad_var_to_var is not None: + if var_name in grad_var_to_var: + grad_var_to_var[new_name] = grad_var_to_var[ + var_name] + else: + grad_var_to_var[new_name] = var_name # rename original var_name renamed_vars[var_name][0] = new_name # before change: _rename_arg_(op_descs, var_name, @@ -553,6 +568,13 @@ def _addup_repetitive_outputs_(op_descs, block_idx): new_name = var_name + "@RENAME@block" + str(block_idx) + "@" + \ str(var_rename_count[var_name]) var_rename_count[var_name] += 1 + # Build the mapping between the new_name and var_name (Only for auto parallel) + if grad_var_to_var is not None: + if var_name in grad_var_to_var: + grad_var_to_var[new_name] = grad_var_to_var[ + var_name] + else: + grad_var_to_var[new_name] = var_name arg_names[arg_idx] = new_name op_desc.set_output(param_name, arg_names) renamed_vars[var_name].append(new_name) @@ -1077,6 +1099,16 @@ def _append_backward_ops_(block, rename_var_map(dict): used to associate target_grad var name with first grad_op input name. Only used in for high order gradient. """ + + # Build the mapping between the forward op and backward op (Only for auto parallel) + def update_distop_context(distop_context, op_grad_to_var, + appending_grad_times): + distop_context.grad_var_to_var[appending_grad_times].update( + op_grad_to_var) + for op_desc in grad_op_desc: + assert op_desc.id() not in distop_context.grad_op_id_to_op_id + distop_context.grad_op_id_to_op_id[op_desc.id()] = op.desc.id() + if callbacks is not None: assert (isinstance(callbacks, (list, tuple))) for cb in callbacks: @@ -1114,11 +1146,18 @@ def _append_backward_ops_(block, # Getting op's corresponding grad_op grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list) + # Build the mapping between the forward op and backward op (Only for auto parallel) if distop_context is not None: - for op_desc in grad_op_desc: - assert op_desc.id() not in distop_context.grad_op_id_to_op_id - distop_context.grad_op_id_to_op_id[op_desc.id()] = op.desc.id() + update_distop_context(distop_context, op_grad_to_var, + program._appending_grad_times) + else: + default_ctx = getattr(paddle.distributed.auto_parallel.dist_context, + '_g_default_distributed_context', None) + if default_ctx is not None: + distop_context = default_ctx.dist_op_context + update_distop_context(distop_context, op_grad_to_var, + program._appending_grad_times) # Set device for grad_op according to forward Op device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName() @@ -1151,6 +1190,11 @@ def _append_backward_ops_(block, rename_var_map[name] = new_name if name in op_grad_to_var: + # Build the mapping between the grad var name and var name (Only for auto parallel) + if distop_context is not None: + distop_context.grad_var_to_var[ + program._appending_grad_times][ + new_name] = op_grad_to_var[name] op_grad_to_var[new_name] = op_grad_to_var[name] op_grad_to_var.pop(name) @@ -1183,8 +1227,14 @@ def _append_backward_ops_(block, grad_op_descs.extend(grad_op_desc) grad_to_var.update(op_grad_to_var) + # record mapping bewteen grad var name and var name (Only for auto parallel) + 
grad_var_to_var = None + if distop_context is not None: + grad_var_to_var = distop_context.grad_var_to_var[ + program._appending_grad_times] # sum parameter's gradients' var given multiple var gradient - grad_op_descs = _addup_repetitive_outputs_(grad_op_descs, block.idx) + grad_op_descs = _addup_repetitive_outputs_(grad_op_descs, block.idx, + grad_var_to_var) # if all outputs of the grad op are in no_grad_set, then just remove and fill zero # if all inputs of the grad op are in no_grad_set, just remove this op @@ -1722,7 +1772,7 @@ def append_backward(loss, def _as_list(x): if x is None: return [] - return list(x) if isinstance(x, collections.Sequence) else [x] + return list(x) if isinstance(x, Sequence) else [x] def _is_ancestor_block(ancestor_block, block): diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index d21b7e4740a6e..47c64ff8bd605 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -1001,9 +1001,6 @@ def compile(self, feed_list, fetch_list): a_pass.set('custom_ops', self._custom_op_names) a_pass.apply(self._graph) - a_pass = core.get_pass("transfer_cast_op_pass") - a_pass.apply(self._graph) - passes = [ 'ipu_inplace_pass', 'ipu_graph_builder_pass', diff --git a/python/paddle/fluid/contrib/mixed_precision/amp_nn.py b/python/paddle/fluid/contrib/mixed_precision/amp_nn.py index 588eb2a29f555..c5b9b9e71f6be 100644 --- a/python/paddle/fluid/contrib/mixed_precision/amp_nn.py +++ b/python/paddle/fluid/contrib/mixed_precision/amp_nn.py @@ -129,9 +129,13 @@ def update_loss_scaling(x, 'decr_every_n_nan_or_inf': decr_every_n_nan_or_inf, 'incr_ratio': incr_ratio, 'decr_ratio': decr_ratio, - 'stop_update': stop_update } + if isinstance(stop_update, Variable): + inputs['StopUpdate'] = stop_update + else: + attrs['stop_update'] = stop_update + helper.append_op( type='update_loss_scaling', inputs=inputs, outputs=outputs, attrs=attrs) diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py index c6e2bcb8b1a24..c3720396e1d77 100644 --- a/python/paddle/fluid/contrib/mixed_precision/decorator.py +++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py @@ -432,7 +432,7 @@ def _add_dynamic_loss_scaling(self, params_grads, found_inf): self._decr_every_n_nan_or_inf, self._incr_ratio, self._decr_ratio, - stop_update=False, + stop_update=self._optimizer._get_stop_update_var(), name="update_loss_scaling") return diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/fuse_utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/fuse_utils.py index 14282df23d365..1f7a01f17b066 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/fuse_utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/fuse_utils.py @@ -28,6 +28,27 @@ def forward(self, input): return input + +def fuse_conv_bn(model): + is_train = False + if model.training: + model.eval() + is_train = True + fuse_list = [] + tmp_pair = [None, None] + for name, layer in model.named_sublayers(): + if isinstance(layer, nn.Conv2D): + tmp_pair[0] = name + if isinstance(layer, nn.BatchNorm2D): + tmp_pair[1] = name + + if tmp_pair[0] and tmp_pair[1]: + fuse_list.append(tmp_pair) + tmp_pair = [None, None] + model = fuse_layers(model, fuse_list, inplace=True) + if is_train: + model.train() + +
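+# A hedged usage sketch (the model class is illustrative): with the in-place
+# fusion above, fuse_conv_bn folds each Conv2D -> BatchNorm2D pair directly
+# into the caller's model, so a typical call is simply:
+#
+#     model = MyConvBNNet()   # any dygraph model with conv+bn pairs
+#     fuse_conv_bn(model)     # BN parameters folded into the preceding conv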
def fuse_layers(model, layers_to_fuse, inplace=False): ''' fuse layers in layers_to_fuse diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 059cb7b0dd1bf..d5c3d9ab82d74 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -20,6 +20,7 @@ import warnings import paddle +import paddle.nn as nn import paddle.nn.quant.quant_layers as quant_layers from paddle.fluid import dygraph, core, framework, unique_name from paddle.fluid.framework import IrGraph @@ -32,6 +33,7 @@ from paddle.fluid.log_helper import get_logger from .. import quantization_pass from . import utils +from . import fuse_utils __all__ = ['ImperativeQuantAware'] @@ -52,6 +54,7 @@ def __init__( weight_bits=8, activation_bits=8, moving_rate=0.9, + fuse_conv_bn=False, weight_preprocess_layer=None, act_preprocess_layer=None, weight_quantize_layer=None, @@ -76,6 +79,7 @@ def __init__( activation_bits(int): quantization bit number for activations. moving_rate(float): the parameter for 'moving_average_abs_max' quantization. + fuse_conv_bn(bool): Whether to fuse Conv2D and BatchNorm2D pairs before quantization. Default is False. weight_preprocess_layer(paddle.nn.Layer, optional): A paddle Layer that defines how to preprocess weight before quantization. Using this can quickly test if user's preprocess method works @@ -188,6 +192,7 @@ def forward(self, inputs): model_path="./imperative_model_qat") """ super(ImperativeQuantAware, self).__init__() + self.fuse_conv_bn = fuse_conv_bn kwargs = { "quantizable_layer_type": quantizable_layer_type, @@ -256,8 +261,13 @@ def forward(self, inputs): """ assert isinstance(model, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." + + if self.fuse_conv_bn: + fuse_utils.fuse_conv_bn(model) + self._quantize_inputs.apply(model) self._quantize_outputs.apply(model) + return model def save_quantized_model(self, layer, path, input_spec=None, **config): self._quantize_outputs.save_quantized_model(layer, path, input_spec, diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index a4c7a2a2bf8df..d4c34efb7b900 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -126,6 +126,7 @@ def __init__(self, onnx_format=False, optimize_model=False, is_use_cache_file=False, + skip_tensor_list=None, cache_dir=None): ''' Constructor. @@ -198,6 +199,7 @@ def __init__(self, the model accuracy is usually higher when it is 'channel_wise_abs_max'. onnx_format(bool): Whether to export the quantized model with format of ONNX. Default is False. + skip_tensor_list(list): Names of the tensors whose quantization should be skipped. Default is None. optimize_model(bool, optional): If set optimize_model as True, it applies some passes to the model before quantization, and it supports `conv2d/depthwise_conv2d + bn` pass so far.
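+                A hedged sketch of skipping one tensor (constructor arguments
+                are abbreviated and the names are illustrative):
+
+                    ptq = PostTrainingQuantization(
+                        executor=exe,
+                        model_dir='./fp32_model',
+                        sample_generator=gen,
+                        skip_tensor_list=['fc_0.w_0'])
+                    ptq.quantize()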
Some targets require the @@ -301,6 +303,7 @@ def __init__(self, self._activation_quantize_type = activation_quantize_type self._weight_quantize_type = weight_quantize_type self._onnx_format = onnx_format + self._skip_tensor_list = skip_tensor_list self._is_full_quantize = is_full_quantize if is_full_quantize: self._quantizable_op_type = self._support_quantize_op_type @@ -547,6 +550,12 @@ def collect_var_name(var_name_list, persistable_var_names, op_type): persistable_var_names = _all_persistable_var_names(self._program) for block_id in range(len(self._program.blocks)): for op in self._program.blocks[block_id].ops: + # skip quantization for ops whose inputs are listed in self._skip_tensor_list + if self._skip_tensor_list is not None: + for inp_name in utils._get_op_input_var_names(op): + if inp_name in self._skip_tensor_list: + op._set_attr("op_namescope", "skip_quant") + op_type = op.type if self._is_full_quantize and \ op_type not in self._quantizable_op_type: diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 30e2b4613b185..0140283b915ff 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -354,6 +354,7 @@ set_tests_properties(test_quantization_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_qat_channelwise PROPERTIES TIMEOUT 200) set_tests_properties(test_user_defined_quantization PROPERTIES TIMEOUT 200) set_tests_properties(test_imperative_qat PROPERTIES TIMEOUT 200) +set_tests_properties(test_imperative_qat_fuse PROPERTIES TIMEOUT 200) set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 200) set_tests_properties(test_imperative_qat_user_defined PROPERTIES TIMEOUT 200) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py index 015ecb3d4a4e9..0d035390e2c00 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py @@ -56,13 +56,15 @@ def set_vars(self): self.onnx_format = False self.check_export_model_accuracy = True self.diff_threshold = 0.01 + self.fuse_conv_bn = False def func_qat(self): self.set_vars() imperative_qat = ImperativeQuantAware( weight_quantize_type=self.weight_quantize_type, - activation_quantize_type=self.activation_quantize_type) + activation_quantize_type=self.activation_quantize_type, + fuse_conv_bn=self.fuse_conv_bn) with fluid.dygraph.guard(): # For CI coverage @@ -214,6 +216,7 @@ def set_vars(self): self.activation_quantize_type = 'moving_average_abs_max' self.onnx_format = True self.diff_threshold = 0.025 + self.fuse_conv_bn = False if __name__ == '__main__': diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py index ff40b170345a8..94e0681d1f57e 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py @@ -43,6 +43,7 @@ def set_vars(self): self.activation_quantize_type = 'moving_average_abs_max' self.diff_threshold = 0.01 self.onnx_format = False + self.fuse_conv_bn = False print('weight_quantize_type', self.weight_quantize_type) @@ -52,6 +53,7 @@ def set_vars(self): self.activation_quantize_type = 'moving_average_abs_max' self.onnx_format = True self.diff_threshold = 0.025 + self.fuse_conv_bn = False print('weight_quantize_type', 
self.weight_quantize_type) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_fuse.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_fuse.py new file mode 100644 index 0000000000000..d580eb7ae7aef --- /dev/null +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_fuse.py @@ -0,0 +1,50 @@ +# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. + +from __future__ import print_function + +import os +import numpy as np +import random +import unittest +import logging + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.log_helper import get_logger + +from test_imperative_qat import TestImperativeQat + +paddle.enable_static() + +os.environ["CPU_NUM"] = "1" +if core.is_compiled_with_cuda(): + fluid.set_flags({"FLAGS_cudnn_deterministic": True}) + +_logger = get_logger( + __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') + + +class TestImperativeQatfuseBN(TestImperativeQat): + def set_vars(self): + self.weight_quantize_type = 'abs_max' + self.activation_quantize_type = 'moving_average_abs_max' + self.diff_threshold = 0.01 + self.onnx_format = False + self.fuse_conv_bn = True + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py index 85cabb6b5e9b7..89e0e099f44c2 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py @@ -247,21 +247,21 @@ def run_test(self, self.assertLess(delta_value, diff_threshold) -class TestPostTrainingKLForMnist(TestPostTrainingQuantization): - def test_post_training_kl(self): +class TestPostTrainingAvgForLSTM(TestPostTrainingQuantization): + def test_post_training_avg(self): model_name = "nlp_lstm_fp32_model" model_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/nlp_lstm_fp32_model.tar.gz" model_md5 = "519b8eeac756e7b4b7bcb2868e880452" data_name = "quant_lstm_input_data" data_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/quant_lstm_input_data.tar.gz" data_md5 = "add84c754e9b792fea1fbd728d134ab7" - algo = "KL" + algo = "avg" round_type = "round" quantizable_op_type = ["mul", "lstm"] is_full_quantize = False is_use_cache_file = False is_optimize_model = False - diff_threshold = 0.01 + diff_threshold = 0.02 infer_iterations = 100 quant_iterations = 10 self.run_test(model_name, model_url, model_md5, data_name, data_url, @@ -270,44 +270,21 @@ def test_post_training_kl(self): diff_threshold, infer_iterations, quant_iterations) -class TestPostTrainingKLForMnistAdaround(TestPostTrainingQuantization): - def test_post_training_kl(self): +class TestPostTrainingAvgForLSTMONNXFormat(TestPostTrainingQuantization): + def 
test_post_training_avg_onnx_format(self): model_name = "nlp_lstm_fp32_model" model_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/nlp_lstm_fp32_model.tar.gz" model_md5 = "519b8eeac756e7b4b7bcb2868e880452" data_name = "quant_lstm_input_data" data_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/quant_lstm_input_data.tar.gz" data_md5 = "add84c754e9b792fea1fbd728d134ab7" - algo = "KL" - round_type = "adaround" - quantizable_op_type = ["mul", "lstm"] - is_full_quantize = False - is_use_cache_file = False - is_optimize_model = False - diff_threshold = 0.01 - infer_iterations = 100 - quant_iterations = 10 - self.run_test(model_name, model_url, model_md5, data_name, data_url, - data_md5, algo, round_type, quantizable_op_type, - is_full_quantize, is_use_cache_file, is_optimize_model, - diff_threshold, infer_iterations, quant_iterations) - - -class TestPostTrainingKLForMnistONNXFormat(TestPostTrainingQuantization): - def test_post_training_kl_onnx_format(self): - model_name = "nlp_lstm_fp32_model" - model_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/nlp_lstm_fp32_model.tar.gz" - model_md5 = "519b8eeac756e7b4b7bcb2868e880452" - data_name = "quant_lstm_input_data" - data_url = "https://paddle-inference-dist.cdn.bcebos.com/int8/unittest_model_data/quant_lstm_input_data.tar.gz" - data_md5 = "add84c754e9b792fea1fbd728d134ab7" - algo = "KL" + algo = "avg" round_type = "round" quantizable_op_type = ["mul", "lstm"] is_full_quantize = False is_use_cache_file = False is_optimize_model = False - diff_threshold = 0.01 + diff_threshold = 0.02 infer_iterations = 100 quant_iterations = 10 onnx_format = True diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py index c219d2fbf89a9..4c3a758f0e36d 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py @@ -117,7 +117,8 @@ def generate_quantized_model(self, is_optimize_model=False, batch_size=10, batch_nums=10, - onnx_format=False): + onnx_format=False, + skip_tensor_list=None): place = fluid.CPUPlace() exe = fluid.Executor(place) @@ -136,6 +137,7 @@ def generate_quantized_model(self, is_full_quantize=is_full_quantize, optimize_model=is_optimize_model, onnx_format=onnx_format, + skip_tensor_list=skip_tensor_list, is_use_cache_file=is_use_cache_file) ptq.quantize() ptq.save_quantized_model(self.int8_model_path) @@ -154,7 +156,8 @@ def run_test(self, batch_size=10, infer_iterations=10, quant_iterations=5, - onnx_format=False): + onnx_format=False, + skip_tensor_list=None): origin_model_path = self.download_model(data_url, data_md5, model_name) origin_model_path = os.path.join(origin_model_path, model_name) @@ -166,10 +169,10 @@ def run_test(self, print("Start INT8 post training quantization for {0} on {1} images ...". 
format(model_name, quant_iterations * batch_size)) - self.generate_quantized_model(origin_model_path, algo, round_type, - quantizable_op_type, is_full_quantize, - is_use_cache_file, is_optimize_model, - batch_size, quant_iterations, onnx_format) + self.generate_quantized_model( + origin_model_path, algo, round_type, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, batch_size, + quant_iterations, onnx_format, skip_tensor_list) print("Start INT8 inference for {0} on {1} images ...".format( model_name, infer_iterations * batch_size)) @@ -338,6 +341,27 @@ def test_post_training_mse(self): infer_iterations, quant_iterations) +class TestPostTrainingKLAdaroundForMnist(TestPostTrainingQuantization): + def test_post_training_kl(self): + model_name = "mnist_model" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" + data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" + algo = "KL" + round_type = "adaround" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, round_type, + quantizable_op_type, is_full_quantize, is_use_cache_file, + is_optimize_model, diff_threshold, batch_size, + infer_iterations, quant_iterations) + + class TestPostTrainingmseForMnistONNXFormat(TestPostTrainingQuantization): def test_post_training_mse_onnx_format(self): model_name = "mnist_model" @@ -405,5 +429,38 @@ def test_post_training_mse_onnx_format_full_quant(self): onnx_format=onnx_format) +class TestPostTrainingavgForMnistSkipOP(TestPostTrainingQuantization): + def test_post_training_avg_skip_op(self): + model_name = "mnist_model" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" + data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" + algo = "avg" + round_type = "round" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + skip_tensor_list = ["fc_0.w_0"] + self.run_test( + model_name, + data_url, + data_md5, + algo, + round_type, + quantizable_op_type, + is_full_quantize, + is_use_cache_file, + is_optimize_model, + diff_threshold, + batch_size, + infer_iterations, + quant_iterations, + skip_tensor_list=skip_tensor_list) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py index 498a1ec46cacd..629529ff1b965 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py @@ -383,7 +383,7 @@ def test_post_training_hist_mobilenetv1(self): is_full_quantize = False is_use_cache_file = False is_optimize_model = True - diff_threshold = 0.025 + diff_threshold = 0.03 self.run_test(model, algo, round_type, data_urls, data_md5s, quantizable_op_type, is_full_quantize, is_use_cache_file, is_optimize_model, diff_threshold) @@ -412,123 +412,6 @@ def test_post_training_abs_max_mobilenetv1(self): is_optimize_model, diff_threshold) -class TestPostTrainingAvgAdaRoundForMobilenetv1(TestPostTrainingQuantization): - def 
test_post_training_adaround_mobilenetv1(self): - model = "MobileNet-V1" - algo = "avg" - round_type = "adaround" - data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' - ] - data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] - quantizable_op_type = [ - "conv2d", - "depthwise_conv2d", - "mul", - ] - is_full_quantize = False - is_use_cache_file = False - is_optimize_model = True - diff_threshold = 0.025 - self.run_test(model, algo, round_type, data_urls, data_md5s, - quantizable_op_type, is_full_quantize, is_use_cache_file, - is_optimize_model, diff_threshold) - - -class TestPostTrainingAbsMaxAdaRoundForMobilenetv1( - TestPostTrainingQuantization): - def test_post_training_adaround_mobilenetv1(self): - model = "MobileNet-V1" - algo = "abs_max" - round_type = "adaround" - data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' - ] - data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] - quantizable_op_type = [ - "conv2d", - "depthwise_conv2d", - "mul", - ] - is_full_quantize = False - is_use_cache_file = False - is_optimize_model = True - diff_threshold = 0.025 - self.run_test(model, algo, round_type, data_urls, data_md5s, - quantizable_op_type, is_full_quantize, is_use_cache_file, - is_optimize_model, diff_threshold) - - -class TestPostTraininghistAdaroundForMobilenetv1(TestPostTrainingQuantization): - def test_post_training_hist_mobilenetv1(self): - model = "MobileNet-V1" - algo = "hist" - round_type = "adaround" - data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' - ] - data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] - quantizable_op_type = [ - "conv2d", - "depthwise_conv2d", - "mul", - ] - is_full_quantize = False - is_use_cache_file = False - is_optimize_model = True - diff_threshold = 0.025 - self.run_test(model, algo, round_type, data_urls, data_md5s, - quantizable_op_type, is_full_quantize, is_use_cache_file, - is_optimize_model, diff_threshold) - - -class TestPostTrainingKLAdaroundForMobilenetv1(TestPostTrainingQuantization): - def test_post_training_kl_mobilenetv1(self): - model = "MobileNet-V1" - algo = "KL" - round_type = "adaround" - data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' - ] - data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] - quantizable_op_type = [ - "conv2d", - "depthwise_conv2d", - "mul", - "pool2d", - ] - is_full_quantize = False - is_use_cache_file = False - is_optimize_model = True - diff_threshold = 0.025 - self.run_test(model, algo, round_type, data_urls, data_md5s, - quantizable_op_type, is_full_quantize, is_use_cache_file, - is_optimize_model, diff_threshold) - - -class TestPostTrainingEMDForMobilenetv1(TestPostTrainingQuantization): - def test_post_training_avg_mobilenetv1(self): - model = "MobileNet-V1" - algo = "emd" - round_type = "round" - data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' - ] - data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] - quantizable_op_type = [ - "conv2d", - "depthwise_conv2d", - "mul", - ] - is_full_quantize = False - is_use_cache_file = False - is_optimize_model = True - diff_threshold = 0.025 - self.run_test(model, algo, round_type, data_urls, data_md5s, - quantizable_op_type, is_full_quantize, is_use_cache_file, - is_optimize_model, diff_threshold) - - class TestPostTrainingAvgONNXFormatForMobilenetv1(TestPostTrainingQuantization): def test_post_training_onnx_format_mobilenetv1(self): model = "MobileNet-V1" diff 
--git a/python/paddle/fluid/dataloader/worker.py b/python/paddle/fluid/dataloader/worker.py index 304f31c2b1629..6dc3813fa6d0c 100644 --- a/python/paddle/fluid/dataloader/worker.py +++ b/python/paddle/fluid/dataloader/worker.py @@ -22,7 +22,7 @@ from .. import core from .fetcher import _IterableDatasetFetcher, _MapDatasetFetcher from ..multiprocess_utils import _cleanup_mmap, CleanupFuncRegistrar, MP_STATUS_CHECK_INTERVAL -from ..framework import _non_static_mode +from ..framework import _non_static_mode, _in_eager_without_dygraph_check from .flat import _flatten_batch # NOTE: queue has a different name in python2 and python3 @@ -339,10 +339,16 @@ def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, out_queue.put((idx, batch, None)) batch, structure = _flatten_batch(batch) if use_shared_memory: + # NOTE: In eager mode, Tensor._share_memory has no + # effect, fall back to _array_to_share_memory_tensor + def tensor_share_memory(tensor): + if _in_eager_without_dygraph_check(): + return core._array_to_share_memory_tensor(tensor) + return tensor._share_memory() tensor_list = [ core._array_to_share_memory_tensor(b) - if isinstance(b, np.ndarray) else b._share_memory() - for b in batch + if isinstance(b, np.ndarray) \ + else tensor_share_memory(b) for b in batch ] out_queue.put((idx, tensor_list, structure)) core._remove_tensor_list_mmap_fds(tensor_list) diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index f7d4be7ee6e3c..5da5dbbd7bdfc 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -276,9 +276,10 @@ def amp_guard(enable=True, if enable and not (tracer._expected_place.is_gpu_place() or tracer._expected_place.is_xpu_place() or tracer._expected_place.is_mlu_place() or - tracer._expected_place.is_npu_place()): + tracer._expected_place.is_npu_place() or + tracer._expected_place.is_custom_place()): warnings.warn( - 'amp_guard can only be enabled on CUDAPlace, XPUPlace, MLUPlace, and NPUPlace, current place is %s, so it makes no effect.' + 'amp_guard can only be enabled on CUDAPlace, XPUPlace, MLUPlace, NPUPlace, and CustomPlace, current place is %s, so it makes no effect.' % tracer._expected_place) enable = False # For npu: @@ -293,6 +294,10 @@ def amp_guard(enable=True, if tracer._expected_place.is_mlu_place() and (dtype == 'bfloat16'): warnings.warn('MLUPlace only support float16 amp.') enable = False + # For custom device: + if tracer._expected_place.is_custom_place() and (dtype == 'bfloat16'): + warnings.warn('CustomPlace only support float16 amp.') + enable = False # For gpu float16: Compute Capability should >= 7. # For gpu bfloat16: Compute Capability should >= 8 & CUDA Version should >= 11. if tracer._expected_place.is_gpu_place(): diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index c57290861942b..df79b5ab5e482 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -107,9 +107,10 @@ def __init__(self, if enable and not (tracer._expected_place.is_gpu_place() or tracer._expected_place.is_xpu_place() or tracer._expected_place.is_mlu_place() or - tracer._expected_place.is_npu_place()): + tracer._expected_place.is_npu_place() or + tracer._expected_place.is_custom_place()): warnings.warn( - 'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace and NPUPlace, current place is %s, so it makes no effect.' 
+ 'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace, NPUPlace and CustomPlace, current place is %s, so it makes no effect.' % tracer._expected_place) enable = False diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py index 0670c048c5e26..60043c42121bd 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py @@ -21,6 +21,10 @@ from paddle.fluid import core from paddle.fluid.dygraph.dygraph_to_static.utils import unwrap from paddle.fluid.framework import Program +try: + from collections.abc import Sequence +except: + from collections import Sequence # NOTE(liym27): Please use `getattr(ast_node, ORIGI_INFO)` instead of . operation to get the original information of ast node. ORIGI_INFO = "Original information of source code for ast node." @@ -214,7 +218,7 @@ def ast_walk(transformed_node, static_node): def _as_list(x): if x is None: return [] - return list(x) if isinstance(x, collections.Sequence) else [x] + return list(x) if isinstance(x, Sequence) else [x] transformed_node_list = _as_list(transformed_node) static_node_list = _as_list(static_node) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index bc1a0e30dd42d..b860740f71b25 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -196,10 +196,11 @@ def from_func_and_args(cls, function_spec, args, kwargs, class_instance): def __hash__(self): error_msg = "Arguments to a `@paddle.jit.to_static` must be a hashable Python objects (or nested structures of these types)." + with_hook = self.kwargs.get("with_hook", False) return hash((id(self.function_spec), make_hashable(self.input_args_with_spec, error_msg), make_hashable(self.input_kwargs_with_spec, error_msg), - self._spec_names_id, self.class_instance)) + self._spec_names_id, self.class_instance, with_hook)) def __eq__(self, other): return (type(self) is type(other)) and hash(self) == hash(other) @@ -413,6 +414,8 @@ def get_concrete_program(self, *args, **kwargs): Traced ConcreteProgram and executable translated Layer. """ + with_hook = kwargs.get("with_hook", False) + if "with_hook" in kwargs: kwargs.pop("with_hook") # 1. unify args/kwargs and replace Tensor with InputSpec if len(args) != len(self._function_spec.args_name): args, kwargs = self._function_spec.unified_args_and_kwargs(args, @@ -421,9 +424,13 @@ def get_concrete_program(self, *args, **kwargs): args, kwargs) # 2. generate cache key - cache_key = CacheKey(self._function_spec, input_args_with_spec, - input_kwargs_with_spec, self._class_instance, - **self._kwargs) + cache_key = CacheKey( + self._function_spec, + input_args_with_spec, + input_kwargs_with_spec, + self._class_instance, + **self._kwargs, + with_hook=with_hook) # 3. check whether hit the cache or build a new program for the input arguments concrete_program, partial_program_layer = self._program_cache[cache_key] @@ -480,11 +487,13 @@ def foo(x, y): """ return self.concrete_program_specify_input_spec(input_spec=None) - def concrete_program_specify_input_spec(self, input_spec=None): + def concrete_program_specify_input_spec(self, + input_spec=None, + with_hook=False): """ Returns recent ConcreteProgram instance of decorated function while specifying input_spec. 
If the self._function_spec already has - input_spce, it will check the compatibility of input input_spec and + input_spec, it will check the compatibility of the given input_spec and the self._function_spec.input_spec. If input input_spec=None, then this method uses self._function_spec.input_spec @@ -516,12 +525,18 @@ def concrete_program_specify_input_spec(self, input_spec=None): has_input_spec = (desired_input_spec is not None) if has_input_spec: concrete_program, _ = self.get_concrete_program( - *desired_input_spec) + *desired_input_spec, with_hook=with_hook) return concrete_program else: raise ValueError( "No valid transformed program for {}.\n\t Please specific `input_spec` in `@paddle.jit.to_static` or feed input tensor to call the decorated function at once.\n". format(self._function_spec)) + elif with_hook: + cache_key = self._program_cache._recent_cache_key + cache_key.kwargs["with_hook"] = True + concrete_program, _ = self._program_cache[cache_key] + return concrete_program + # If more than one programs have been cached, return the recent converted program by default. elif cached_program_len > 1: logging_utils.warn( @@ -588,6 +603,54 @@ def _verify_init_in_dynamic_mode(class_instance): class_instance)) + +class HookHelper(object): + """ + Only for converting pre/post hook operations in the outermost layer while jit.save. + Hooks in sublayers have already been processed automatically. + """ + + def __init__(self, func, class_instance, with_hook=False): + self.func = func + self.class_instance = class_instance + self.with_hook = with_hook + self.need_apply_hook = with_hook and isinstance( + self.class_instance, + layers.Layer) and getattr(func, "__name__") == "forward" + + def apply_pre_hooks(self, inputs): + """ + Apply _forward_pre_hooks from the outermost layer + """ + if not self.need_apply_hook: return inputs + + inputs = inputs[1:] + for forward_pre_hook in self.class_instance._forward_pre_hooks.values(): + hook_result = forward_pre_hook(self.class_instance, inputs) + if hook_result is not None: + if not isinstance(hook_result, tuple): + hook_result = (hook_result, ) + inputs = hook_result + + return [self.class_instance] + list(inputs) + + def apply_post_hooks(self, inputs, outputs): + """ + Apply _forward_post_hooks from the outermost layer + """ + if not self.need_apply_hook: return outputs + + inputs = inputs[1:] + for forward_post_hook in self.class_instance._forward_post_hooks.values( ): + hook_result = forward_post_hook(self.class_instance, inputs, outputs) + if hook_result is not None: + outputs = hook_result + + inputs.insert(0, self.class_instance) + return outputs + +
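+# A hedged sketch of the replay order HookHelper implements (names are
+# illustrative); for a Layer traced with with_hook=True, from_func_spec below
+# roughly does:
+#
+#     hook_helper = HookHelper(dygraph_function, class_instance, True)
+#     inputs = hook_helper.apply_pre_hooks(static_inputs)   # _forward_pre_hooks
+#     outputs = static_func(*inputs)
+#     outputs = hook_helper.apply_post_hooks(inputs, outputs)  # _forward_post_hooks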
class ConcreteProgram(object): __slots__ = [ @@ -629,6 +692,9 @@ def from_func_spec(func_spec, input_spec, input_kwargs_spec, class_instance, # Transforms dygraph function into static function and caches it. dygraph_function = func_spec.dygraph_function static_func = convert_to_static(dygraph_function) + # Apply pre/post hooks for the outermost layer + hook_helper = HookHelper(dygraph_function, class_instance, + kwargs.get("with_hook", False)) main_program, startup_program = framework.Program(), framework.Program() # Note: The random seed should be synchronized into cached program @@ -642,12 +708,13 @@ def from_func_spec(func_spec, input_spec, input_kwargs_spec, class_instance, with framework.program_guard(main_program, startup_program): with _switch_declarative_mode_guard_(is_declarative=True): # 1. Adds `fluid.data` layers for input if needed - inputs = func_spec.to_static_inputs_with_spec(input_spec, - main_program) + static_inputs = func_spec.to_static_inputs_with_spec( + input_spec, main_program) _kwargs = func_spec.to_static_inputs_with_spec( input_kwargs_spec, main_program) if class_instance: - inputs = tuple([class_instance] + list(inputs)) + static_inputs = tuple([class_instance] + list( + static_inputs)) # 2. Gets all ParamBases and buffered VarBases in the function all_parameters_and_buffers = _extract_indeed_params_buffers( @@ -658,10 +725,13 @@ def from_func_spec(func_spec, input_spec, input_kwargs_spec, class_instance, class_instance, False)), param_guard( get_buffers(class_instance, False)): try: + # Hooks are replayed only for jit.save; this is a no-op during train and eval + inputs = hook_helper.apply_pre_hooks(static_inputs) if _kwargs: outputs = static_func(*inputs, **_kwargs) else: outputs = static_func(*inputs) + outputs = hook_helper.apply_post_hooks(inputs, outputs) except BaseException as e: # NOTE: If e is raised in compile time, e should be attached to ERROR_DATA here. error.attach_error_data(e) @@ -679,7 +749,7 @@ def from_func_spec(func_spec, input_spec, input_kwargs_spec, class_instance, main_program = update_op_callstack_with_origin_info(main_program) return ConcreteProgram( - inputs=inputs, + inputs=static_inputs, outputs=outputs, parameters=all_parameters_and_buffers, function=dygraph_function, @@ -709,6 +779,7 @@ def __init__(self): self._caches = collections.OrderedDict() # trace mostly recent used program self._recent_key = None + self._recent_cache_key = None def _build_once(self, cache_key): concrete_program = ConcreteProgram.from_func_spec( @@ -724,6 +795,7 @@ def __getitem__(self, item): raise ValueError('type(item) should be CacheKey, but received %s' % type_name(item)) item_id = hash(item) + self._recent_cache_key = item self._recent_key = item_id if item_id not in self._caches: self._caches[item_id] = self._build_once(item) diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 7957b33bf1dce..e0e259215c509 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -302,6 +302,7 @@ def __init__(self): # If True, It will save inference program only, and do not save params of Program self._program_only = False + self.with_hook = False @property def output_spec(self): @@ -370,7 +371,7 @@ def keep_name_table(self, value): def _parse_save_configs(configs): - supported_configs = ['output_spec'] + supported_configs = ['output_spec', "with_hook"] # input check for key in configs: @@ -382,6 +383,7 @@ def _parse_save_configs(configs): # construct inner config inner_config = _SaveLoadConfig() inner_config.output_spec = configs.get('output_spec', None) + inner_config.with_hook = configs.get('with_hook', False) return inner_config @@ -454,11 +456,15 @@ def _get_input_var_names(inputs, input_spec): return result_list -def _get_output_vars(outputs, output_spec): +def _get_output_vars(outputs, output_spec, with_hook=False): name_no_exists_error = "The tensor `%s` does not exists. " \ "Please make sure the name of example Tensor " \ "in configs.output_spec is the output tensor of " \ "Layer.forward method." + if output_spec and with_hook: + raise RuntimeError( + "Specifying output_spec is currently not supported when pre/post hooks are found in the outermost layer." + )
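+    # Rationale (an inference from this check, not stated in the change
+    # itself): post hooks may replace or wrap the traced outputs, so the
+    # tensor names listed in output_spec can no longer be matched against
+    # the final outputs reliably.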
result_list = [] output_vars_dict = OrderedDict() for var in flatten(outputs): @@ -830,10 +836,16 @@ def fun(inputs): # parse configs configs = _parse_save_configs(configs) + # Whether the outermost layer has pre/post hooks; if it does, the hook + # operators must also be saved into the program. + with_hook = configs.with_hook + scope = core.Scope() extra_var_info = dict() if isinstance(layer, Layer): functions = dir(inner_layer) + if inner_layer._forward_pre_hooks or inner_layer._forward_post_hooks: + with_hook = True else: # layer is function functions = [layer, ] @@ -842,7 +854,7 @@ def fun(inputs): static_func = getattr(inner_layer, attr_func, None) if isinstance(static_func, StaticFunction): concrete_program = static_func.concrete_program_specify_input_spec( - inner_input_spec) + inner_input_spec, with_hook=with_hook) elif 'forward' == attr_func: # transform in jit.save, if input_spec is incomplete, declarative will throw error # inner_input_spec is list[InputSpec], it should be packed with same structure @@ -852,7 +864,8 @@ def fun(inputs): inner_input_spec) static_forward = declarative( inner_layer.forward, input_spec=inner_input_spec) - concrete_program = static_forward.concrete_program + concrete_program = static_forward.concrete_program_specify_input_spec( + with_hook=with_hook) # the input_spec has been used in declarative, which is equal to # @declarative with input_spec and jit.save without input_spec, # avoid needless warning @@ -943,8 +956,8 @@ def fun(inputs): # the rule is like [ Get input variables name ]. For output var, # we only support VarBase spec, and actually, we only need the # var name of output, and we don't recommended to use output_spec output_vars = _get_output_vars(concrete_program.outputs, - configs.output_spec) + configs.output_spec, with_hook) # 5.
save inference model from paddle.fluid.io import save_inference_model diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 8b10a5f454e69..200e8feec1e6a 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6781,7 +6781,10 @@ def lod_append(x, level): x = fluid.layers.data(name='x', shape=[6, 10], lod_level=1) out = fluid.layers.lod_append(x, [1,1,1,1,1,1]) """ - from collections import Iterable + try: + from collections.abc import Iterable + except: + from collections import Iterable if x is None: raise ValueError("Input(x) can't be None.") if (not isinstance(level, Iterable)) and (not isinstance(level, Variable)): diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 1b9c87f1c0d06..707a1dc2cbc2f 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -33,6 +33,10 @@ from ..framework import _non_static_mode from ..param_attr import ParamAttr from ..data_feeder import check_variable_and_dtype, check_type, check_dtype +try: + from collections.abc import Sequence +except: + from collections import Sequence __all__ = [ 'RNNCell', @@ -163,7 +167,7 @@ def _is_shape_sequence(seq): # TODO: Add check for the illegal if isinstance(seq, dict): return True - return (isinstance(seq, collections.Sequence) and + return (isinstance(seq, Sequence) and not isinstance(seq, six.string_types)) class Shape(object): diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py index c30f41f6a20d9..5d781a437fe8f 100644 --- a/python/paddle/fluid/layers/utils.py +++ b/python/paddle/fluid/layers/utils.py @@ -21,6 +21,10 @@ from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from ..layer_helper import LayerHelper from sys import version_info +try: + from collections.abc import Sequence +except: + from collections import Sequence def convert_to_list(value, n, name, dtype=int): @@ -74,8 +78,7 @@ def is_sequence(seq): """ if isinstance(seq, dict): return True - return (isinstance(seq, collections.Sequence) and - not isinstance(seq, six.string_types)) + return (isinstance(seq, Sequence) and not isinstance(seq, six.string_types)) def _hash_with_id(*args): @@ -148,7 +151,7 @@ def _sequence_like(instance, args): return type(instance)((key, result[key]) for key in six.iterkeys(instance)) elif (isinstance(instance, tuple) and hasattr(instance, "_fields") and - isinstance(instance._fields, collections.Sequence) and + isinstance(instance._fields, Sequence) and all(isinstance(f, six.string_types) for f in instance._fields)): # This is a namedtuple return type(instance)(*args) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 12ed7b975af0c..08e24f86a29a4 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -914,6 +914,7 @@ set_tests_properties(test_parallel_executor_crf test_sync_batch_norm_op test_inp test_parallel_executor_seresnext_with_fuse_all_reduce_gpu test_distributed_fused_lamb_op_with_clip test_distributed_fused_lamb_op_without_clip + test_distributed_fused_lamb_op_with_gradient_merge test_parallel_executor_fetch_isolated_var PROPERTIES LABELS "RUN_TYPE=DIST") @@ -1047,6 +1048,7 @@ set_tests_properties(test_row_conv_op PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_gpu PROPERTIES TIMEOUT 120) 
set_tests_properties(test_distributed_fused_lamb_op_with_clip PROPERTIES TIMEOUT 120) set_tests_properties(test_distributed_fused_lamb_op_without_clip PROPERTIES TIMEOUT 120) +set_tests_properties(test_distributed_fused_lamb_op_with_gradient_merge PROPERTIES TIMEOUT 120) set_tests_properties(test_elementwise_min_op PROPERTIES TIMEOUT 120) set_tests_properties(test_nan_inf PROPERTIES TIMEOUT 120) set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 300) @@ -1233,6 +1235,7 @@ set_tests_properties(test_inplace_addto_strategy PROPERTIES TIMEOUT 120) set_tests_properties(test_eigvals_op PROPERTIES TIMEOUT 400) set_tests_properties(test_tensordot PROPERTIES TIMEOUT 1000) set_tests_properties(test_tensordot PROPERTIES LABELS "RUN_TYPE=NIGHTLY") +set_tests_properties(test_tensordot PROPERTIES ENVIRONMENT "FLAGS_USE_STANDALONE_EXECUTOR=False") set_tests_properties(test_cuda_memory_reserved PROPERTIES ENVIRONMENT "FLAGS_allocator_strategy=auto_growth") if (WITH_GLOO) set_tests_properties(test_parallel_dygraph_dataparallel_cpuonly PROPERTIES TIMEOUT 30) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 97a3092f11fd2..4d052f7e90cd3 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -12,6 +12,8 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_while_op_completion MODULES test_while_op_completion ENVS ${dist_ENVS}) py_test_modules(test_converter MODULES test_converter ENVS ${dist_ENVS}) set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) + py_test_modules(test_high_order_grad MODULES test_high_order_grad ENVS ${dist_ENVS}) + set_tests_properties(test_high_order_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) py_test_modules(test_tunable_variable MODULES test_tunable_variable ENVS ${dist_ENVS}) py_test_modules(test_tunable_space MODULES test_tunable_space ENVS ${dist_ENVS}) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py index d7321066ed9d9..b039bb76dcb03 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py @@ -127,9 +127,16 @@ def train(): engine.prepare(optimizer, loss) engine.fit(dataset, batch_size=batch_size, - steps_per_epoch=batch_num * batch_size) - engine.save('./mlp') - engine.load('./mlp') + steps_per_epoch=batch_num * batch_size, + sample_generator=True) + + eval_dataset = MyDataset(batch_size) + engine.prepare(optimizer, loss, mode='eval') + engine.evaluate(eval_dataset, batch_size) + + test_dataset = MyDataset(batch_size) + engine.prepare(mode='predict') + engine.predict(test_dataset, batch_size) engine.save('./mlp_inf', training=False, mode='predict') diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/engine_predict_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/engine_predict_api.py deleted file mode 100644 index 5f7c018ee4f16..0000000000000 --- a/python/paddle/fluid/tests/unittests/auto_parallel/engine_predict_api.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -import time -import paddle.fluid as fluid -import copy -import os -import numpy as np -import subprocess -import paddle -import paddle.nn as nn -import paddle.fluid as fluid -import paddle.static as static -import paddle.nn.functional as F -import paddle.utils as utils -from paddle.fluid import layers -from paddle.io import Dataset, IterableDataset, DataLoader -from paddle.static import InputSpec -from paddle.distributed import fleet -import paddle.distributed.auto_parallel as auto -from paddle.distributed.auto_parallel.engine import Engine - -paddle.enable_static() -global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) -batch_size = 1 -batch_num = 10 -hidden_size = 1024 -image_size = hidden_size - -paddle.seed(44) - - -class MyDataset(Dataset): - def __init__(self, num_samples): - super(MyDataset, self).__init__() - self.num_samples = num_samples - - def __getitem__(self, index): - input = np.random.uniform(size=image_size).astype("float32") - return input - - def __len__(self): - return self.num_samples - - -class MLPLayer(nn.Layer): - def __init__(self, - hidden_size=1024, - intermediate_size=4 * 1024, - dropout_ratio=0.1, - initializer_range=0.02): - super(MLPLayer, self).__init__() - d_model = hidden_size - dim_feedforward = intermediate_size - weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( - mean=0.0, std=initializer_range)) - bias_attr = None - - self.linear0 = nn.Linear( - d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) - self.linear1 = nn.Linear( - dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) - self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr) - self.norm = nn.LayerNorm(d_model, epsilon=1e-5) - self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") - - def forward(self, input): - out = self.norm(input) - out = self.linear0(input) - auto.shard_tensor( - self.linear0.weight, - dist_attr={ - "process_mesh": global_process_mesh, - "dims_mapping": [-1, 0] - }) - out = F.gelu(out, approximate=True) - out = self.linear1(out) - auto.shard_tensor( - self.linear1.weight, - dist_attr={ - "process_mesh": global_process_mesh, - "dims_mapping": [0, -1] - }) - out = self.dropout(out) - out = self.linear2(out) - return out - - -def train(): - mlp = MLPLayer( - hidden_size=hidden_size, - intermediate_size=4 * hidden_size, - dropout_ratio=0.1, - initializer_range=0.02) - - dataset = MyDataset(batch_num * batch_size) - inputs_spec = InputSpec([batch_size, hidden_size], 'float32', 'x') - - dist_strategy = fleet.DistributedStrategy() - # init parallel optimizer - dist_strategy.semi_auto = True - fleet.init(is_collective=True, strategy=dist_strategy) - - engine = Engine(mlp, inputs_spec=inputs_spec, strategy=dist_strategy) - engine.prepare(mode='predict') - engine.predict(dataset, batch_size=batch_size) - - -if __name__ == "__main__": - train() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py b/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py new file mode 100644 index 0000000000000..9a9efe7ab2dd0 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py @@ -0,0 +1,157 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import paddle +import unittest +import numpy as np +import paddle.distributed.auto_parallel as auto + +from paddle.static import InputSpec +from paddle.distributed import fleet +from paddle.incubate.autograd import Hessian +from paddle.distributed.auto_parallel.engine import Engine + + +class FCNet: + def __init__(self, num_ins, num_outs, num_layers, hidden_size): + self.num_ins = num_ins + self.num_outs = num_outs + self.num_layers = num_layers + self.hidden_size = hidden_size + self.activation = paddle.tanh + + self.weights = [] + self.biases = [] + for i in range(self.num_layers): + if i == 0: + lsize = self.num_ins + rsize = self.hidden_size + elif i == (self.num_layers - 1): + lsize = self.hidden_size + rsize = self.num_outs + else: + lsize = self.hidden_size + rsize = self.hidden_size + + w = paddle.static.create_parameter( + shape=[lsize, rsize], dtype="float32", is_bias=False) + b = paddle.static.create_parameter( + shape=[rsize], dtype="float32", is_bias=True) + self.weights.append(w) + self.biases.append(b) + + def nn_func(self, ins): + u = ins + for i in range(self.num_layers - 1): + u = paddle.nn.functional.linear(u, self.weights[i], self.biases[i]) + u = self.activation(u) + u = paddle.nn.functional.linear(u, self.weights[-1], self.biases[-1]) + return u + + +class LaplaceModel(paddle.nn.Layer): + def __init__(self, num_ins=2, num_outs=1, num_layers=5, hidden_size=20): + super(LaplaceModel, self).__init__() + self.net = FCNet( + num_ins=num_ins, + num_outs=num_outs, + num_layers=num_layers, + hidden_size=hidden_size) + + def forward(self, inputs, bc_index): + inputs.stop_gradient = False + outputs = self.net.nn_func(inputs) + # eq_loss + hes = Hessian(self.net.nn_func, inputs, is_batched=True) + eq_loss = paddle.norm(hes[:, 0, 0] + hes[:, 1, 1], p=2) + # bc_loss + bc_u = paddle.index_select(outputs, bc_index) + return eq_loss, bc_u + + +class LaplaceDataset: + def __init__(self, num_sample): + self.num_sample = num_sample + + def __getitem__(self, index): + x = np.linspace(0, 0.9, 10) + y = np.linspace(0, 0.9, 10) + bc_value = np.random.rand(36).reshape(36, 1).astype('float32') + + domain_space = [] + bc_index = [] + for j in range(len(y)): + for i in range(len(x)): + domain_space.append([x[i], y[j]]) + if i == 0 or i == 9 or j == 0 or j == 9: + bc_index.append(i + 10 * j) + domain_space = np.array(domain_space, dtype='float32') + bc_index = np.array(bc_index, dtype='int64') + + return domain_space, bc_index, bc_value + + def __len__(self): + return self.num_sample + + +def loss_func(eq_loss, bc_u, bc_value): + bc_diff = bc_u - bc_value + bc_loss = paddle.norm(bc_diff, p=2) + loss = eq_loss + bc_loss + return loss + + +def main(): + # dataset + train_dataset = LaplaceDataset(10) + # optimizer + optimizer = paddle.optimizer.Adam(learning_rate=0.001) + # model + 
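+    # LaplaceModel minimizes the PDE residual plus a boundary-fit term,
+    #     loss = ||u_xx + u_yy||_2 + ||u(bc_index) - bc_value||_2,
+    # where the second derivatives come from the Hessian entries
+    # hes[:, 0, 0] and hes[:, 1, 1] computed in LaplaceModel.forward.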
laplace = LaplaceModel() + + # spec + inputs_spec = [ + InputSpec([100, 2], 'float32', 'x'), InputSpec([36], 'int64', 'bc_idx') + ] + labels_spec = InputSpec([36, 1], 'float32', 'bc_v') + + dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True + fleet.init(is_collective=True, strategy=dist_strategy) + + engine = Engine( + laplace, + inputs_spec=inputs_spec, + labels_spec=labels_spec, + strategy=dist_strategy) + paddle.seed(1234 + engine._cur_rank) + engine.prepare(optimizer=optimizer, loss=loss_func) + res = engine.fit(train_dataset, sample_generator=False) + assert np.allclose(res[-1], 2.840593) + + dist_context = engine.dist_context + block = engine.main_program.global_block() + ops = block.ops + for op in ops: + if op.type == 'p_norm': + op_dist_attr = dist_context.get_op_dist_attr_for_program(op) + assert op_dist_attr.impl_type == 'p_norm' + if 'x' in op.input_arg_names: + out_name = op.output_arg_names[0] + assert block.vars[out_name].shape[0] == 50 + + +if __name__ == "__main__": + main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py index 5ca12bc1e0e17..efcad7eb11268 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py @@ -49,28 +49,6 @@ def test_engine_api(self): if os.path.exists('rank_mapping.csv'): os.remove('rank_mapping.csv') - def test_engine_predict(self): - file_dir = os.path.dirname(os.path.abspath(__file__)) - launch_model_path = os.path.join(file_dir, "engine_predict_api.py") - - if os.environ.get("WITH_COVERAGE", "OFF") == "ON": - coverage_args = ["-m", "coverage", "run", "--branch", "-p"] - else: - coverage_args = [] - - cmd = [sys.executable, "-u"] + coverage_args + [ - "-m", "launch", "--gpus", "0,1", launch_model_path - ] - - process = subprocess.Popen(cmd) - process.wait() - self.assertEqual(process.returncode, 0) - - # Remove unnecessary files - log_path = os.path.join(file_dir, "log") - if os.path.exists(log_path): - shutil.rmtree(log_path) - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_high_order_grad.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_high_order_grad.py new file mode 100644 index 0000000000000..ab4a34cf99cbf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_high_order_grad.py @@ -0,0 +1,48 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import os +import sys +import shutil +import subprocess +from paddle.distributed.fleet.launch_utils import run_with_coverage + + +class TestHighOrderGrad(unittest.TestCase): + def test_dp2(self): + file_dir = os.path.dirname(os.path.abspath(__file__)) + launch_model_path = os.path.join(file_dir, "high_order_grad.py") + + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] + + cmd = [sys.executable, "-u"] + coverage_args + [ + "-m", "launch", "--gpus", "0,1", launch_model_path + ] + + process = subprocess.Popen(cmd) + process.wait() + self.assertEqual(process.returncode, 0) + + # Remove unnecessary files + log_path = os.path.join(file_dir, "log") + if os.path.exists(log_path): + shutil.rmtree(log_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py index 00d2a1f71d6bd..0af7d40a2f02e 100644 --- a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py @@ -149,6 +149,7 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs): kwargs['exclude_from_weight_decay_fn'] = exclude_fn kwargs['lamb_weight_decay'] = 0.1 + gm_steps = kwargs['gradient_accumulation_steps'] if use_distributed_lamb: optimizer_class = DistributedFusedLamb kwargs = dict(kwargs) @@ -163,6 +164,7 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs): ) kwargs['grad_clip'] = GradClipDecorator(base_clip, clip_after_allreduce) + kwargs.pop('gradient_accumulation_steps', None) optimizer = optimizer_class(**kwargs) get_parameter = optimizer._get_parameter @@ -173,6 +175,7 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs): if use_fp16: if not use_distributed_lamb: optimizer._multi_precision = True + optimizer = paddle.static.amp.decorate( optimizer, amp_list, @@ -180,6 +183,13 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs): use_dynamic_loss_scaling=False, use_pure_fp16=use_fp16, use_fp16_guard=use_fp16) + amp_init = optimizer.amp_init + else: + amp_init = None + + if gm_steps > 1 and not use_distributed_lamb: + optimizer = paddle.fluid.optimizer.GradientMergeOptimizer( + optimizer, k_steps=gm_steps, avg=False) params_grads = optimizer.backward(loss, startup) op_num = len(main.global_block().ops) @@ -211,7 +221,7 @@ def gen_random_grad_tensor(grad): return grad_t def reader(): - for _ in range(5): + for _ in range(6): yield dict( [(grad.name, gen_random_grad_tensor(grad)) for grad in grads]) @@ -223,8 +233,8 @@ def reader(): place = paddle.CUDAPlace(dev_id) exe = paddle.static.Executor(place) exe.run(startup) - if use_fp16: - optimizer.amp_init(place) + if amp_init is not None: + amp_init(place) master_p_ts = [] for p in params: @@ -258,10 +268,12 @@ def config(self): distutils.util.strtobool( os.getenv('CLIP_AFTER_ALLREDUCE', 'True'))) max_global_norm = float(os.getenv('MAX_GLOBAL_NORM', -1.0)) + gm_steps = int(os.getenv('GRADIENT_MERGE_STEPS', 1)) print('clip_after_allreduce = {}, max_global_norm = {}'.format( clip_after_allreduce, max_global_norm)) return { 'clip_after_allreduce': clip_after_allreduce, + 'gradient_accumulation_steps': gm_steps, 'grad_clip': paddle.nn.ClipGradByGlobalNorm(max_global_norm) if max_global_norm > 0 else None, } diff --git 
a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_layer_hook.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_layer_hook.py new file mode 100644 index 0000000000000..dcb41cfc6aba7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_layer_hook.py @@ -0,0 +1,90 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle + +import numpy as np + + +def forward_post_hook1(layer, input, output): + return output + output + + +def forward_pre_hook1(layer, input): + input_return = (input[0] * 2, ) + return input_return + + +class SimpleNet(paddle.nn.Layer): + def __init__(self, ): + super(SimpleNet, self).__init__() + self.fc1 = paddle.nn.Linear(10, 10) + # sublayer1 register post hook + self.fc1.register_forward_post_hook(forward_post_hook1) + + self.fc2 = paddle.nn.Linear(10, 10) + # sublayer2 register pre hook + self.fc2.register_forward_pre_hook(forward_pre_hook1) + + # register pre/post hook + self.register_forward_pre_hook(forward_pre_hook1) + self.register_forward_post_hook(forward_post_hook1) + + def forward(self, x): + x = self.fc1(x) + x = self.fc2(x) + out = paddle.mean(x) + + return out + + +class TestNestLayerHook(unittest.TestCase): + def setUp(self): + paddle.seed(2022) + self.x = paddle.randn([4, 10]) + self.path = "./net_hook" + + def train_net(self, to_static=False): + paddle.seed(2022) + net = SimpleNet() + if to_static: + net = paddle.jit.to_static(net) + out = net(self.x) + + if to_static: + paddle.jit.save(net, self.path) + + return out.numpy()[0] + + def load_train(self): + net = paddle.jit.load(self.path) + out = net(self.x) + return out.numpy()[0] + + def test_hook(self): + dy_out = self.train_net(to_static=False) + st_out = self.train_net(to_static=True) + load_out = self.load_train() + print(st_out, dy_out, load_out) + self.assertTrue( + np.allclose(st_out, dy_out), + msg='dygraph_res is {}\nstatic_res is {}'.format(dy_out, st_out)) + self.assertTrue( + np.allclose(st_out, load_out), + msg='load_out is {}\nstatic_res is {}'.format(load_out, st_out)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py index ab836b088b09f..872d419ff8928 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py @@ -20,9 +20,6 @@ from simnet_dygraph_model_v2 import BOW, HingeLoss -from paddle.fluid.framework import _enable_legacy_dygraph -_enable_legacy_dygraph() - SEED = 102 random.seed(SEED) diff --git a/python/paddle/fluid/tests/unittests/gradient_checker.py b/python/paddle/fluid/tests/unittests/gradient_checker.py index 569d994b831b6..defbffe8f2020 100644 --- a/python/paddle/fluid/tests/unittests/gradient_checker.py +++ b/python/paddle/fluid/tests/unittests/gradient_checker.py @@ -27,6 +27,10 @@ 
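# A standalone illustration (not part of this patch) of the hook semantics
# exercised by test_layer_hook.py above: for a layer f with a registered
# pre-hook "pre" and post-hook "post", calling the layer computes
# post(layer, x', f(x')) with x' = pre(layer, x).
import paddle


def scale_input(layer, inputs):
    # Pre-hook: rewrite the forward arguments (must return a tuple).
    return (inputs[0] * 2, )


def scale_output(layer, inputs, output):
    # Post-hook: rewrite the forward result.
    return output * 2


linear = paddle.nn.Linear(4, 4)
linear.register_forward_pre_hook(scale_input)
linear.register_forward_post_hook(scale_output)

x = paddle.ones([1, 4])
y = linear(x)  # equivalent to 2 * linear(2 * x)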
from paddle.fluid.executor import Executor
from paddle.fluid.backward import _append_grad_suffix_, _as_list
from paddle.fluid.framework import _test_eager_guard
+try:
+    from collections.abc import Sequence
+except ImportError:
+    from collections import Sequence


def _product(t):
@@ -91,7 +95,7 @@ def var_to_np_array_in_scope(scope, place, name):
def make_jacobian(x, y_size, np_dtype):
    if isinstance(x, fluid.framework.Variable):
        return np.zeros((_product(x.shape), y_size), dtype=np_dtype)
-    elif isinstance(x, collections.Sequence):
+    elif isinstance(x, Sequence):
        jacobians = list(
            filter(lambda t: t is not None, (make_jacobian(
                item, y_size, np_dtype) for item in x)))
@@ -556,7 +560,10 @@ def get_static_double_grad(x,
    # so, they are also the input of second-order backward.
    x += y_grads
    x_init += dy_init
-    y = dx
+
+    # filter out the None elements of dx, since dX/dY may be None in the kernel
+    filtered_dx = [dxi for dxi in dx if dxi is not None]
+    y = filtered_dx

    # check input arguments
    x = _as_list(x)
@@ -615,6 +622,7 @@ def get_static_double_grad(x,
def get_eager_double_grad(func,
                          x_init=None,
                          dy_init=None,
+                         place=None,
                          return_mid_result=False):
    """
    Get the double grad result of dygraph.
        func: a wrapped dygraph function whose logic is equivalent to the static program
        x_init (numpy.array|list[numpy.array]|None): the init value for input x.
        dy_init (numpy.array|list[numpy.array]|None): the init value for gradient of output.
+        place (fluid.CPUPlace or fluid.CUDAPlace): the device.
        return_mid_result (bool): A flag that controls the return content.
    Returns:
        If 'return_mid_result' is set True, the intermediate tensors are returned as well.
        If 'return_mid_result' is set False, a list of numpy arrays that store the
        second derivative result calculated by dygraph.
    """
+    if isinstance(place, fluid.CPUPlace):
+        paddle.set_device("cpu")
+    if isinstance(place, fluid.CUDAPlace):
+        paddle.set_device("gpu")
    inputs = []
    dys = []
    for x in x_init:
@@ -644,7 +657,12 @@ def get_eager_double_grad(func,
    # calculate first derivative
    outputs = func(inputs)
    d_inputs = paddle.grad(
-        outputs=outputs, inputs=inputs, grad_outputs=dys, create_graph=True)
+        outputs=outputs,
+        inputs=inputs,
+        grad_outputs=dys,
+        create_graph=True,
+        allow_unused=True)
+    d_inputs = [d_input for d_input in d_inputs if d_input is not None]

    # calculate second derivative
    inputs = inputs + dys
@@ -659,15 +677,20 @@
            ddy = paddle.ones(shape=d_input.shape, dtype=d_input.dtype)
            ddy.stop_gradient = False
            ddys.append(ddy)
+
    dd_inputs = paddle.grad(
        outputs=d_inputs,
        inputs=inputs,
        grad_outputs=ddys,
-        create_graph=create_graph)
+        create_graph=create_graph,
+        allow_unused=True)
+
    if return_mid_result:
        return dd_inputs, inputs + ddys
    else:
-        return [dd_input.numpy() for dd_input in dd_inputs]
+        return [
+            dd_input.numpy() for dd_input in dd_inputs if dd_input is not None
+        ]
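# A minimal standalone sketch (not from this patch) of the double-grad
# pattern that get_eager_double_grad above relies on: differentiate once with
# create_graph=True so the backward graph itself can be differentiated again,
# and pass allow_unused=True so inputs that do not affect the output yield
# None instead of raising.
import paddle

x = paddle.randn([3])
x.stop_gradient = False
y = paddle.sin(x)
dy = paddle.ones_like(y)
# First derivative: dx == cos(x) * dy.
(dx, ) = paddle.grad(
    outputs=y, inputs=x, grad_outputs=dy, create_graph=True, allow_unused=True)
# Second derivative: d(cos(x))/dx == -sin(x).
(ddx, ) = paddle.grad(outputs=dx, inputs=x, grad_outputs=paddle.ones_like(dx))
print(ddx)  # approximately equal to -sin(x)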
def double_grad_check_for_dygraph(func,
@@ -689,7 +712,6 @@
        y (Variable|list[Variable]): output variables to the program.
        x_init (numpy.array|list[numpy.array]|None): the init value for input x.
        place (fluid.CPUPlace or fluid.CUDAPlace): the device.
-        eps (float): perturbation for finite differences.
        atol (float): absolute tolerance.
        rtol (float): relative tolerance.
        raise_exception (bool): whether to raise an exception if
@@ -718,19 +740,25 @@ def fail_test(msg):
    paddle.disable_static()
    with _test_eager_guard():
-        eager_double_grad = get_eager_double_grad(func, x_init, y_grads_init)
+        eager_double_grad = get_eager_double_grad(func, x_init, y_grads_init,
+                                                  place)
    paddle.enable_static()

    static_double_grad = get_static_double_grad(x, y, x_init, y_grads_init,
                                                place)

+    if len(static_double_grad) != len(eager_double_grad):
+        msg = "The number of output grad tensors of the static graph differs " \
+            "from that of the dygraph mode; please check the Python API unit test used."
+        raise RuntimeError(msg)
+
    for i in six.moves.xrange(len(static_double_grad)):
        if not np.allclose(static_double_grad[i], eager_double_grad[i], rtol,
                           atol):
-            msg = 'Check eager double result fail. Mismatch between static_graph double grad %s ' \
-                'and eager double grad %s on %s,\n' \
+            msg = 'Eager double grad check failed. Mismatch between static graph double grad ' \
+                'and eager double grad on %s; the index of the failing output double grad tensor is %d.\n' \
                'static:%s\n eager:%s\n' \
-                % (static_double_grad[i].name, eager_double_grad[i].name, str(place), static_double_grad[i], eager_double_grad[i])
+                % (str(place), i, static_double_grad[i], eager_double_grad[i])
            return fail_test(msg)
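# get_eager_triple_grad below reuses the double-grad machinery (with
# return_mid_result=True) and then differentiates once more. A minimal
# standalone sketch of the same idea (not from this patch):
import paddle

x = paddle.randn([3])
x.stop_gradient = False
y = paddle.sin(x)
(dx, ) = paddle.grad(y, x, create_graph=True)    # cos(x)
(ddx, ) = paddle.grad(dx, x, create_graph=True)  # -sin(x)
(dddx, ) = paddle.grad(ddx, x)                   # -cos(x)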
@@ -790,6 +818,7 @@ def get_static_triple_grad(x,
def get_eager_triple_grad(func,
                          x_init=None,
                          dy_init=None,
+                         place=None,
                          return_mid_result=False):
    """
    Get the triple grad result of dygraph.
        func: a wrapped dygraph function whose logic is equivalent to the static program
        x_init (numpy.array|list[numpy.array]|None): the init value for input x.
        dy_init (numpy.array|list[numpy.array]|None): the init value for gradient of output.
+        place (fluid.CPUPlace or fluid.CUDAPlace): the device.
        return_mid_result (bool): If set True, the intermediate tensors are returned as well.
    Returns:
        A list of numpy arrays that store the third derivative result calculated by dygraph.
    """
    dd_y, dd_x = get_eager_double_grad(
-        func, x_init, dy_init, return_mid_result=True)
+        func, x_init, dy_init, place, return_mid_result=True)

    # calculate third derivative
    dddys = []
@@ -835,7 +865,6 @@ def triple_grad_check_for_dygraph(func,
        y (Variable|list[Variable]): output variables to the program.
        x_init (numpy.array|list[numpy.array]|None): the init value for input x.
        place (fluid.CPUPlace or fluid.CUDAPlace): the device.
-        eps (float): perturbation for finite differences.
        atol (float): absolute tolerance.
        rtol (float): relative tolerance.
        raise_exception (bool): whether to raise an exception if
@@ -864,17 +893,23 @@ def fail_test(msg):
    paddle.disable_static()
    with _test_eager_guard():
-        eager_triple_grad = get_eager_triple_grad(func, x_init, y_grads_init)
+        eager_triple_grad = get_eager_triple_grad(func, x_init, y_grads_init,
+                                                  place)
    paddle.enable_static()

    static_triple_grad = get_static_triple_grad(x, y, x_init, y_grads_init,
                                                place)

+    if len(static_triple_grad) != len(eager_triple_grad):
+        msg = "The number of output grad tensors of the static graph differs " \
+            "from that of the dygraph mode; please check the Python API unit test used."
+        raise RuntimeError(msg)
+
    for i in six.moves.xrange(len(static_triple_grad)):
        if not np.allclose(static_triple_grad[i], eager_triple_grad[i], rtol,
                           atol):
-            msg = 'Check eager double result fail. Mismatch between static_graph double grad %s ' \
-                'and eager double grad %s on %s,\n' \
+            msg = 'Eager triple grad check failed. Mismatch between static graph triple grad ' \
+                'and eager triple grad on %s; the index of the failing output triple grad tensor is %d.\n' \
                'static:%s\n eager:%s\n' \
-                % (static_triple_grad[i].name, eager_triple_grad[i].name, str(place), static_triple_grad[i], eager_triple_grad[i])
+                % (str(place), i, static_triple_grad[i], eager_triple_grad[i])
            return fail_test(msg)
diff --git a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt
index 79a2430a16170..4826b37512614 100644
--- a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt
@@ -11,4 +11,5 @@ if(WITH_IPU)
    set_tests_properties(test_conv_op_ipu PROPERTIES TIMEOUT 300)
    set_tests_properties(test_elemetwise_x_op_ipu PROPERTIES TIMEOUT 300)
    set_tests_properties(test_reduce_x_op_ipu PROPERTIES TIMEOUT 600)
+   set_tests_properties(test_save_load_ipu PROPERTIES TIMEOUT 600)
endif()
diff --git a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py
index 26fd42be6cd27..2583d9409a0a7 100644
--- a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py
@@ -15,9 +15,10 @@
import os
import random
import unittest
-import numpy as np
from enum import IntEnum
+from typing import Dict, List, Optional
+import numpy as np
import paddle
import paddle.static

@@ -33,31 +34,27 @@
}

+def np_dtype_to_fluid_str(dtype: np.dtype) -> str:
+    return map_np_dtype_to_fluid_dtype[dtype.name]
+
+
class ExecutionModeFull(IntEnum):
    # Run fp32 model on cpu
    CPU_FP32 = 1
    # Run fp32 model on ipu
    IPU_FP32 = 2
-    # Convert model to fp16 using popart transform
+    # Convert the model to fp16 using the mixed-precision approach
    # All parameters will be converted to fp16
-    # TODO rename to IPU_FP16
-    IPU_POPART_FP16 = 3
-    # Mix-precision mode, using `paddle.static.amp.fp16_guard()` to control the
-    # precision of each operator
-    IPU_MIXED_PRECISION = 4
+    IPU_FP16 = 3


class ExecutionMode(IntEnum):
    CPU_FP32 = ExecutionModeFull.CPU_FP32
    IPU_FP32 = ExecutionModeFull.IPU_FP32
-    IPU_POPART_FP16 = ExecutionModeFull.IPU_POPART_FP16
-
+    IPU_FP16 = ExecutionModeFull.IPU_FP16

-def np_dtype_to_fluid_str(dtype: np.dtype) -> str:
-    return map_np_dtype_to_fluid_dtype[dtype.name]
-

-class IPUOpTest(unittest.TestCase):
+class IPUTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Get random seeds
@@ -67,12 +64,7 @@
        cls.SEED = 2021
        np.random.seed(cls.SEED)
        random.seed(cls.SEED)
-
-        # For ipu, most ops support fp16
-        cls.amp_list = paddle.static.amp.CustomOpLists(
-            custom_black_list=[], custom_white_list=[])
-        cls.amp_list.unsupported_list = {}
-        cls.amp_list.black_list = {}
+        paddle.seed(cls.SEED)

        # Enable paddle static graph mode
        paddle.enable_static()
@@ -83,6 +75,7 @@
    def tearDownClass(cls):
        np.random.set_state(cls._np_rand_state)
        random.setstate(cls._py_rand_state)

+    # Check whether the IPUModel mode is enabled
    @classmethod
    def use_ipumodel(cls):
        if 'POPLAR_IPUMODEL' not in os.environ:
@@ -92,6 +85,69 @@
        if flag.upper() in ['1', "TRUE"]:
            return True

+    # Decorator for static graph building
+    def static_graph(builder):
+        def wrapper(self, *args, **kwargs):
+            self.scope = paddle.static.Scope()
+            self.main_prog = paddle.static.Program()
+            self.startup_prog = paddle.static.Program()
+            self.main_prog.random_seed = self.SEED
+            self.startup_prog.random_seed = self.SEED
+            with paddle.static.scope_guard(self.scope):
+                with paddle.utils.unique_name.guard(
+                        paddle.utils.unique_name.generate('')):
+                    with paddle.static.program_guard(self.main_prog,
+                                                     self.startup_prog):
+                        builder(self, *args, **kwargs)
+
+        return wrapper
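# A usage sketch (hypothetical subclass, not part of this patch) for the
# static_graph decorator above: the decorated builder runs inside fresh
# Scope/Program guards, so afterwards self.scope, self.main_prog and
# self.startup_prog hold the newly built graph, ready for the run_op_test
# helper defined further below. It assumes the imports and IPUTest class
# defined in this module.
class ExampleStaticGraphTest(IPUTest):
    @IPUTest.static_graph
    def build_model(self):
        # Runs inside the scope/program guards installed by the decorator.
        x = paddle.static.data(name='x', shape=[1, 3], dtype='float32')
        out = paddle.fluid.layers.relu(x)
        self.fetch_list = [out.name]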
+    # Cast an fp32 model to a full-fp16 model
+    @classmethod
+    def cast_model_to_fp16(cls, main_program):
+        amp_list = paddle.static.amp.CustomOpLists()
+        amp_list.unsupported_list = {}
+        to_fp16_var_names = paddle.static.amp.cast_model_to_fp16(
+            main_program, amp_list, use_fp16_guard=False)
+        paddle.static.amp.cast_parameters_to_fp16(
+            paddle.CPUPlace(),
+            main_program,
+            to_fp16_var_names=to_fp16_var_names)
+
+
+class IPUOpTest(IPUTest):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+
+        # Items that an op tester needs
+        cls.main_prog: paddle.static.Program = None
+        cls.startup_prog: paddle.static.Program = None
+        cls.scope: paddle.static.Scope = None
+        cls.feed_list: List[str] = None
+        cls.fetch_list: List[str] = None
+        cls.output_dict: Optional[Dict] = {}
+
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def skip_mode(self, exec_mode):
+        if exec_mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
+            return True
+        else:
+            return False
+
+    def is_ipu_mode(self, exec_mode):
+        if exec_mode == ExecutionMode.CPU_FP32:
+            return False
+        return True
+
+    def is_fp16_mode(self, exec_mode):
+        if exec_mode != ExecutionMode.IPU_FP16:
+            return False
+        return True

    def set_atol(self):
        self.atol = 1e-10
        self.rtol = 1e-6
@@ -102,55 +158,90 @@
    def set_training(self):
        self.is_training = False
        self.epoch = 1

-    def check(self, outputs, check_shape=False):
-        cpu_fp32 = outputs[ExecutionMode.CPU_FP32]
-        ipu_fp32 = outputs[ExecutionMode.IPU_FP32]
-        max_diff = np.abs(cpu_fp32 - ipu_fp32).max()
-        fp32_flag = np.allclose(
-            cpu_fp32, ipu_fp32, rtol=self.rtol, atol=self.atol)
-        self.assertTrue(fp32_flag, "max diff is %f" % (max_diff))
+    def run_op_test(self, exec_mode, ipu_strategy=None):
+        # NOTE: some ops have no inputs
+        # if len(self.feed_list) == 0 or len(self.fetch_list) == 0:
+        #     raise ValueError('feed_list or fetch_list is empty')
+        if self.is_ipu_mode(exec_mode):
+            place = paddle.IPUPlace()
+        else:
+            place = paddle.CPUPlace()
+        exe = paddle.static.Executor(place)
+        exe.run(self.startup_prog)

+        if self.is_ipu_mode(exec_mode):
+            if ipu_strategy is None:
+                ipu_strategy = paddle.static.IpuStrategy()
+                ipu_strategy.set_graph_config(is_training=self.is_training)
+            if self.is_fp16_mode(exec_mode):
+                ipu_strategy.set_precision_config(enable_fp16=True)
+                IPUOpTest.cast_model_to_fp16(self.main_prog)
+            program = paddle.static.IpuCompiledProgram(
+                self.main_prog, ipu_strategy=ipu_strategy).compile(
+                    self.feed_list, self.fetch_list)
+        else:
+            program = self.main_prog
+
+        feed = self.feed_fp32
+        if self.is_fp16_mode(exec_mode):
+            feed = self.feed_fp16
+
+        if self.is_training:
+            result = []
+            for _ in range(self.epoch):
+                loss_res = exe.run(program,
+                                   feed=feed,
+                                   fetch_list=self.fetch_list)
+                result.append(loss_res)
+        else:
+            result = exe.run(program, feed=feed, fetch_list=self.fetch_list)
+
+        if isinstance(result, list) and len(result) == 1:
+            self.output_dict[exec_mode] = result[0]
+        else:
+            self.output_dict[exec_mode] = result
+
+    def check(self, check_shape=False, output_dict=None):
+        if output_dict is None:
+            output_dict = self.output_dict
+        if len(output_dict) == 0:
+            raise ValueError("output_dict is empty")
+        cpu_fp32 = output_dict[ExecutionMode.CPU_FP32]
+        ipu_fp32 = output_dict[ExecutionMode.IPU_FP32]
+        cpu_fp32 = np.asarray(cpu_fp32).astype(np.float32).flatten()
+        ipu_fp32 =
np.asarray(ipu_fp32).astype(np.float32).flatten() + pass_check = np.allclose( + ipu_fp32, cpu_fp32, rtol=self.rtol, atol=self.atol) + if not pass_check: + max_atol = np.abs(ipu_fp32 - cpu_fp32).max() + cpu_fp32_abs = np.abs(cpu_fp32) + cpu_fp32_abs[cpu_fp32_abs == 0.0] = 1e-20 + max_rtol = (np.abs(ipu_fp32 - cpu_fp32) / cpu_fp32_abs).max() + raise AssertionError( + f"ipu_fp32 check failed. max_atol is {max_atol}, max_rtol is {max_rtol}" + ) if check_shape: self.assertTrue(cpu_fp32.shape == ipu_fp32.shape) - ipu_popart_fp16 = None - if ExecutionMode.IPU_POPART_FP16 in outputs.keys(): - ipu_popart_fp16 = outputs[ExecutionMode.IPU_POPART_FP16] - max_diff = np.abs(ipu_popart_fp16.astype(np.float32) - - cpu_fp32).max() - fp16_flag = np.allclose( - ipu_popart_fp16.astype(np.float32), - cpu_fp32, - rtol=self.rtol_fp16, - atol=self.atol_fp16) - self.assertTrue(fp16_flag, "max diff is %f" % (max_diff)) + if ExecutionMode.IPU_FP16 in output_dict.keys(): + ipu_fp16 = output_dict[ExecutionMode.IPU_FP16] + ipu_fp16 = np.asarray(ipu_fp16).astype(np.float32).flatten() + pass_check = np.allclose( + ipu_fp16, cpu_fp32, rtol=self.rtol_fp16, atol=self.atol_fp16) + if not pass_check: + max_atol = np.abs(ipu_fp16 - cpu_fp32).max() + cpu_fp32_abs = np.abs(cpu_fp32) + cpu_fp32_abs[cpu_fp32_abs == 0.0] = 1e-20 + max_rtol = (np.abs(ipu_fp16 - cpu_fp32) / cpu_fp32_abs).max() + raise AssertionError( + f"ipu_fp16 check failed. max_atol is {max_atol}, max_rtol is {max_rtol}" + ) if check_shape: - self.assertTrue(ipu_popart_fp16.shape == cpu_fp32.shape) - - ipu_mixed_precision = None - if ExecutionModeFull.IPU_MIXED_PRECISION in outputs.keys(): - ipu_mixed_precision = outputs[ - ExecutionModeFull.IPU_MIXED_PRECISION] - max_diff = np.abs( - ipu_mixed_precision.astype(np.float32) - cpu_fp32).max() - fp16_flag = np.allclose( - ipu_mixed_precision.astype(np.float32), - cpu_fp32, - rtol=self.rtol_fp16, - atol=self.atol_fp16) - self.assertTrue(fp16_flag, "max diff is %f" % (max_diff)) - - if check_shape: - self.assertTrue(ipu_mixed_precision.shape == cpu_fp32.shape) - - if ExecutionMode.IPU_POPART_FP16 in outputs.keys( - ) and ExecutionModeFull.IPU_MIXED_PRECISION in outputs.keys(): - max_diff = np.abs(ipu_popart_fp16 - ipu_mixed_precision).max() - self.assertEqual(ipu_popart_fp16.all(), - ipu_mixed_precision.all(), - "max diff is %f" % (max_diff)) - - if check_shape: - self.assertTrue( - ipu_popart_fp16.shape == ipu_mixed_precision.shape) + self.assertTrue(ipu_fp16.shape == cpu_fp32.shape) + + # Execution Mode + class ExecutionMode(IntEnum): + CPU_FP32 = ExecutionModeFull.CPU_FP32 + IPU_FP32 = ExecutionModeFull.IPU_FP32 + IPU_FP16 = ExecutionModeFull.IPU_FP16 diff --git a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py index 138365b650f24..b90c3374db96e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py @@ -18,8 +18,7 @@ import paddle import paddle.nn.functional as F import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, - IPUOpTest) +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,10 +31,6 @@ def setUp(self): self.set_data_feed() self.set_feed_attr() - @property - def fp16_enabled(self): - return True - def set_test_op(self): self.op = paddle.fluid.layers.relu self.op_attrs = {} @@ -49,60 +44,22 @@ def 
set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed_fp32.values()] self.feed_list = list(self.feed_fp32.keys()) - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = self.op(x, **self.op_attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = self.op(x, **self.op_attrs) + self.fetch_list = [out.name] - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + def run_model(self, exec_mode): + self.run_op_test(exec_mode) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestTanh(TestRelu): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py index d14eba98ef5d7..c48ce75ccd9f3 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py @@ -17,8 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, - IPUOpTest) +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,12 +30,8 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): - data = np.random.uniform(size=[10, 1000]) + data = np.random.uniform(size=[10, 500]).astype(np.float16) self.feed_fp32 = {"in_0": data.astype(np.float32)} self.feed_fp16 = {"in_0": data.astype(np.float16)} @@ -48,64 +43,24 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"axis": -1} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = 
paddle.fluid.layers.argmax(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0].astype(np.int32) - - def test_base(self): - output_dict_fp32 = {} - output_dict_fp16 = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - - if mode > ExecutionMode.IPU_FP32: - output_dict_fp16[mode] = self._test_base(mode).flatten() - else: - output_dict_fp32[mode] = self._test_base(mode).flatten() - - self.check(output_dict_fp32) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.argmax(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + for k, v in self.output_dict.items(): + self.output_dict[k] = v.astype(np.int32) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py index 35f4ca17d5eba..1239a97f2f653 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -29,10 +29,6 @@ def setUp(self): self.set_data_feed() self.set_feed_attr() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[2, 3, 1]) self.feed_fp32 = {'in_0': data.astype(np.float32)} @@ -42,60 +38,23 @@ def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed_fp32.values()] self.feed_list = list(self.feed_fp32.keys()) - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - assign = paddle.assign(x) - out = paddle.fluid.layers.elementwise_add(assign, assign) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = 
self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + x = paddle.assign(x) + out = paddle.fluid.layers.elementwise_add(x, x) + self.fetch_list = [out.name] - def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict) + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestAssignFp32Value(TestBase): @@ -107,51 +66,13 @@ def set_data_feed(self): data = np.random.uniform(size=[2, 3, 1]) self.assign_fp32 = data.astype(np.float32) - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - assign = paddle.assign(self.assign_fp32) - out = paddle.fluid.layers.elementwise_add(x, assign) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + assign = paddle.assign(self.assign_fp32) + out = paddle.fluid.layers.elementwise_add(x, assign) + self.fetch_list = [out.name] class TestAssignBoolValue(TestBase): @@ -162,52 +83,15 @@ def set_data_feed(self): data = np.random.choice([True, False], size=(2, 3, 1)) self.assign_bool = data.astype(np.bool) - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - 
shape=self.feed_shape[0], - dtype='float32') - x = paddle.less_than(x, x) - assign = paddle.assign(self.assign_bool) - out = paddle.logical_and(x, assign) - out = paddle.cast(out, 'float32') - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + x = paddle.less_than(x, x) + assign = paddle.assign(self.assign_bool) + x = paddle.logical_and(x, assign) + out = paddle.cast(x, 'float32') + self.fetch_list = [out.name] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py index f34e5b0d8b9dc..cf494034fd86f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -29,10 +29,6 @@ def setUp(self): self.set_data_feed() self.set_feed_attr() - @property - def fp16_enabled(self): - return True - def set_atol(self): self.atol = 2e-6 self.rtol = 1e-5 @@ -48,67 +44,32 @@ def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed_fp32.values()] self.feed_list = list(self.feed_fp32.keys()) - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - x = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - x = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - x = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - x = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - - fetch_list = [x.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - ipu_strategy.set_options({'need_avg_shard': True}) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - 
ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + self.fetch_list = [x.name] + + def run_model(self, exec_mode): + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + ipu_strategy.set_options({'need_avg_shard': True}) + self.run_op_test(exec_mode, ipu_strategy) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py index c640cd441f1b2..adb2abfc47418 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py @@ -17,8 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, - IPUOpTest) +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_atol(self): self.atol = 1e-6 self.rtol = 1e-5 @@ -56,61 +51,24 @@ def set_op_attrs(self): self.attrs['data_layout'] = 'NCHW' self.attrs['in_place'] = False - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - conv1 = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - out = paddle.fluid.layers.batch_norm(conv1, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, 
fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + x = paddle.fluid.layers.batch_norm(x, **self.attrs) + self.fetch_list = [x.name] - def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict) + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_batchs_per_step_simple_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_batchs_per_step_simple_ipu.py deleted file mode 100644 index ef61e651b2ad9..0000000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_batchs_per_step_simple_ipu.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import numpy as np -import unittest -import paddle -import paddle.static - -paddle.enable_static() -SEED = 2021 - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestFunc(unittest.TestCase): - def _test_func(self, run_ipu=True): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - np.random.seed(SEED) - - bps = 5 - n = 1 if run_ipu else -1 - c, h, w = 3, 10, 10 - np_image = np.random.uniform(size=[1 * bps, c, h, w]).astype(np.float32) - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - image = paddle.static.data( - name='image', shape=[n, c, h, w], dtype='float32') - conv2d = paddle.static.nn.conv2d( - image, num_filters=3, filter_size=3, bias_attr=False) - - out = conv2d - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = [image.name] - fetch_list = [out.name] - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=False) - ipu_strategy.set_pipelining_config(batches_per_step=bps) - program = paddle.static.IpuCompiledProgram( - main_prog, ipu_strategy=ipu_strategy).compile(feed_list, - fetch_list) - else: - program = main_prog - - result = exe.run(program, - feed={image.name: np_image}, - fetch_list=[out]) - return result[0] - - def test_func(self): - ipu_res = self._test_func(True) - cpu_res = self._test_func(False) - - if np.prod(ipu_res.shape) == np.prod(cpu_res.shape): - ipu_res = ipu_res.reshape(cpu_res.shape) - - self.assertTrue(np.allclose(ipu_res, cpu_res, atol=1e-4)) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py index 2de23d95e1c96..4d412f2a79977 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py @@ -30,175 +30,81 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - def set_atol(self): - self.atol = 1e-3 + @property + def fp16_enabled(self): + return False def set_data_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float32'), - } + data = np.random.uniform(size=[1, 3, 3, 3]) + self.feed_fp32 = {'x': data.astype(np.float32)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [x.dtype for x in self.feed.values()] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float16' - def _test_base(self, run_ipu=True): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) - out = paddle.cast(x, **self.attrs) - fetch_list = [out.name] - - if run_ipu: - place = paddle.IPUPlace() - else: - place = 
paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - - self.assertTrue(res0.shape == res1.shape) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) + out = paddle.cast(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestEnableFp16(TestBase): - def set_atol(self): - self.atol = 1e-10 + @property + def fp16_enabled(self): + return True + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) def set_data_feed(self): - self.feed = {"x": np.array([1, 200, 3000, 40000]).astype('int32'), } + data = np.random.uniform(size=[1, 3, 3, 3]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float32' - def _test_base(self, run_ipu=True): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) - out = paddle.cast(x, **self.attrs) - fetch_list = [out.name] - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] - class TestDisableTransferCast(TestEnableFp16): - def set_atol(self): - self.atol = 1e-10 - def set_data_feed(self): - self.feed = {"x": np.array([1, 200, 3000, 40000]).astype('int32'), } + data = np.random.uniform(size=[1, 3, 3, 3]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float32' - def _test_base(self, run_ipu=True): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - 
dtype=self.feed_dtype[0]) - out = paddle.cast(x, **self.attrs) - fetch_list = [out.name] - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - ipu_strategy.set_precision_config(enable_fp16=True) - ipu_strategy.set_options({"transfer_cast_op": False}) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + def run_model(self, exec_mode): + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + ipu_strategy.set_options({"transfer_cast_op": False}) + self.run_op_test(exec_mode) class TestCase2(TestBase): - def set_atol(self): - self.atol = 1e-10 - def set_data_feed(self): - self.feed = { + self.feed_fp32 = { "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'), } @@ -208,11 +114,8 @@ def set_op_attrs(self): class TestCase3(TestBase): - def set_atol(self): - self.atol = 1e-10 - def set_data_feed(self): - self.feed = { + self.feed_fp32 = { "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float32'), } @@ -222,11 +125,8 @@ def set_op_attrs(self): class TestCase4(TestBase): - def set_atol(self): - self.atol = 1e-10 - def set_data_feed(self): - self.feed = { + self.feed_fp32 = { "x": np.random.uniform(size=[1, 3, 3, 3]).astype('int32'), } @@ -236,11 +136,8 @@ def set_op_attrs(self): class TestCase5(TestBase): - def set_atol(self): - self.atol = 1e-10 - def set_data_feed(self): - self.feed = { + self.feed_fp32 = { "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'), } @@ -250,11 +147,8 @@ def set_op_attrs(self): class TestCase6(TestBase): - def set_atol(self): - self.atol = 1e-10 - def set_data_feed(self): - self.feed = { + self.feed_fp32 = { "x": np.random.uniform(size=[1, 3, 3, 3]).astype('int32'), } @@ -273,7 +167,7 @@ def set_op_attrs(self): @unittest.skip('skip float16 to float32') class TestCase3(TestBase): def set_data_feed(self): - self.feed = { + self.feed_fp32 = { "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'), } @@ -285,10 +179,11 @@ def set_op_attrs(self): @unittest.skip('int32 to int8 is not supported') class TestCase4(TestBase): def set_atol(self): + super().set_atol() self.atol = 1 def set_data_feed(self): - self.feed = { + self.feed_fp32 = { "x": np.random.randint( low=1, high=100, size=[1, 3, 3, 3]).astype('int32'), } diff --git a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py index c5a8090283940..a5410ab499082 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py @@ -17,8 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, - IPUOpTest) +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,14 +30,9 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data1 = np.random.uniform(size=[1, 3, 10, 10]) data2 = np.random.uniform(size=[1, 3, 10, 10]) - self.feed_fp32 = { 'x': 
data1.astype(np.float32), 'y': data2.astype(np.float32) @@ -55,63 +49,24 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"axis": 0} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - - out = paddle.fluid.layers.concat([x, y], **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + out = paddle.fluid.layers.concat([x, y], **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py index ade54fda86929..e450621b11d34 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -26,26 +26,19 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_data_feed() - self.set_feed_attr() + self.set_feed() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_atol(self): self.atol = 1e-6 self.rtol = 1e-6 self.atol_fp16 = 1e-3 self.rtol_fp16 = 1e-3 - def set_data_feed(self): + def set_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) self.feed_fp32 = {'in_0': data.astype(np.float32)} self.feed_fp16 = {'in_0': data.astype(np.float16)} - - def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed_fp32.values()] self.feed_list = 
list(self.feed_fp32.keys()) @@ -59,59 +52,22 @@ def set_op_attrs(self): self.attrs['groups'] = 1 self.attrs['data_format'] = 'NCHW' - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - image = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.conv2d(image, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + x = paddle.fluid.layers.conv2d(x, **self.attrs) + self.fetch_list = [x.name] - def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict) + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py index 3a21f0cb0079c..d035673e219df 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): x = np.random.uniform(size=[3, 7]) label = np.arange(3).reshape([3, 1]) @@ -53,81 +49,31 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {'soft_label': False, } - def np_nll_loss(self): - tmp = -np.log(self.feed_fp32['x']) - label = self.feed_fp32['label'] - indice = [range(label.shape[0]), label.flatten()] - self.np_ref = tmp[indice] - - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): 
- x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype="float32") - - if exec_mode != ExecutionMode.CPU_FP32: - label = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='int32') - else: - label = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='int64') - - out = paddle.fluid.layers.cross_entropy( - input=x, label=label, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - if exec_mode != ExecutionMode.CPU_FP32: - feed['label'] = feed['label'].astype(np.int32) - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self, on_ipu): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype="float32") + if on_ipu: + label = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='int32') + else: + label = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='int64') + out = paddle.fluid.layers.cross_entropy( + input=x, label=label, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + if self.is_ipu_mode(exec_mode): + self.feed_fp32['label'] = self.feed_fp32['label'].astype(np.int32) + self.run_op_test(exec_mode) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - self.np_nll_loss() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model(self.is_ipu_mode(m)) + self.run_model(m) + self.check() class TestCase1(TestBase): @@ -142,7 +88,6 @@ class TestCase2(TestBase): def set_data_feed(self): x = np.random.uniform(size=[30, 70]) label = np.arange(30).reshape([30, 1]) - self.feed_fp32 = { "x": x.astype(np.float32), "label": label.astype(np.int64) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py index 2f1d86daf0057..a0a145fb72b35 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -48,60 +48,22 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with 
paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype="float32") - - out = paddle.fluid.layers.cumsum(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype="float32") + out = paddle.fluid.layers.cumsum(x, **self.attrs) + self.fetch_list = [out.name] - def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - - output_dict[mode] = self._test_base(mode).flatten() + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict) + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py index e34da7f70167a..4e3b03ffca068 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py @@ -17,8 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, - IPUOpTest) +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) self.feed_fp32 = {'x': data.astype(np.float32)} @@ -51,60 +46,23 @@ def set_op_attrs(self): "dropout_implementation": "downgrade_in_infer" } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - dropout = paddle.fluid.layers.dropout(x, **self.attrs) - out = paddle.fluid.layers.elementwise_add(dropout, dropout) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - 
ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + x = paddle.fluid.layers.dropout(x, **self.attrs) + out = paddle.fluid.layers.elementwise_add(x, x) + self.fetch_list = [out.name] - def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict) + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py index a9d6d2308326e..24082fe49bae5 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py @@ -17,8 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, - IPUOpTest) +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -43,63 +42,24 @@ def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed_fp32.values()] self.feed_list = list(self.feed_fp32.keys()) - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - - out = self.op(x, y, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], 
shape=self.feed_shape[1], dtype='float32') + out = self.op(x, y, **self.attrs) + self.fetch_list = [out.name] - def run_test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict) + def run_test_base(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() def test_case0(self): data_x = np.random.uniform(size=(2, 3, 4, 5)) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py index 5b18c73851324..56b9a73f08009 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): x = np.ones([1, 10]) y = np.zeros([1, 10]) @@ -53,63 +49,24 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - - out = paddle.fluid.layers.equal(x, y, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + out = paddle.fluid.layers.equal(x, y, **self.attrs) + self.fetch_list = [out.name] - def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten().astype(np.int32) + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict) + def test(self): + for m in IPUOpTest.ExecutionMode: + if 
not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py index 966dfdef87b54..211aa4a61a5b8 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[2, 3, 1]) self.feed_fp32 = {'in_0': data.astype(np.float32)} @@ -47,59 +43,22 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"expand_times": [1, 2, 2]} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype="float32") - - out = paddle.fluid.layers.expand(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype="float32") + out = paddle.fluid.layers.expand(x, **self.attrs) + self.fetch_list = [out.name] - def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict) + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): @@ -116,53 +75,15 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype="float32") - - expand_times = 
paddle.fluid.layers.fill_constant( - shape=[len(self.feed_shape[0])], dtype="int32", value=2) - out = paddle.fluid.layers.expand( - x, expand_times=expand_times, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype="float32") + expand_times = paddle.fluid.layers.fill_constant( + shape=[len(self.feed_shape[0])], dtype="int32", value=2) + out = paddle.fluid.layers.expand( + x, expand_times=expand_times, **self.attrs) + self.fetch_list = [out.name] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py index 00b855a5a7a42..b3faabda3cdf2 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[2, 3, 1]) self.feed_fp32 = {'in_0': data.astype(np.float32)} @@ -46,60 +42,23 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {'fill_value': 0.3, 'dtype': 'float32'} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - x_fill = paddle.full_like(x, **self.attrs) - out = paddle.fluid.layers.elementwise_add(x_fill, x_fill) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + x_fill = paddle.full_like(x, **self.attrs) + out = paddle.fluid.layers.elementwise_add(x_fill, x_fill) + self.fetch_list = [out.name] - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - 
ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + def run_model(self, exec_mode): + self.run_op_test(exec_mode) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py index 3a1c202bf1133..ce457b7abeb5b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,17 +30,14 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): - self.feed = {} + self.feed_fp32 = {} + self.feed_fp16 = {} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [x.dtype for x in self.feed.values()] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] def set_op_attrs(self): self.attrs = { @@ -50,50 +47,21 @@ def set_op_attrs(self): 'value': 0.3, } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.fluid.layers.fill_constant(**self.attrs) - out = paddle.fluid.layers.elementwise_add(x, x) - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() + @IPUOpTest.static_graph + def build_model(self): + x = paddle.fluid.layers.fill_constant(**self.attrs) + out = paddle.fluid.layers.elementwise_add(x, x) + self.fetch_list = [out.name] - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + def 
run_model(self, exec_mode): + self.run_op_test(exec_mode) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py index 6f0cafc66805e..a8d530f6b77ad 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[2, 2, 4, 6]) self.feed_fp32 = {"in_0": data.astype(np.float32)} @@ -47,59 +43,22 @@ def set_op_attrs(self): self.attrs = {} self.attrs['axis'] = 1 - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.flatten(x=x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode) - - self.check(output_dict, check_shape=True) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.flatten(x=x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fp16_inference_io_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fp16_inference_io_ipu.py deleted file mode 100644 index cd29ff705b88f..0000000000000 --- 
a/python/paddle/fluid/tests/unittests/ipu/test_fp16_inference_io_ipu.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -import shutil - -import numpy as np -import paddle -import paddle.fluid as fluid -import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestBase(IPUOpTest): - def setUp(self): - self.set_atol() - self.set_data_feed() - self.set_feed_attr() - self.set_op_attrs() - - def set_atol(self): - self.atol = 1e-6 - self.rtol = 1e-5 - self.atol_fp16 = 1e-2 - self.rtol_fp16 = 1e-3 - - def set_data_feed(self): - data = np.random.uniform(size=[1, 3, 10, 10]) - self.feed_fp32 = {"in_0": data.astype(np.float32)} - self.feed_fp16 = {"in_0": data.astype(np.float16)} - - def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed_fp32.values()] - self.feed_list = list(self.feed_fp32.keys()) - - def set_op_attrs(self): - self.attrs = {} - self.attrs['steps'] = 100 - self.attrs['save_at_step'] = 20 - self.attrs['is_training'] = True - self.attrs['opt_type'] = 'sgd' - self.attrs['path'] = 'model' - self.attrs['model_name'] = 'test' - - def _test_save(self): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - generator = paddle.fluid.unique_name.UniqueNameGenerator() - self.full_name = '/'.join( - [self.attrs['path'], self.attrs['model_name']]) - - with paddle.fluid.unique_name.guard(generator): - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - scale = paddle.fluid.layers.scale( - x, scale=1.0, bias=0.0, bias_after_scale=True) - conv = paddle.static.nn.conv2d( - scale, - num_filters=3, - filter_size=3, - bias_attr=False, - name='conv2d') - loss = paddle.mean(conv) - - if self.attrs['is_training']: - if self.attrs['opt_type'] == 'sgd': - sgd = paddle.optimizer.SGD(learning_rate=1e-2) - sgd.minimize(loss) - elif self.attrs['opt_type'] == 'adam': - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(loss) - elif self.attrs['opt_type'] == 'lamb': - lamb = paddle.optimizer.Lamb(learning_rate=1e-2) - lamb.minimize(loss) - - fetch_list = [loss.name] - - place = paddle.IPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=True) - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, ipu_strategy=ipu_strategy).compile( - self.feed_list, fetch_list) - - for _ in range(self.attrs['steps']): - exe.run(program, feed=self.feed_fp16, fetch_list=fetch_list) - - 
paddle.static.save_inference_model( - self.full_name, x, loss, exe, program=program.org_program) - - def _test_load(self, run_ipu): - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - - [inference_program, feed_target_names, fetch_targets] = ( - paddle.static.load_inference_model(self.full_name, exe)) - - if run_ipu: - feed_list = feed_target_names - fetch_list = [fetch_targets[0].name] - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=False) - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - inference_program, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = inference_program - - feed = self.feed_fp16 if run_ipu else self.feed_fp32 - result = [] - for i in range(10): - feed["in_0"] += np.array([1.1 * i]).astype(feed["in_0"].dtype) - out = exe.run(program, feed=feed, fetch_list=[fetch_targets]) - result.append(out) - - return np.array(result) - - def test_base(self): - self._test_save() - cpu_res = self._test_load(False) - ipu_res = self._test_load(True).astype(np.float32) - - self.assertTrue( - np.allclose( - cpu_res, ipu_res, rtol=self.rtol_fp16, atol=self.atol_fp16)) - - shutil.rmtree(self.attrs['path'], True) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py index 71742deefcd2c..1d3b17dbc2dfc 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py @@ -16,9 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_atol(self): self.atol = 5e-6 self.rtol = 1e-5 @@ -54,80 +49,32 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = fluid.core.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with fluid.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - conv1 = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - conv2 = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - add1 = conv1 + conv2 - conv3 = paddle.static.nn.conv2d( - add1, num_filters=8, filter_size=8, bias_attr=False) - out = paddle.fluid.layers.relu(conv3, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = 
paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) - - -class TestIntInput(IPUOpTest): - def setUp(self): - self.set_atol() - self.set_training() - self.set_data_feed() - self.set_feed_attr() - self.set_op_attrs() - - @property - def fp16_enabled(self): - return True - + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + conv1 = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + conv2 = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + add1 = conv1 + conv2 + conv3 = paddle.static.nn.conv2d( + add1, num_filters=8, filter_size=8, bias_attr=False) + out = paddle.fluid.layers.relu(conv3, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestIntInput(TestBase): def set_data_feed(self): embedding = np.random.uniform(size=[10, 20]) indice = np.array([1, 3, 5]).astype(np.int32) @@ -140,71 +87,14 @@ def set_data_feed(self): "indice": indice, } - def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed_fp32.values()] - self.feed_list = list(self.feed_fp32.keys()) - self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - - def set_op_attrs(self): - self.attrs = {} - - def _test_base(self, exec_mode): - scope = fluid.core.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with fluid.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='int32') - - out = paddle.fluid.layers.gather(x, index=y) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return np.array(result) - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + 
@IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='int32') + out = paddle.fluid.layers.gather(x, index=y) + self.fetch_list = [out.name] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py index 01a56fd14be04..bbf3ec0ffdfe6 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): x = np.random.uniform(size=[10, 20]) y = np.array([1, 3, 5]) @@ -47,63 +43,24 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='int32') - - out = paddle.fluid.layers.gather(x, index=y, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='int32') + out = paddle.fluid.layers.gather(x, index=y, **self.attrs) + self.fetch_list = [out.name] - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + def run_model(self, exec_mode): + self.run_op_test(exec_mode) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py index 602289f3f1904..e9721463876d0 
100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) self.feed_fp32 = {'in_0': data.astype(np.float32)} @@ -46,59 +42,22 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"approximate": False} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.gelu(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.gelu(x, **self.attrs) + self.fetch_list = [out.name] - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + def run_model(self, exec_mode): + self.run_op_test(exec_mode) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gradient_clip_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gradient_clip_ipu.py index 281baeca09e47..b7567f60cc3a2 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gradient_clip_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gradient_clip_ipu.py @@ -28,19 +28,26 @@ def setUp(self): self.set_data_feed() self.set_feed_attr() self.set_attrs() + self.set_training() + + @property + def fp16_enabled(self): + return False def set_atol(self): + super().set_atol() self.atol = 1e-6 + self.rtol = 1e-5 def set_data_feed(self): - self.feed = { + self.feed_fp32 = { "image": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), } def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - 
self.feed_dtype = [x.dtype for x in self.feed.values()] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] def set_attrs(self): self.attrs = { @@ -48,76 +55,48 @@ def set_attrs(self): "weight_decay": 0.0, } - def _test_optimizer(self, run_ipu=True): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - np.random.seed(self.SEED) - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - image = paddle.static.data( - name='image', shape=[1, 3, 10, 10], dtype='float32') - conv1 = paddle.static.nn.conv2d( - image, num_filters=3, filter_size=3, bias_attr=False) - loss = paddle.mean(conv1) - - weight_decay = self.attrs['weight_decay'] - # Only support ClipGradByGlobalNorm - clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) - if self.attrs['optimizer'] == 'sgd': - opt = paddle.optimizer.SGD(learning_rate=1e-1, - weight_decay=weight_decay, - grad_clip=clip) - elif self.attrs['optimizer'] == 'adam': - opt = paddle.optimizer.Adam( - learning_rate=1e-1, - weight_decay=weight_decay, - grad_clip=clip) - elif self.attrs['optimizer'] == 'lamb': - opt = paddle.optimizer.Lamb( - learning_rate=1e-1, - lamb_weight_decay=weight_decay, - grad_clip=clip) - else: - raise ValueError( - f"Not supported optimizer {self.attrs['optimizer']} for test" - ) - opt.minimize(loss) - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = [image.name] - fetch_list = [loss.name] - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=True) - program = paddle.static.IpuCompiledProgram( - main_prog, ipu_strategy=ipu_strategy).compile(feed_list, - fetch_list) - else: - program = main_prog - - result = [] - for epoch in range(100): - loss_res = exe.run(program, feed=self.feed, fetch_list=[loss]) - result.append(loss_res) - - return np.array(result) + def set_training(self): + self.is_training = True + self.epoch = 100 + + @IPUOpTest.static_graph + def build_model(self): + image = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + conv1 = paddle.static.nn.conv2d( + image, num_filters=3, filter_size=3, bias_attr=False) + loss = paddle.mean(conv1) + self.fetch_list = [loss.name] + + weight_decay = self.attrs['weight_decay'] + # Only support ClipGradByGlobalNorm + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + if self.attrs['optimizer'] == 'sgd': + opt = paddle.optimizer.SGD(learning_rate=1e-1, + weight_decay=weight_decay, + grad_clip=clip) + elif self.attrs['optimizer'] == 'adam': + opt = paddle.optimizer.Adam( + learning_rate=1e-1, weight_decay=weight_decay, grad_clip=clip) + elif self.attrs['optimizer'] == 'lamb': + opt = paddle.optimizer.Lamb( + learning_rate=1e-1, + lamb_weight_decay=weight_decay, + grad_clip=clip) + else: + raise ValueError( + f"Not supported optimizer {self.attrs['optimizer']} for test") + opt.minimize(loss) + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) def test(self): - # cpu and ipu dimenstion mismatch, cpu:(100, 1, 1), ipu:(100, 1) - ipu_loss = self._test_optimizer(True).flatten() - cpu_loss = self._test_optimizer(False).flatten() - - self.assertTrue(np.allclose(ipu_loss, cpu_loss, 
atol=self.atol)) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestAdam(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py index 934ad10142827..c499bb0bd5ff9 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -28,73 +28,30 @@ def setUp(self): self.set_training() self.set_test_op() - @property - def fp16_enabled(self): - return True - def set_test_op(self): self.op = paddle.fluid.layers.greater_than def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - - out = self.op(x, y, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + out = self.op(x, y, **self.attrs) + self.fetch_list = [out.name] - def run_test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten().astype(np.int32) + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict) + def run_test_base(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed_fp32.values()] diff --git a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py index 102e764cb2f17..bb984a8d90789 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py @@ -17,7 +17,7 
@@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_atol(self): self.atol = 3e-6 self.rtol = 1e-6 @@ -56,86 +52,36 @@ def set_op_attrs(self): "data_layout": 'NCHW', } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - if self.is_training: - ch = self.feed_shape[0][1] - conv1 = paddle.static.nn.conv2d( - x, num_filters=ch, filter_size=3, bias_attr=False) - scale = paddle.ParamAttr(trainable=True) - bias = paddle.ParamAttr(trainable=True) - out = paddle.fluid.layers.nn.group_norm( - conv1, param_attr=scale, bias_attr=bias, **self.attrs) - loss = paddle.mean(out) - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(loss) - else: - out = paddle.fluid.layers.nn.group_norm( - x, param_attr=True, bias_attr=True, **self.attrs) - - if self.is_training: - fetch_list = [loss.name] - else: - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - if self.is_training: - result = [] - for _ in range(self.epoch): - loss_res = exe.run(program, - feed=feed, - fetch_list=fetch_list) - result.append(loss_res[0]) - return np.array(result) - else: - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - if mode > ExecutionMode.IPU_FP32 and self.is_training: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + if self.is_training: + ch = self.feed_shape[0][1] + conv1 = paddle.static.nn.conv2d( + x, num_filters=ch, filter_size=3, bias_attr=False) + scale = paddle.ParamAttr(trainable=True) + bias = paddle.ParamAttr(trainable=True) + out = paddle.fluid.layers.nn.group_norm( + conv1, param_attr=scale, bias_attr=bias, **self.attrs) + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) + self.fetch_list = [loss.name] + else: + out = paddle.fluid.layers.nn.group_norm( + x, param_attr=True, bias_attr=True, **self.attrs) + self.fetch_list = [out.name] 
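`build_model` above now picks its own fetch target: under `self.is_training` it attaches an Adam step and fetches the mean loss, otherwise it fetches the raw `group_norm` output. The deleted `_test_base` shows what the shared runner must then do for each mode; assuming `run_op_test` keeps that behaviour, the execution reduces to roughly the sketch below (the `self.*` names are carried over from the old code, not confirmed against `op_test_ipu.py`).

```python
# Sketch (as a method body) of the per-mode execution the deleted _test_base
# performed, which run_op_test presumably reproduces for the fetch target
# chosen by build_model.
import numpy as np
import paddle


def run_sketch(self, feed, place):
    exe = paddle.static.Executor(place)
    exe.run(self.startup_prog)
    if self.is_training:
        # Training: one executor step per epoch, collecting the loss curve.
        losses = []
        for _ in range(self.epoch):
            loss = exe.run(self.main_prog, feed=feed, fetch_list=self.fetch_list)
            losses.append(loss[0])
        return np.array(losses)
    # Inference: a single forward pass.
    return exe.run(self.main_prog, feed=feed, fetch_list=self.fetch_list)[0]
```

Returning the full loss history rather than a single value is presumably what lets `check()` compare training curves across execution modes, and why the train cases below only need to adjust `self.epoch`.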
+ + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): @@ -150,7 +96,7 @@ def set_op_attrs(self): class TestTrainCase1(TestBase): def set_training(self): self.is_training = True - self.epoch = 10 + self.epoch = 20 @unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel") @@ -170,7 +116,7 @@ def set_op_attrs(self): def set_training(self): self.is_training = True - self.epoch = 10 + self.epoch = 20 if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py index ed8f3950ace82..fa425cbf9f94a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_atol(self): self.atol = 1e-6 self.rtol = 1e-5 @@ -52,86 +48,37 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"epsilon": 1e-05} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - if self.is_training: - ch = self.feed_shape[0][1] - conv1 = paddle.static.nn.conv2d( - x, num_filters=ch, filter_size=3, bias_attr=False) - scale = paddle.ParamAttr(trainable=True) - bias = paddle.ParamAttr(trainable=True) - out = paddle.fluid.layers.nn.instance_norm( - conv1, param_attr=scale, bias_attr=bias, **self.attrs) - loss = paddle.mean(out) - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(loss) - else: - out = paddle.fluid.layers.nn.instance_norm( - x, param_attr=True, bias_attr=True, **self.attrs) - - if self.is_training: - fetch_list = [loss.name] - else: - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - if self.is_training: - result = [] - for _ in range(self.epoch): - loss_res = exe.run(program, - feed=feed, - fetch_list=fetch_list) - result.append(loss_res) - return np.array(result) - else: - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + 
@IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + + if self.is_training: + ch = self.feed_shape[0][1] + conv1 = paddle.static.nn.conv2d( + x, num_filters=ch, filter_size=3, bias_attr=False) + scale = paddle.ParamAttr(trainable=True) + bias = paddle.ParamAttr(trainable=True) + out = paddle.fluid.layers.nn.instance_norm( + conv1, param_attr=scale, bias_attr=bias, **self.attrs) + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) + self.fetch_list = [loss.name] + else: + out = paddle.fluid.layers.nn.instance_norm( + x, param_attr=True, bias_attr=True, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - if mode > ExecutionMode.IPU_FP32 and self.is_training: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestTrainCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api.py deleted file mode 100644 index a306a3f7725b5..0000000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-import unittest
-
-import numpy as np
-import paddle
-
-paddle.enable_static()
-
-
-@unittest.skipIf(not paddle.is_compiled_with_ipu(),
-                 "core is not compiled with IPU")
-class TestIpuShard(unittest.TestCase):
-    def _test(self):
-        # build graph
-        a = paddle.static.data(name='data', shape=[None, 1], dtype='int32')
-        b = a + 2  # scale : scale * x + bias, ipu_index : no
-
-        with paddle.static.ipu_shard_guard(index=1):
-            c = b + 1  # scale, ipu_index : 1
-            with paddle.static.ipu_shard_guard(index=2):
-                d = c * 2  # scale, ipu_index : 2
-                with paddle.static.ipu_shard_guard(index=3):
-                    e = d + 3  # scale, ipu_index : 3
-                    with paddle.static.ipu_shard_guard(index=1):
-                        e = e + 3  # scale, ipu_index : 1
-                        with paddle.static.ipu_shard_guard(index=2):
-                            e = e + 3  # scale, ipu_index : 2
-
-        with paddle.static.ipu_shard_guard(index=1):
-            f = paddle.tensor.pow(e, 2.0)  # pow, ipu_index : 1
-
-        with paddle.static.ipu_shard_guard(index=2):
-            g = f - 1  # scale, ipu_index : 2
-
-        h = g + 1  # scale, ipu_index : no
-
-        ipu_index_list = []
-        main_prog = paddle.static.default_main_program()
-        for op in main_prog.global_block().ops:
-            if op.desc.has_attr("ipu_index"):
-                ipu_index_list.append(op.desc.attr("ipu_index"))
-
-        return ipu_index_list
-
-    def test_ipu_shard(self):
-        ipu_index_list = self._test()
-        expected_ipu_index_list = [1, 2, 3, 1, 2, 1, 2]
-        self.assertTrue(
-            np.allclose(
-                ipu_index_list, expected_ipu_index_list, atol=0))
-
-
-@unittest.skipIf(not paddle.is_compiled_with_ipu(),
-                 "core is not compiled with IPU")
-class TestIpuPipeline(unittest.TestCase):
-    def _test(self):
-        # build graph
-        a = paddle.static.data(name='data', shape=[None, 1], dtype='int32')
-        b = a + 2  # scale : scale * x + bias, ipu_stage : no
-
-        with paddle.static.ipu_shard_guard(stage=1):
-            c = b + 1  # scale, ipu_stage : 1
-            with paddle.static.ipu_shard_guard(stage=2):
-                d = c * 2  # scale, ipu_stage : 2
-                with paddle.static.ipu_shard_guard(stage=3):
-                    e = d + 3  # scale, ipu_stage : 3
-                    with paddle.static.ipu_shard_guard(stage=1):
-                        e = e + 3  # scale, ipu_stage : 1
-                        with paddle.static.ipu_shard_guard(stage=2):
-                            e = e + 3  # scale, ipu_stage : 2
-
-        with paddle.static.ipu_shard_guard(stage=1):
-            f = paddle.tensor.pow(e, 2.0)  # pow, ipu_stage : 1
-
-        with paddle.static.ipu_shard_guard(stage=2):
-            g = f - 1  # scale, ipu_stage : 2
-
-        h = g + 1  # scale, ipu_stage : no
-
-        ipu_index_list = []
-        main_prog = paddle.static.default_main_program()
-        for op in main_prog.global_block().ops:
-            if op.desc.has_attr("ipu_stage"):
-                ipu_index_list.append(op.desc.attr("ipu_stage"))
-
-        return ipu_index_list
-
-    def test_ipu_shard(self):
-        ipu_index_list = self._test()
-        expected_ipu_index_list = [1, 2, 3, 1, 2, 1, 2]
-
-        self.assertTrue(
-            np.allclose(
-                ipu_index_list, expected_ipu_index_list, atol=0))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py
index debd9ed19827c..45f75f1b4df81 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py
@@ -73,10 +73,15 @@ def test_set_other_options(self):
             'autoReport.directory': 'path',
             'autoReport.all': 'true'
         }
+        options['random_seed'] = 1234
         for k, v in options.items():
             ipu_strategy.set_options({k: v})
             assert v == ipu_strategy.get_option(k), f"set {k} to {v} failed "
+        # The custom logger needs two ints as inputs
+        logger = lambda progress, total: print(f"compile progress: 
{progress}/{total}") + ipu_strategy.set_options({'compilation_progress_logger': logger}) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py index a52946bba1567..cab2fa3fde2cb 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_atol(self): self.atol = 1e-6 self.rtol = 1e-5 @@ -59,89 +55,48 @@ def set_op_attrs(self): } self.optimizer = None - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - if self.is_training: - ch = self.feed_shape[0][1] - conv1 = paddle.static.nn.conv2d( - x, num_filters=ch, filter_size=3, bias_attr=False) - scale = paddle.ParamAttr(trainable=True) - bias = paddle.ParamAttr(trainable=True) - out = paddle.fluid.layers.nn.layer_norm( - conv1, param_attr=scale, bias_attr=bias, **self.attrs) - else: - scale = self.attrs['scale'] - bias = self.attrs['shift'] - out = paddle.fluid.layers.nn.layer_norm( - x, param_attr=scale, bias_attr=bias, **self.attrs) - loss = paddle.mean(out) - - fetch_list = [loss.name] - - if self.is_training: - optimizer = None - if self.optimizer == 'sgd': - optimizer = paddle.optimizer.SGD(learning_rate=1e-2) - elif self.optimizer == 'adam': - optimizer = paddle.optimizer.Adam(learning_rate=1e-2) - elif self.optimizer == 'lamb': - optimizer = paddle.optimizer.Lamb( - learning_rate=1e-2, lamb_weight_decay=0.0) - if optimizer is not None: - optimizer.minimize(loss) - - if exec_mode: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - if self.is_training: - result = [] - for _ in range(self.epoch): - loss_res = exe.run(program, - feed=self.feed_fp32, - fetch_list=fetch_list) - result.append(loss_res[0]) - return np.array(result) - else: - result = exe.run(program, - feed=self.feed_fp32, - fetch_list=fetch_list) - return result[0] - - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - - self.assertTrue(res0.shape == res1.shape) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + if self.is_training: + ch = self.feed_shape[0][1] + conv1 = paddle.static.nn.conv2d( 
+ x, num_filters=ch, filter_size=3, bias_attr=False) + scale = paddle.ParamAttr(trainable=True) + bias = paddle.ParamAttr(trainable=True) + out = paddle.fluid.layers.nn.layer_norm( + conv1, param_attr=scale, bias_attr=bias, **self.attrs) + loss = paddle.mean(out) + self.fetch_list = [loss.name] + else: + scale = self.attrs['scale'] + bias = self.attrs['shift'] + out = paddle.fluid.layers.nn.layer_norm( + x, param_attr=scale, bias_attr=bias, **self.attrs) + self.fetch_list = [out.name] + + if self.is_training: + optimizer = None + if self.optimizer == 'sgd': + optimizer = paddle.optimizer.SGD(learning_rate=1e-2) + elif self.optimizer == 'adam': + optimizer = paddle.optimizer.Adam(learning_rate=1e-2) + elif self.optimizer == 'lamb': + optimizer = paddle.optimizer.Lamb( + learning_rate=1e-2, lamb_weight_decay=0.0) + if optimizer is not None: + optimizer.minimize(loss) + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() @unittest.skip('raise error') @@ -188,33 +143,17 @@ def set_op_attrs(self): self.optimizer = 'sgd' def set_atol(self): + super().set_atol() self.atol = 1e-6 def set_training(self): self.is_training = True - self.epoch = 10 - - -class TestTrainCase2(TestBase): - def set_atol(self): - self.atol = 5e-4 - - def set_op_attrs(self): - self.attrs = { - "scale": True, - "shift": True, - "begin_norm_axis": 2, - "epsilon": 1e-05 - } - self.optimizer = 'adam' - - def set_training(self): - self.is_training = True - self.epoch = 10 + self.epoch = 20 class TestTrainCase3(TestBase): def set_atol(self): + super().set_atol() self.atol = 5e-3 def set_op_attrs(self): @@ -228,7 +167,7 @@ def set_op_attrs(self): def set_training(self): self.is_training = True - self.epoch = 10 + self.epoch = 20 # not support `layer_norm(x, param_attr=False, bias_attr=False, **self.attrs)` diff --git a/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py index fad7516e442a7..c0e4865b3a627 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py @@ -18,7 +18,7 @@ import paddle import paddle.nn.functional as F import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,10 +31,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) self.feed_fp32 = {'in_0': data.astype(np.float32)} @@ -49,59 +45,22 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"axis": -1} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = F.log_softmax(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() + @IPUOpTest.static_graph + def 
build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = F.log_softmax(x, **self.attrs) + self.fetch_list = [out.name] - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + def run_model(self, exec_mode): + self.run_op_test(exec_mode) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py index 3f8472890d03e..725d2b3429a7f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -29,68 +29,32 @@ def setUp(self): self.set_data_feed() self.set_feed_attr() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[2, 20, 30528]) - self.feed = {"in_0": data.astype('bool')} + self.feed_fp32 = {"in_0": data.astype('bool')} + self.feed_fp16 = {"in_0": data.astype('bool')} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [x.dtype for x in self.feed.values()] - - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype="bool") - - out = paddle.fluid.layers.logical_not(x) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - result = exe.run(program, 
feed=self.feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).astype(np.int32) - - self.check(output_dict, check_shape=True) + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype="bool") + out = paddle.fluid.layers.logical_not(x) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py index 05572a72ea8b2..55a2c08c1b5e7 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -38,69 +38,38 @@ def set_test_op(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) - - out = self.op(x, y, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) + y = paddle.static.data( + name=self.feed_list[1], + shape=self.feed_shape[1], + dtype=self.feed_dtype[1]) + out = self.op(x, y, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) def run_test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).astype(np.int32) - - self.check(output_dict, 
check_shape=True) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) self.feed_dtype = ['bool', 'bool'] def set_data_feed0(self): x = np.random.choice([True, False], size=(1, 3, 5, 5)) y = np.random.choice([True, False], size=(1, 3, 5, 5)) - self.feed = { + self.feed_fp32 = { "x": x.astype('bool'), "y": y.astype('bool'), } diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py index 4a877ddce4e3c..80636348cfad3 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,19 +30,15 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.array([[[1], [3]], [[2], [4]], [[4], [127]]]) - self.feed_cpu = {"x": data.astype(np.int64)} - self.feed_ipu = {"x": data.astype(np.int32)} + self.feed_fp32 = {"x": data.astype(np.int64)} + self.feed_fp16 = {"x": data.astype(np.int32)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed_cpu.values()] - self.feed_list = list(self.feed_cpu.keys()) - self.feed_dtype = [x.dtype for x in self.feed_cpu.values()] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] def set_op_attrs(self): self.attrs = { @@ -53,76 +49,30 @@ def set_op_attrs(self): "dtype": 'float32' } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='int64') - - out = paddle.fluid.layers.embedding(x, **self.attrs) - - if self.is_training: - loss = paddle.mean(out) - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(loss) - fetch_list = [loss.name] - else: - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_cpu - if exec_mode > ExecutionMode.CPU_FP32: - feed = self.feed_ipu - - if self.is_training: - result = [] - for _ in range(self.epoch): - loss_res = 
exe.run(program, - feed=feed, - fetch_list=fetch_list) - result.append(loss_res[0]) - return np.array(result) - else: - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='int64') + out = paddle.fluid.layers.embedding(x, **self.attrs) + if self.is_training: + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) + self.fetch_list = [loss.name] + else: + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + if self.is_ipu_mode(exec_mode): + self.feed_fp32['x'] = self.feed_fp32['x'].astype(np.int32) + self.run_op_test(exec_mode) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and (not self.fp16_enabled or - self.is_training): - break - - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestTrainCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py index da8048fb3205e..7f021a615afa0 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,19 +30,15 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): x = np.array([[[1], [3]], [[2], [4]], [[4], [127]]]) - self.feed_cpu = {"x": x.astype(np.int64)} - self.feed_ipu = {"x": x.astype(np.int32)} + self.feed_fp32 = {"x": x.astype(np.int64)} + self.feed_fp16 = {"x": x.astype(np.int32)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed_cpu.values()] - self.feed_list = list(self.feed_cpu.keys()) - self.feed_dtype = [x.dtype for x in self.feed_cpu.values()] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] def set_op_attrs(self): self.attrs = { @@ -53,76 +49,31 @@ def set_op_attrs(self): "weight_attr": None } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='int64') - - embedding = paddle.nn.Embedding(**self.attrs) - out = embedding(x) - - if self.is_training: - loss = paddle.mean(out) - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(loss) - fetch_list = [loss.name] - else: - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list 
- ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_cpu - if exec_mode > ExecutionMode.CPU_FP32: - feed = self.feed_ipu - - if self.is_training: - result = [] - for _ in range(self.epoch): - loss_res = exe.run(program, - feed=feed, - fetch_list=fetch_list) - result.append(loss_res[0]) - return np.array(result) - else: - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='int64') + embedding = paddle.nn.Embedding(**self.attrs) + out = embedding(x) + if self.is_training: + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) + self.fetch_list = [loss.name] + else: + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + if self.is_ipu_mode(exec_mode): + self.feed_fp32['x'] = self.feed_fp32['x'].astype(np.int32) + self.run_op_test(exec_mode) def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and (not self.fp16_enabled or - self.is_training): - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestTrainCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py index 58f018e2ae649..6641efde69473 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py @@ -12,89 +12,75 @@ # See the License for the specific language governing permissions and # limitations under the License. 
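+# NOTE: this test keeps a hand-rolled run loop instead of `run_op_test`,
+# presumably because it must call `program.lr_sheduler.step()` between
+# executor runs (`lr_sheduler` is the spelling the code checks via `hasattr`).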
-from __future__ import print_function - import numpy as np import unittest -import sys import paddle -import paddle.fluid as fluid import paddle.static from paddle.optimizer.lr import LRScheduler - -paddle.enable_static() -SEED = 2021 +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest class LR_New(LRScheduler): - def __init__(self, learning_rate=1.0, last_epoch=-1, verbose=False): + def __init__(self, learning_rate=1e-5, last_epoch=-1, verbose=False): super(LR_New, self).__init__(learning_rate, last_epoch, verbose) def get_lr(self): - self.base_lr = self.base_lr + 1 + self.base_lr = self.base_lr + 1e-4 self.last_epoch = self.last_epoch + 1 return self.base_lr @unittest.skipIf(not paddle.is_compiled_with_ipu(), "core is not compiled with IPU") -class TestConvNet(unittest.TestCase): - def _test(self, run_ipu=True): - scope = fluid.core.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - np.random.seed(SEED) - - np_image = np.random.rand(1, 3, 10, 10).astype(np.float32) - - with fluid.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - image = paddle.static.data( - name='image', shape=[1, 3, 10, 10], dtype='float32') - conv1 = paddle.static.nn.conv2d( - image, num_filters=3, filter_size=3, bias_attr=False) - loss = paddle.mean(conv1) - - sgd = paddle.optimizer.SGD(learning_rate=LR_New()) - sgd.minimize(loss) - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = [image.name] - fetch_list = [loss.name] - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=True) - program = paddle.static.IpuCompiledProgram( - main_prog, ipu_strategy=ipu_strategy).compile(feed_list, - fetch_list) - else: - program = main_prog - - result = [] - for epoch in range(100): - if hasattr(program, "lr_sheduler"): - program.lr_sheduler.step() - loss_res = exe.run(program, - feed={image.name: np_image}, - fetch_list=[loss]) - result.append(loss_res) - - return np.array(result) +class TestConvNet(IPUOpTest): + @IPUOpTest.static_graph + def build_model(self): + image = paddle.static.data( + name='image', shape=[1, 3, 10, 10], dtype='float32') + conv1 = paddle.static.nn.conv2d( + image, num_filters=3, filter_size=3, bias_attr=False) + loss = paddle.mean(conv1) + + opt = paddle.optimizer.Lamb(learning_rate=LR_New()) + opt.minimize(loss) + self.feed_list = [image.name] + self.fetch_list = [loss.name] + + def run_model(self, run_ipu=True): + self.build_model() + if run_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(self.startup_prog) + if run_ipu: + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=True) + program = paddle.static.IpuCompiledProgram( + self.main_prog, ipu_strategy=ipu_strategy).compile( + self.feed_list, self.fetch_list) + else: + program = self.main_prog + + result = [] + for _ in range(100): + if hasattr(program, "lr_sheduler"): + program.lr_sheduler.step() + loss_res = exe.run(program, + feed=self.feed, + fetch_list=self.fetch_list) + result.append(loss_res) + return np.array(result) def test_training(self): + data = np.random.rand(1, 3, 10, 10).astype(np.float32) + self.feed = {'image': data} # cpu and ipu dimenstion mismatch, cpu:(100, 1, 1), ipu:(100, 1) - ipu_loss = self._test(True).flatten() - cpu_loss 
= self._test(False).flatten() + ipu_loss = self.run_model(True).flatten() + cpu_loss = self.run_model(False).flatten() - self.assertTrue(np.allclose(ipu_loss, cpu_loss, atol=1e-4)) + self.assertTrue(np.allclose(ipu_loss, cpu_loss, atol=1e-10)) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py index 6929ded6ebf90..e7e4c000e16a2 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): x = np.random.uniform(size=[20, 30]) y = np.random.uniform(size=[30, 20]) @@ -52,63 +48,25 @@ def set_op_attrs(self): "alpha": 1.0, } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - - out = paddle.fluid.layers.matmul(x, y, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + + out = paddle.fluid.layers.matmul(x, y, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py index ddb06400540e3..0a273e91dd571 100644 --- 
a/python/paddle/fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py
@@ -26,7 +26,7 @@ def set_serialize_factor(serialize_factor):
    op._set_attr('serialize_factor', serialize_factor)


-@unittest.skipIf(not paddle.is_compiled_with_ipu() or IPUOpTest.use_ipumodel(),
+@unittest.skipIf(not paddle.is_compiled_with_ipu(),
                 "core is not compiled with IPU")
class TestBase(IPUOpTest):
    def setUp(self):
@@ -38,8 +38,8 @@ def setUp(self):

    def set_data_feed(self):
        self.feed = {
-            "x": np.random.uniform(size=[2048, 3072]).astype('float32'),
-            "y": np.random.uniform(size=[3072, 2048]).astype('float32'),
+            "x": np.random.uniform(size=[16, 32]).astype('float32'),
+            "y": np.random.uniform(size=[32, 16]).astype('float32'),
        }

    def set_feed_attr(self):
@@ -50,58 +50,47 @@ def set_feed_attr(self):
    def set_op_attrs(self):
        self.attrs = {"transpose_x": False, "transpose_y": False}

-    def _test_base(self, run_ipu=True):
-        scope = paddle.static.Scope()
-        main_prog = paddle.static.Program()
-        startup_prog = paddle.static.Program()
-        main_prog.random_seed = self.SEED
-        startup_prog.random_seed = self.SEED
-
-        with paddle.static.scope_guard(scope):
-            with paddle.static.program_guard(main_prog, startup_prog):
-                x = paddle.static.data(
-                    name=self.feed_list[0],
-                    shape=self.feed_shape[0],
-                    dtype=self.feed_dtype[0])
-                y = paddle.static.data(
-                    name=self.feed_list[1],
-                    shape=self.feed_shape[1],
-                    dtype=self.feed_dtype[1])
-
-                # decrator maybe the best choice, but need to modify api
-                out = paddle.matmul(x, y, **self.attrs)
-                set_serialize_factor(4)
-
-                fetch_list = [out.name]
-
-        if run_ipu:
-            place = paddle.IPUPlace()
-        else:
-            place = paddle.CPUPlace()
-        exe = paddle.static.Executor(place)
-        exe.run(startup_prog)
-
-        if run_ipu:
-            feed_list = self.feed_list
-            ipu_strategy = paddle.static.IpuStrategy()
-            ipu_strategy.set_graph_config(is_training=self.is_training)
-            program = paddle.static.IpuCompiledProgram(
-                main_prog,
-                ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)
-        else:
-            program = main_prog
-
-        result = exe.run(program, feed=self.feed, fetch_list=fetch_list)
-        return result[0]
+    @IPUOpTest.static_graph
+    def build_model(self):
+        x = paddle.static.data(
+            name=self.feed_list[0],
+            shape=self.feed_shape[0],
+            dtype=self.feed_dtype[0])
+        y = paddle.static.data(
+            name=self.feed_list[1],
+            shape=self.feed_shape[1],
+            dtype=self.feed_dtype[1])
+        # a decorator may be the best choice, but it would need api changes
+        out = paddle.matmul(x, y, **self.attrs)
+        set_serialize_factor(4)
+        self.fetch_list = [out.name]
+
+    def run_model(self, run_ipu):
+        self.build_model()
+        if run_ipu:
+            place = paddle.IPUPlace()
+        else:
+            place = paddle.CPUPlace()
+        exe = paddle.static.Executor(place)
+        exe.run(self.startup_prog)
+        if run_ipu:
+            feed_list = self.feed_list
+            ipu_strategy = paddle.static.IpuStrategy()
+            ipu_strategy.set_graph_config(is_training=self.is_training)
+            program = paddle.static.IpuCompiledProgram(
+                self.main_prog,
+                ipu_strategy=ipu_strategy).compile(feed_list, self.fetch_list)
+        else:
+            program = self.main_prog
+        result = exe.run(program, feed=self.feed, fetch_list=self.fetch_list)
+        return result[0]

    def test_base(self):
-        res0 = self._test_base(False)
-        res1 = self._test_base(True)
-
+        res0 = self.run_model(False)
+        res1 = self.run_model(True)
        self.assertTrue(
            np.allclose(
                res0.flatten(), res1.flatten(), atol=self.atol))
-
        self.assertTrue(res0.shape == res1.shape)
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py 
b/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py index 9f1c115403adf..725f3243e0f3d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): x = np.random.uniform(size=[2, 3]) y = np.random.uniform(size=[3, 2]) @@ -48,63 +44,24 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"transpose_x": False, "transpose_y": False} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - - out = paddle.matmul(x, y, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + out = paddle.matmul(x, y, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py index b9dd7358b7955..c0d7dd1fd171d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import 
IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) self.feed_fp32 = {"in_0": data.astype(np.float32)} @@ -46,59 +42,22 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.mean(x) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.mean(x) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py index a70550c1df702..9bdf233556012 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py @@ -18,7 +18,7 @@ import paddle import paddle.static import paddle.nn.functional as F -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionModeFull +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -28,10 +28,7 @@ def setUp(self): self.set_atol() self.set_data_feed() self.set_feed_attr() - - @property - def fp16_enabled(self): - return True + self.set_attrs() def set_atol(self): self.atol = 1e-6 @@ -42,7 +39,6 @@ def set_atol(self): def set_data_feed(self): data = np.random.uniform(size=[1, 10, 27, 27]) self.feed_fp32 = {"in_0": data.astype(np.float32)} - self.feed_fp16 = {"in_0": data.astype(np.float16)} def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed_fp32.values()] @@ -54,86 +50,126 @@ def 
dtype_check(self, program, to_fp16_var_names): for var_name in to_fp16_var_names: assert (block.var(var_name).dtype, paddle.float16) - def _test_base(self, exec_mode): - generator = paddle.fluid.unique_name.UniqueNameGenerator() - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.fluid.unique_name.guard(generator): - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - # using fp32 - x = paddle.static.nn.conv2d( - input=x, num_filters=3, filter_size=3) - x = paddle.static.nn.batch_norm(x, act='relu') - x = F.max_pool2d(x, kernel_size=2, stride=2) - - # using fp16 - with paddle.static.amp.fp16_guard(): - x = paddle.static.nn.conv2d( - input=x, num_filters=6, filter_size=3) - x = paddle.static.nn.batch_norm(x, act='relu') - x = F.max_pool2d(x, kernel_size=2, stride=2) - - # using fp32 - x = paddle.static.nn.fc(x, size=10) - loss = paddle.mean(x) - fetch_list = [loss.name] - - if exec_mode == ExecutionModeFull.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - # cast model to fp16 - if exec_mode == ExecutionModeFull.IPU_MIXED_PRECISION: - to_fp16_var_names = paddle.static.amp.cast_model_to_fp16( - main_prog, self.amp_list) - self.dtype_check(main_prog, to_fp16_var_names) - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - # cast parameters to fp16 - if exec_mode == ExecutionModeFull.IPU_MIXED_PRECISION: - paddle.static.amp.cast_parameters_to_fp16( - paddle.CPUPlace(), - main_prog, - to_fp16_var_names=to_fp16_var_names) - - if exec_mode != ExecutionModeFull.CPU_FP32: - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=False) - if exec_mode == ExecutionModeFull.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, ipu_strategy=ipu_strategy).compile( - self.feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + def set_attrs(self): + self.num_ipus = 1 + self.enable_pipelining = False + self.enable_manual_shard = False + self.batches_per_step = 1 + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + + # using fp32 + x = paddle.static.nn.conv2d(input=x, num_filters=3, filter_size=3) + x = paddle.static.nn.batch_norm(x, act='relu') + x = F.max_pool2d(x, kernel_size=2, stride=2) + + # using fp16 + with paddle.static.amp.fp16_guard(): + x = paddle.static.nn.conv2d(input=x, num_filters=6, filter_size=3) + x = paddle.static.nn.batch_norm(x, act='relu') + x = F.max_pool2d(x, kernel_size=2, stride=2) + + # using fp32 + x = paddle.static.nn.fc(x, size=10) + loss = paddle.mean(x) + self.fetch_list = [loss.name] + + def run_model(self, exec_mode): + # cast model to fp16 + if self.is_fp16_mode(exec_mode): + amp_list = paddle.static.amp.CustomOpLists() + amp_list.unsupported_list = {} + to_fp16_var_names = paddle.static.amp.cast_model_to_fp16( + self.main_prog, amp_list, use_fp16_guard=True) + self.dtype_check(self.main_prog, to_fp16_var_names) + + if self.is_ipu_mode(exec_mode): + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = 
paddle.static.Executor(place) + exe.run(self.startup_prog) + + # cast parameters to fp16 + if exec_mode == IPUOpTest.ExecutionMode.IPU_FP16: + paddle.static.amp.cast_parameters_to_fp16( + paddle.CPUPlace(), + self.main_prog, + to_fp16_var_names=to_fp16_var_names) + + if self.is_ipu_mode(exec_mode): + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config( + is_training=False, + num_ipus=self.num_ipus, + enable_manual_shard=self.enable_manual_shard) + ipu_strategy.set_pipelining_config( + enable_pipelining=self.enable_pipelining, + batches_per_step=self.batches_per_step) + program = paddle.static.IpuCompiledProgram( + self.main_prog, ipu_strategy=ipu_strategy).compile( + self.feed_list, self.fetch_list) + else: + program = self.main_prog + + result = exe.run(program, + feed=self.feed_fp32, + fetch_list=self.fetch_list) + self.output_dict[exec_mode] = result[0] + + def test(self): + for m in IPUOpTest.ExecutionMode: + self.build_model() + self.run_model(m) + self.check() + + +class TestPipline(TestBase): + @IPUOpTest.static_graph + def build_model(self, exec_mode): + feed_shape = list(self.feed_shape[0]) + if self.is_ipu_mode(exec_mode): + feed_shape[0] = 1 + x = paddle.static.data( + name=self.feed_list[0], shape=feed_shape, dtype='float32') + with paddle.static.ipu_shard_guard(index=0, stage=0): + # using fp32 + x = paddle.static.nn.conv2d(input=x, num_filters=3, filter_size=3) + x = paddle.static.nn.batch_norm(x, act='relu') + x = F.max_pool2d(x, kernel_size=2, stride=2) + + with paddle.static.ipu_shard_guard(index=1, stage=1): + # using fp16 + with paddle.static.amp.fp16_guard(): + x = paddle.static.nn.conv2d( + input=x, num_filters=6, filter_size=3) + x = paddle.static.nn.batch_norm(x, act='relu') + x = F.max_pool2d(x, kernel_size=2, stride=2) + + with paddle.static.ipu_shard_guard(index=2, stage=2): + # using fp32 + x = paddle.static.nn.fc(x, size=10) + loss = paddle.mean(x) + self.fetch_list = [loss.name] + + def set_data_feed(self): + data = np.random.uniform(size=[3, 10, 27, 27]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + + def set_attrs(self): + self.num_ipus = 3 + self.enable_pipelining = True + self.enable_manual_shard = True + self.batches_per_step = 3 def test(self): - output_dict = {} - for mode in ExecutionModeFull: - if mode == ExecutionModeFull.IPU_POPART_FP16: - continue - if mode > ExecutionModeFull.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + self.build_model(m) + self.run_model(m) + # skip check results if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py index 224c0bddc22f9..c4ac9cddd7c3f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py @@ -18,7 +18,7 @@ import paddle import paddle.static import paddle.nn.functional as F -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionModeFull +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -29,10 +29,7 @@ def setUp(self): self.set_training() self.set_data_feed() self.set_feed_attr() - - @property - def fp16_enabled(self): - return True + self.set_attrs() def set_atol(self): self.atol = 2e-6 @@ -47,104 +44,149 @@ def 
set_training(self): def set_data_feed(self): data = np.random.uniform(size=[1, 3, 28, 28]) self.feed_fp32 = {"in_0": data.astype(np.float32)} - self.feed_fp16 = {"in_0": data.astype(np.float16)} def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed_fp32.values()] self.feed_list = list(self.feed_fp32.keys()) + def set_attrs(self): + self.num_ipus = 1 + self.enable_pipelining = False + self.enable_manual_shard = False + self.batches_per_step = 1 + def dtype_check(self, program, to_fp16_var_names): block = program.global_block() assert len(to_fp16_var_names) > 0 for var_name in to_fp16_var_names: assert (block.var(var_name).dtype, paddle.float16) - def _test_base(self, exec_mode): - generator = paddle.fluid.unique_name.UniqueNameGenerator() - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.fluid.unique_name.guard(generator): - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - # using fp32 - x = paddle.static.nn.conv2d( - input=x, num_filters=3, filter_size=3) - x = paddle.static.nn.batch_norm(x, act='relu') - x = F.max_pool2d(x, kernel_size=2, stride=2) - - # using fp16 - with paddle.static.amp.fp16_guard(): - x = paddle.static.nn.conv2d( - input=x, num_filters=6, filter_size=3) - x = paddle.static.nn.batch_norm(x, act='relu') - x = F.max_pool2d(x, kernel_size=2, stride=2) - - # using fp32 - x = paddle.static.nn.fc(x, size=10) - loss = paddle.mean(x) - - # optimizer - optimizer = paddle.optimizer.Adam(learning_rate=1e-2) - optimizer.minimize(loss, startup_prog) - fetch_list = [loss.name] - - # cast model to fp16 - if exec_mode == ExecutionModeFull.IPU_MIXED_PRECISION: - to_fp16_var_names = paddle.static.amp.cast_model_to_fp16( - main_prog, self.amp_list) - self.dtype_check(main_prog, to_fp16_var_names) - - if exec_mode == ExecutionModeFull.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - # cast parameters to fp16 - if exec_mode == ExecutionModeFull.IPU_MIXED_PRECISION: - paddle.static.amp.cast_parameters_to_fp16( - paddle.CPUPlace(), - main_prog, - to_fp16_var_names=to_fp16_var_names) - - if exec_mode != ExecutionModeFull.CPU_FP32: - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionModeFull.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, ipu_strategy=ipu_strategy).compile( - self.feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - result = [] - for i in range(self.epoch): - out = exe.run(program, feed=feed, fetch_list=fetch_list) - result.append(out) - return np.array(result) - - def test_base(self): - output_dict = {} - for mode in ExecutionModeFull: - if mode == ExecutionModeFull.IPU_POPART_FP16: - continue - if mode > ExecutionModeFull.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + + # using fp32 + x = paddle.static.nn.conv2d(input=x, num_filters=3, filter_size=3) + x = 
paddle.static.nn.batch_norm(x, act='relu') + x = F.max_pool2d(x, kernel_size=2, stride=2) + + # using fp16 + with paddle.static.amp.fp16_guard(): + x = paddle.static.nn.conv2d(input=x, num_filters=6, filter_size=3) + x = paddle.static.nn.batch_norm(x, act='relu') + x = F.max_pool2d(x, kernel_size=2, stride=2) + + # using fp32 + x = paddle.static.nn.fc(x, size=10) + loss = paddle.mean(x) + + # optimizer + optimizer = paddle.optimizer.Adam(learning_rate=1e-2) + optimizer.minimize(loss, self.startup_prog) + self.fetch_list = [loss.name] + + def run_model(self, exec_mode): + # cast model to fp16 + if self.is_fp16_mode(exec_mode): + amp_list = paddle.static.amp.CustomOpLists() + amp_list.unsupported_list = {} + to_fp16_var_names = paddle.static.amp.cast_model_to_fp16( + self.main_prog, amp_list) + self.dtype_check(self.main_prog, to_fp16_var_names) + + if self.is_ipu_mode(exec_mode): + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(self.startup_prog) + + # cast parameters to fp16 + if self.is_fp16_mode(exec_mode): + paddle.static.amp.cast_parameters_to_fp16( + paddle.CPUPlace(), + self.main_prog, + to_fp16_var_names=to_fp16_var_names) + + if self.is_ipu_mode(exec_mode): + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config( + is_training=self.is_training, + num_ipus=self.num_ipus, + enable_manual_shard=self.enable_manual_shard) + ipu_strategy.set_pipelining_config( + enable_pipelining=self.enable_pipelining, + batches_per_step=self.batches_per_step) + program = paddle.static.IpuCompiledProgram( + self.main_prog, ipu_strategy=ipu_strategy).compile( + self.feed_list, self.fetch_list) + else: + program = self.main_prog + + result = [] + for _ in range(self.epoch): + out = exe.run(program, + feed=self.feed_fp32, + fetch_list=self.fetch_list) + result.append(out) + self.output_dict[exec_mode] = result + + def test(self): + for m in IPUOpTest.ExecutionMode: + self.build_model() + self.run_model(m) + self.check() + + +class TestPipeline(TestBase): + @IPUOpTest.static_graph + def build_model(self, exec_mode): + feed_shape = list(self.feed_shape[0]) + if self.is_ipu_mode(exec_mode): + feed_shape[0] = 1 + x = paddle.static.data( + name=self.feed_list[0], shape=feed_shape, dtype='float32') + + with paddle.static.ipu_shard_guard(index=0, stage=0): + # using fp32 + x = paddle.static.nn.conv2d(input=x, num_filters=3, filter_size=3) + x = paddle.static.nn.batch_norm(x, act='relu') + x = F.max_pool2d(x, kernel_size=2, stride=2) + + with paddle.static.ipu_shard_guard(index=1, stage=1): + # using fp16 + with paddle.static.amp.fp16_guard(): + x = paddle.static.nn.conv2d( + input=x, num_filters=6, filter_size=3) + x = paddle.static.nn.batch_norm(x, act='relu') + x = F.max_pool2d(x, kernel_size=2, stride=2) + + with paddle.static.ipu_shard_guard(index=2, stage=2): + # using fp32 + x = paddle.static.nn.fc(x, size=10) + loss = paddle.mean(x) + + # optimizer + optimizer = paddle.optimizer.Adam(learning_rate=1e-2) + optimizer.minimize(loss, self.startup_prog) + self.fetch_list = [loss.name] + + def set_data_feed(self): + data = np.random.uniform(size=[5, 10, 27, 27]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + + def set_attrs(self): + self.num_ipus = 3 + self.enable_pipelining = True + self.enable_manual_shard = True + self.batches_per_step = 5 + + def test(self): + for m in IPUOpTest.ExecutionMode: + self.build_model(m) + self.run_model(m) + # skip check results if __name__ == "__main__": diff --git
a/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py index 7a9135626df79..583a8941ac62b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): x = np.random.uniform(size=[2, 5]) y = np.random.uniform(size=[5, 3]) @@ -51,63 +47,24 @@ def set_op_attrs(self): "y_num_col_dims": 1, } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - - out = paddle.fluid.layers.mul(x, y, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + out = paddle.fluid.layers.mul(x, y, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_not_equal_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_not_equal_op_ipu.py new file mode 100644 index 0000000000000..a4365c021ff3c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_not_equal_op_ipu.py @@ -0,0 +1,130 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_data_feed(self): + x = np.ones([1, 10]) + y = np.zeros([1, 10]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "y": y.astype(np.float32), + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "y": y.astype(np.float16), + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {} + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + out = paddle.fluid.layers.not_equal(x, y, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + def set_data_feed(self): + x = np.ones([1, 10]) + y = np.ones([1, 10]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +class TestCase2(TestBase): + def set_data_feed(self): + x = np.ones([1, 10]) + y = np.arange(0, 10).reshape([1, 10]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestScalar(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_data_feed(self): + x = np.ones([1, 10]) + y = 0.5 + self.feed_fp32 = {"x": x.astype(np.float32), } + self.feed_fp16 = {"x": x.astype(np.float16), } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {} + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = (x != 0.5) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py index 33a5dc888c245..938654bfafc05 100644 --- 
a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,74 +30,34 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data1 = np.array([[1], [1], [3], [0]]) - - self.feed = {'x': data1.astype(np.int32)} + self.feed_fp32 = {'x': data1.astype(np.int32)} + self.feed_fp16 = {'x': data1.astype(np.int32)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) def set_op_attrs(self): self.attrs = {"depth": 4, "allow_out_of_range": False} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='int32') - - out = paddle.fluid.layers.one_hot(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if (mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled): - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='int32') + out = paddle.fluid.layers.one_hot(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() @unittest.skip('does not support allow_out_of_range=True') diff --git a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py index 79fc9b04e1674..ec25f378866aa 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest 
@unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,74 +30,34 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data1 = np.array([[1], [1], [3], [0]]) - - self.feed = {'x': data1.astype(np.int32)} + self.feed_fp32 = {'x': data1.astype(np.int32)} + self.feed_fp16 = {'x': data1.astype(np.int32)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) def set_op_attrs(self): self.attrs = {"depth": 4, "allow_out_of_range": False} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='int32') - - out = paddle.fluid.input.one_hot(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if (mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled): - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='int32') + out = paddle.fluid.input.one_hot(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() @unittest.skip('does not support allow_out_of_range=True') diff --git a/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py index 43f54b52b5c55..060a69e83112a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import print_function - import numpy as np import unittest import paddle diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py index 4288b82832ede..e5df11eb4ef8c 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) self.feed_fp32 = {'in_0': data.astype(np.float32)} @@ -56,59 +52,22 @@ def set_op_attrs(self): "data_format": 'NCHW', } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.pool2d(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.pool2d(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): @@ -180,5 +139,21 @@ def set_attrs(self): self.attrs['exclusive'] = False +class TestAdaptive(TestBase): + def set_op_attrs(self): + self.attrs = { + "pool_size": 1, + "pool_type": 'avg', + "require_index": False + } + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.adaptive_pool2d(x, **self.attrs) + self.fetch_list = [out.name] + + if __name__ == "__main__": unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py index 911a163b8aa9c..41b2b8406dc7e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) self.feed_fp32 = {'in_0': data.astype(np.float32)} @@ -56,59 +52,22 @@ def set_op_attrs(self): "data_format": 'NCHW', } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.pool2d(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.pool2d(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): @@ -179,5 +138,21 @@ def set_op_attrs(self): self.attrs['exclusive'] = False +class TestAdaptive(TestBase): + def set_op_attrs(self): + self.attrs = { + "pool_size": 1, + "pool_type": 'max', + "require_index": False + } + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.adaptive_pool2d(x, **self.attrs) + self.fetch_list = [out.name] + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py index 
b3562d722c4e6..5ff1223961bb7 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 2, 2]) self.feed_fp32 = {"x": data.astype(np.float32)} @@ -47,59 +43,22 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"factor": 2.0} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.pow(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.pow(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): @@ -119,54 +78,14 @@ def set_data_feed(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - factor = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - - out = paddle.fluid.layers.pow(x, factor=factor, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = 
paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + factor = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + out = paddle.fluid.layers.pow(x, factor=factor, **self.attrs) + self.fetch_list = [out.name] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py index c9454e5945f7d..3189e060d5837 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py @@ -30,82 +30,48 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() + @property + def fp16_enabled(self): + return False + def set_data_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float32'), - } + data = np.random.uniform(size=[1, 3, 3, 3]).astype('float32') + self.feed_fp32 = {"x": data.astype(np.float32)} + self.feed_fp16 = {"x": data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [x.dtype for x in self.feed.values()] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] def set_op_attrs(self): self.attrs = {} - def _test_base(self, run_ipu=True): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) - out = paddle.fluid.layers.conv2d( - x, num_filters=3, filter_size=3) - out = paddle.fluid.layers.Print(out, **self.attrs) - - if self.is_training: - loss = paddle.mean(out) - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(loss) - fetch_list = [loss.name] - else: - fetch_list = [out.name] - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - if self.is_training: - result = [] - for _ in range(self.epoch): - loss_res = exe.run(program, - feed=self.feed, - fetch_list=fetch_list) - result.append(loss_res[0]) - return np.array(result) 
- else: - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) + out = paddle.fluid.layers.conv2d(x, num_filters=3, filter_size=3) + out = paddle.fluid.layers.Print(out, **self.attrs) + + if self.is_training: + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) + self.fetch_list = [loss.name] + else: + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) def test(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - - self.assertTrue(res0.shape == res1.shape) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py index 929ee51b65094..93f96e08fd4b7 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -28,10 +28,6 @@ def setUp(self): self.set_training() self.set_test_op() - @property - def fp16_enabled(self): - return True - def set_test_op(self): self.op = paddle.fluid.layers.reduce_mean @@ -40,59 +36,22 @@ def set_feed_attr(self): self.feed_list = list(self.feed_fp32.keys()) self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = self.op(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = self.op(x, **self.attrs) + self.fetch_list = [out.name] - def run_test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - 
output_dict[mode] = self._test_base(mode).flatten() + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict) + def run_test_base(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() def set_data_feed0(self): data = np.random.uniform(size=[2, 4]) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py index 9ddf5c7537fdc..35be4d988273a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) self.feed_fp32 = {"x": data.astype(np.float32)} @@ -50,60 +46,23 @@ def set_op_attrs(self): "inplace": True, } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - add = paddle.fluid.layers.elementwise_add(x, x) - out = paddle.fluid.layers.reshape(add, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode) - - self.check(output_dict, check_shape=True) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + add = paddle.fluid.layers.elementwise_add(x, x) + out = paddle.fluid.layers.reshape(add, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py index 119771931701c..427e975402344 
100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[2, 4, 6]) self.feed_fp32 = {"in_0": data.astype(np.float32)} @@ -48,59 +44,22 @@ def set_op_attrs(self): self.attrs['shape'] = [6, 8] self.attrs['inplace'] = False - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.reshape(x=x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode) - - self.check(output_dict, check_shape=True) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.reshape(x=x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py index ba6eb4d38bcf2..c8f0961baa480 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py @@ -14,9 +14,11 @@ import tempfile import unittest +from functools import partial import numpy as np import paddle +import paddle.optimizer import paddle.static from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @@ -28,7 +30,8 @@ def setUp(self): self.set_atol() self.set_data_feed() self.set_feed_attr() - self.set_op_attrs() + self.set_attrs() + self.set_optimizer() def set_data_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) @@ -39,15 +42,16 @@ def 
set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed_fp32.values()] self.feed_list = list(self.feed_fp32.keys()) - def set_op_attrs(self): + def set_attrs(self): self.attrs = {} self.attrs['steps'] = 100 self.attrs['save_at_step'] = 20 - self.attrs['is_training'] = True - self.attrs['opt_type'] = 'sgd' self.attrs['enable_fp16'] = False self.attrs['model_path'] = tempfile.TemporaryDirectory() + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.SGD, learning_rate=1e-1) + def _test_base(self, save_otherwise_load): scope = paddle.static.Scope() main_prog = paddle.static.Program() @@ -71,16 +75,8 @@ def _test_base(self, save_otherwise_load): name='conv2d') loss = paddle.mean(conv1) - if self.attrs['is_training']: - if self.attrs['opt_type'] == 'sgd': - sgd = paddle.optimizer.SGD(learning_rate=1e-2) - sgd.minimize(loss) - elif self.attrs['opt_type'] == 'adam': - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(loss) - elif self.attrs['opt_type'] == 'lamb': - lamb = paddle.optimizer.Lamb(learning_rate=1e-2) - lamb.minimize(loss) + # apply optimizer + self.optimizer().minimize(loss) fetch_list = [loss.name] place = paddle.IPUPlace() @@ -91,8 +87,7 @@ def _test_base(self, save_otherwise_load): paddle.static.load(main_prog, self.attrs['model_path'].name) ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config( - is_training=self.attrs['is_training']) + ipu_strategy.set_graph_config(is_training=True) ipu_strategy.set_precision_config( enable_fp16=self.attrs['enable_fp16']) ipu_program = paddle.static.IpuCompiledProgram( @@ -131,62 +126,109 @@ def test_base(self): self.attrs['model_path'].cleanup() +class TestMomentum(TestBase): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Momentum, learning_rate=1e-1) + + class TestAdam(TestBase): - def set_op_attrs(self): - self.attrs = {} - self.attrs['steps'] = 100 - self.attrs['save_at_step'] = 20 - self.attrs['is_training'] = True - self.attrs['opt_type'] = 'adam' - self.attrs['enable_fp16'] = False - self.attrs['model_path'] = tempfile.TemporaryDirectory() + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Adam, learning_rate=1e-1) class TestLamb(TestBase): - def set_op_attrs(self): - self.attrs = {} - self.attrs['steps'] = 100 - self.attrs['save_at_step'] = 20 - self.attrs['is_training'] = True - self.attrs['opt_type'] = 'lamb' - self.attrs['enable_fp16'] = False - self.attrs['model_path'] = tempfile.TemporaryDirectory() + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Lamb, learning_rate=1e-1) + + +class TestAdamW(TestBase): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.AdamW, learning_rate=1e-1) + + +class TestAdamax(TestBase): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Adamax, learning_rate=1e-1) + + +class TestAdagrad(TestBase): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Adagrad, learning_rate=1e-1) + + +class TestAdadelta(TestBase): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Adadelta, learning_rate=1e-1) + + +class TestRMSProp(TestBase): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.RMSProp, learning_rate=1e-1) + + +class TestCenteredRMSProp(TestBase): + def set_optimizer(self): + self.optimizer = partial( + paddle.optimizer.RMSProp, learning_rate=1e-1, centered=True) @unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel") class TestSGDFP16(TestBase): - def
set_op_attrs(self): + def set_attrs(self): self.attrs = {} self.attrs['steps'] = 100 self.attrs['save_at_step'] = 20 - self.attrs['is_training'] = True - self.attrs['opt_type'] = 'sgd' self.attrs['enable_fp16'] = True self.attrs['model_path'] = tempfile.TemporaryDirectory() + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.SGD, learning_rate=1e-1) -@unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel") -class TestAdamFP16(TestBase): - def set_op_attrs(self): - self.attrs = {} - self.attrs['steps'] = 100 - self.attrs['save_at_step'] = 20 - self.attrs['is_training'] = True - self.attrs['opt_type'] = 'adam' - self.attrs['enable_fp16'] = True - self.attrs['model_path'] = tempfile.TemporaryDirectory() +class TestMomentumFp16(TestSGDFP16): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Momentum, learning_rate=1e-1) -@unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel") -class TestLambFP16(TestBase): - def set_op_attrs(self): - self.attrs = {} - self.attrs['steps'] = 100 - self.attrs['save_at_step'] = 20 - self.attrs['is_training'] = True - self.attrs['opt_type'] = 'lamb' - self.attrs['enable_fp16'] = True - self.attrs['model_path'] = tempfile.TemporaryDirectory() + +class TestAdamFP16(TestSGDFP16): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Adam, learning_rate=1e-1) + + +class TestLambFP16(TestSGDFP16): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Lamb, learning_rate=1e-1) + + +class TestAdamWFP16(TestSGDFP16): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.AdamW, learning_rate=1e-1) + + +class TestAdamaxFP16(TestSGDFP16): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Adamax, learning_rate=1e-1) + + +class TestAdagradFP16(TestSGDFP16): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Adagrad, learning_rate=1e-1) + + +class TestAdadeltaFP16(TestSGDFP16): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.Adadelta, learning_rate=1e-1) + + +class TestRMSPropFP16(TestSGDFP16): + def set_optimizer(self): + self.optimizer = partial(paddle.optimizer.RMSProp, learning_rate=1e-1) + + +class TestCenteredRMSPropFP16(TestSGDFP16): + def set_optimizer(self): + self.optimizer = partial( + paddle.optimizer.RMSProp, learning_rate=1e-1, centered=True) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py index 49714eba8d4d1..f28bcba4cf0d9 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -51,59 +51,22 @@ def set_op_attrs(self): "bias_after_scale": True, } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.scale(x, **self.attrs) - - fetch_list = [out.name] - - if
exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.scale(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): @@ -155,54 +118,14 @@ def set_op_attrs(self): "bias_after_scale": True, } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - - out = paddle.fluid.layers.scale(x, scale=y, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + out = paddle.fluid.layers.scale(x, scale=y, **self.attrs) + self.fetch_list = [out.name] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py new file mode 100644 index 0000000000000..113b316af4ea9 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py @@ -0,0 +1,131 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import unittest +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_attrs() + + def set_training(self): + self.is_training = True + self.epoch = 100 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]).astype('float32') + self.feed_fp32 = {"image": data.astype(np.float32)} + self.feed_fp16 = {"image": data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def set_attrs(self): + self.attrs = { + "optimizer": 'lamb', + "weight_decay": 0.0, + "scaled_optimizer_state": True + } + + @IPUOpTest.static_graph + def build_model(self): + image = paddle.static.data( + name='image', shape=[1, 3, 10, 10], dtype='float32') + conv1 = paddle.static.nn.conv2d( + image, num_filters=3, filter_size=3, bias_attr=False) + loss = paddle.mean(conv1) + + weight_decay = self.attrs['weight_decay'] + opt = paddle.optimizer.Adam( + learning_rate=1e-1, weight_decay=weight_decay) + if self.attrs['optimizer'] == 'lamb': + opt = paddle.optimizer.Lamb( + learning_rate=1e-1, lamb_weight_decay=weight_decay) + opt.minimize(loss) + self.feed_list = [image.name] + self.fetch_list = [loss.name] + + def run_model(self, exec_mode): + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if self.is_ipu_mode(exec_mode): + if "use_no_bias_optimizer" in self.attrs.keys(): + ipu_strategy.set_options({ + "use_no_bias_optimizer": self.attrs["use_no_bias_optimizer"] + }) + if "scaled_optimizer_state" in self.attrs.keys(): + ipu_strategy.set_options({ + "scaled_optimizer_state": + self.attrs["scaled_optimizer_state"] + }) + self.run_op_test(exec_mode, ipu_strategy=ipu_strategy) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestScaledAdam(TestBase): + def set_attrs(self): + self.attrs = { + "optimizer": 'adam', + "weight_decay": 0.0, + "scaled_optimizer_state": True + } + + def set_atol(self): + super().set_atol() + self.atol = 1e-5 + self.rtol = 1e-5 + + +@unittest.skip('CPU does not support AdamNoBias') +class TestScaledAdamNoBias(TestBase): + def set_attrs(self): + self.attrs = { + "optimizer": 'adam', + "weight_decay": 0.0, + "use_no_bias_optimizer": True, + "scaled_optimizer_state": True + } + + +@unittest.skip('CPU does not support LambNoBias') +class
TestScaledLambNoBias(TestBase): + def set_attrs(self): + self.attrs = { + "optimizer": 'lamb', + "weight_decay": 0.0, + "use_no_bias_optimizer": True, + "scaled_optimizer_state": True + } + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py index 6702ae4344e91..5c61012cacece 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_atol(self): self.atol = 3e-6 self.rtol = 1e-5 @@ -52,67 +48,32 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - conv1 = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - conv2 = paddle.static.nn.conv2d( - conv1, num_filters=3, filter_size=3, bias_attr=False) - conv3 = paddle.static.nn.conv2d( - conv2, num_filters=3, filter_size=3, bias_attr=False) - conv4 = paddle.static.nn.conv2d( - conv3, num_filters=3, filter_size=3, bias_attr=False) - - fetch_list = [conv4.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config( - is_training=self.is_training, micro_batch_size=2) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + conv1 = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + conv2 = paddle.static.nn.conv2d( + conv1, num_filters=3, filter_size=3, bias_attr=False) + conv3 = paddle.static.nn.conv2d( + conv2, num_filters=3, filter_size=3, bias_attr=False) + conv4 = paddle.static.nn.conv2d( + conv3, num_filters=3, filter_size=3, bias_attr=False) + self.fetch_list = [conv4.name] + + def run_model(self, exec_mode): + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config( + is_training=self.is_training, micro_batch_size=2) + self.run_op_test(exec_mode, ipu_strategy) def test(self): - output_dict = {} - for mode in ExecutionMode: - if 
mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py index 8881f018de3b5..ac8ef3e9d65ad 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[4, 5, 6]) self.feed_fp32 = {"in_0": data.astype(np.float32)} @@ -51,59 +47,22 @@ def set_op_attrs(self): "ends": [3, 2, 4], } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.slice(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode) - - self.check(output_dict, check_shape=True) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.slice(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): @@ -135,54 +94,17 @@ def set_data_feed(self): def set_op_attrs(self): self.attrs = {"axes": [0, 1, 2]} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with fluid.scope_guard(scope): - with 
paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - starts = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='int32') - ends = paddle.static.data( - name=self.feed_list[2], - shape=self.feed_shape[2], - dtype='int32') - out = paddle.fluid.layers.slice( - x, starts=starts, ends=ends, **self.attrs) - - fetch_list = [out.name] - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - pass + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + starts = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='int32') + ends = paddle.static.data( + name=self.feed_list[2], shape=self.feed_shape[2], dtype='int32') + out = paddle.fluid.layers.slice( + x, starts=starts, ends=ends, **self.attrs) + self.fetch_list = [out.name] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py index 25201959cecbc..0b2d776cf240b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 2, 20]) self.feed_fp32 = {"in_0": data.astype(np.float32)} @@ -47,59 +43,22 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"axis": -1} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.softmax(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = 
main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.softmax(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py new file mode 100644 index 0000000000000..cb1ed6ad93044 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py @@ -0,0 +1,102 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
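+
+# With soft_label=False (hard labels), softmax_with_cross_entropy computes,
+# for each row i of the logits x:
+#
+#     loss[i] = -log(softmax(x[i])[label[i]])
+#
+# The test below builds the graph once per execution mode. Note that labels
+# are declared and fed as int64 on CPU but cast to int32 for the IPU runs
+# (see build_model and run_model), presumably because the IPU path does not
+# accept int64 label tensors.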
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest +import paddle.nn.functional as F + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_data_feed(self): + x = np.random.uniform(size=[3, 7]) + label = np.arange(3).reshape([3, 1]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {'soft_label': False, } + + @IPUOpTest.static_graph + def build_model(self, on_ipu): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype="float32") + if on_ipu: + label = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='int32') + else: + label = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='int64') + out = F.softmax_with_cross_entropy(x, label, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + if self.is_ipu_mode(exec_mode): + self.feed_fp32['label'] = self.feed_fp32['label'].astype(np.int32) + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model(self.is_ipu_mode(m)) + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = { + 'soft_label': False, + 'ignore_index': 1, + } + + +class TestCase2(TestBase): + def set_data_feed(self): + x = np.random.uniform(size=[30, 70]) + label = np.arange(30).reshape([30, 1]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) + } + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py index 59af3a3d6ac17..63d9584dae37d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,13 +30,8 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data1 = np.random.uniform(size=[1, 3, 10, 10]) - self.feed_fp32 = {'x': data1.astype(np.float32)} self.feed_fp16 = {'x': data1.astype(np.float16)} @@ -47,61 +42,24 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"num_or_sections": [1, 1, 1], "axis": 1} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = 
paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.split(x, **self.attrs) - - fetch_list = [fetch.name for fetch in out] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if (mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled - ) or mode == ExecutionMode.IPU_POPART_FP16: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.split(x, **self.attrs) + self.fetch_list = [fetch.name for fetch in out] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + for k, v in self.output_dict.items(): + self.output_dict[k] = np.concatenate([vv.flatten() for vv in v]) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py index bdc8fb32c8472..33950221ad5e8 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 1, 5]) self.feed_fp32 = {"in_0": data.astype(np.float32)} @@ -47,59 +43,22 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"axes": [0]} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.squeeze(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - 
ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode) - - self.check(output_dict, check_shape=True) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.squeeze(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py index c807ab9aab65e..11a827cee0948 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): x = np.random.uniform(size=[1, 2]) y = np.random.uniform(size=[1, 2]) @@ -57,67 +53,26 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"axis": 0} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - z = paddle.static.data( - name=self.feed_list[2], - shape=self.feed_shape[2], - dtype='float32') - - out = paddle.fluid.layers.stack([x, y, z], **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, 
fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode) - - self.check(output_dict, check_shape=True) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + z = paddle.static.data( + name=self.feed_list[2], shape=self.feed_shape[2], dtype='float32') + out = paddle.fluid.layers.stack([x, y, z], **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py index 12351cb63d6c8..fdc6ce08b6e15 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): x = np.random.uniform(size=[1, 3, 2, 2]) y = np.random.uniform(size=[1, 3, 2, 2]) @@ -48,134 +44,52 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - - out = paddle.fluid.layers.sum([x, y], **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + out = paddle.fluid.layers.sum([x, y], **self.attrs) + self.fetch_list = [out.name] - def 
test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode) + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict, check_shape=True) + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() -@unittest.skip('') class TestCase1(TestBase): - def set_feed(self): + def set_data_feed(self): x = np.random.uniform(size=[1, 3, 2, 2]) y = np.random.uniform(size=[1, 3, 2, 2]) z = np.random.uniform(size=[1, 3, 2, 2]) self.feed_fp32 = { "x": x.astype(np.float32), "y": y.astype(np.float32), - "z": y.astype(np.float32) + "z": z.astype(np.float32) } self.feed_fp16 = { "x": x.astype(np.float16), "y": y.astype(np.float16), - "z": y.astype(np.float16) + "z": z.astype(np.float16) } - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - y = paddle.static.data( - name=self.feed_list[1], - shape=self.feed_shape[1], - dtype='float32') - z = paddle.static.data( - name=self.feed_list[2], - shape=self.feed_shape[2], - dtype='float32') - - out = paddle.fluid.layers.sum([x, y, z], **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - iipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], shape=self.feed_shape[1], dtype='float32') + z = paddle.static.data( + name=self.feed_list[2], shape=self.feed_shape[2], dtype='float32') + out = paddle.fluid.layers.sum([x, y, z], **self.attrs) + self.fetch_list = [out.name] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py index ef75aee78049b..c5331d43f5e55 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,10 +31,6 @@ def setUp(self): self.set_test_op() self.set_op_attrs() - @property - def fp16_enabled(self): - 
return True - def set_test_op(self): self.op = paddle.fluid.layers.topk @@ -53,69 +49,35 @@ def set_op_attrs(self): if not self.use_k_as_const_variable: self.attrs["k"] = 3 - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - if not self.use_k_as_const_variable: - topk_values, topk_indices = self.op(x, **self.attrs) - else: - # !important, popart cannot accept non const tensor - K_t = paddle.fluid.layers.fill_constant( - shape=[1], dtype='int32', value=self.k, name="in_2") - topk_values, topk_indices = self.op(x, K_t, **self.attrs) - - fetch_list = [topk_values.name, topk_indices.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result - - def test_base(self): + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + if not self.use_k_as_const_variable: + topk_values, topk_indices = self.op(x, **self.attrs) + else: + # Important: PopART cannot accept a non-constant tensor for k + K_t = paddle.fluid.layers.fill_constant( + shape=[1], dtype='int32', value=self.k, name="in_2") + topk_values, topk_indices = self.op(x, K_t, **self.attrs) + self.fetch_list = [topk_values.name, topk_indices.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + value_dict = {} index_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - value, index = self._test_base(mode) - value_dict[mode] = value - index_dict[mode] = index - - self.check(value_dict) - self.check(index_dict) + for k, v in self.output_dict.items(): + value_dict[k] = v[0] + index_dict[k] = v[1] + self.check(output_dict=value_dict) + self.check(output_dict=index_dict) class TestCase2(TestTopKOp): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py index 1747bde20b6a6..d5fef73a31b3e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ 
-30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 3, 10, 10]) self.feed_fp32 = {"x": data.astype(np.float32)} @@ -47,59 +43,22 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"perm": [0, 2, 3, 1]} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.transpose(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.transpose(x, **self.attrs) + self.fetch_list = [out.name] - def test(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() + def run_model(self, exec_mode): + self.run_op_test(exec_mode) - self.check(output_dict, check_shape=True) + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check(check_shape=True) class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py index e068c2e3b5908..54cbc571ec6ff 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -30,10 +30,6 @@ def setUp(self): self.set_feed_attr() self.set_op_attrs() - @property - def fp16_enabled(self): - return True - def set_data_feed(self): data = np.random.uniform(size=[1, 2, 3]) self.feed_fp32 = {"x": data.astype(np.float32)} @@ -47,59 +43,22 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {"axes": 0} - def _test_base(self, exec_mode): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with 
paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - - out = paddle.fluid.layers.unsqueeze(x, **self.attrs) - - fetch_list = [out.name] - - if exec_mode == ExecutionMode.CPU_FP32: - place = paddle.CPUPlace() - else: - place = paddle.IPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if exec_mode != ExecutionMode.CPU_FP32: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - if exec_mode == ExecutionMode.IPU_POPART_FP16: - ipu_strategy.set_precision_config(enable_fp16=True) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_fp32 - if exec_mode > ExecutionMode.IPU_FP32: - feed = self.feed_fp16 - - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - output_dict = {} - for mode in ExecutionMode: - if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: - break - output_dict[mode] = self._test_base(mode).flatten() - - self.check(output_dict, check_shape=True) + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.unsqueeze(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check(check_shape=True) class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py index ecf1c61f52e83..30e003917efbd 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py @@ -50,72 +50,57 @@ def set_feed_attr(self): def set_op_attrs(self): self.attrs = {} - def _test_base(self, run_ipu=True): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = self.SEED - startup_prog.random_seed = self.SEED - - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='int64') - - with paddle.static.ipu_shard_guard(index=0, stage=0): - y = paddle.fluid.layers.embedding( - input=x, - size=[768, 768], - dtype='float32', - param_attr=paddle.fluid.ParamAttr( - name='word_embedding'), - is_sparse=False) - - with paddle.static.ipu_shard_guard(index=1, stage=1): - z = paddle.fluid.layers.fc( - input=y, - size=768, - param_attr=paddle.fluid.ParamAttr(name="fc")) - - with paddle.static.ipu_shard_guard(index=0, stage=2): - out = paddle.fluid.layers.matmul( - x=z, - y=main_prog.global_block().var('word_embedding'), - transpose_y=True) - - fetch_list = [out.name] - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config( - num_ipus=2, - is_training=self.is_training, - enable_manual_shard=True) - 
ipu_strategy.set_pipelining_config( - enable_pipelining=True, batches_per_step=3) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - feed = self.feed_ipu if run_ipu else self.feed_cpu - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data( + name=self.feed_list[0], shape=self.feed_shape[0], dtype='int64') + with paddle.static.ipu_shard_guard(index=0, stage=0): + y = paddle.fluid.layers.embedding( + input=x, + size=[768, 768], + dtype='float32', + param_attr=paddle.fluid.ParamAttr(name='word_embedding'), + is_sparse=False) + with paddle.static.ipu_shard_guard(index=1, stage=1): + z = paddle.fluid.layers.fc( + input=y, size=768, param_attr=paddle.fluid.ParamAttr(name="fc")) + with paddle.static.ipu_shard_guard(index=0, stage=2): + out = paddle.fluid.layers.matmul( + x=z, + y=self.main_prog.global_block().var('word_embedding'), + transpose_y=True) + self.feed_list = [x.name] + self.fetch_list = [out.name] + + def run_model(self, run_ipu): + self.build_model() + if run_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(self.startup_prog) + if run_ipu: + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config( + num_ipus=2, + is_training=self.is_training, + enable_manual_shard=True) + ipu_strategy.set_pipelining_config( + enable_pipelining=True, batches_per_step=3) + program = paddle.static.IpuCompiledProgram( + self.main_prog, ipu_strategy=ipu_strategy).compile( + self.feed_list, self.fetch_list) + else: + program = self.main_prog + + feed = self.feed_ipu if run_ipu else self.feed_cpu + result = exe.run(program, feed=feed, fetch_list=self.fetch_list) + return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + res0 = self.run_model(False) + res1 = self.run_model(True) self.assertTrue( np.allclose( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_detect_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_detect_pass.py new file mode 100644 index 0000000000000..828e92dc03426 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_detect_pass.py @@ -0,0 +1,138 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
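+
+# The pass under test recognizes a reshape2 -> transpose2 -> reshape2 chain
+# that is equivalent to a channel shuffle. A minimal NumPy sketch of that
+# equivalence, using the shapes sampled below (names are illustrative only):
+#
+#     x = np.random.random([1, 128, 32, 32]).astype(np.float32)  # NCHW input
+#     g = 2                                      # shuffle group count
+#     y = x.reshape([1, g, 128 // g, 32, 32])    # first reshape2
+#     y = y.transpose([0, 2, 1, 3, 4])           # transpose2
+#     y = y.reshape([1, 128, 32, 32])            # second reshape2
+#     # y now equals shuffle_channel(x, group=g), the single fused op the
+#     # pass is expected to produce.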
+ +from auto_scan_test import PassAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +from functools import partial +import unittest + +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st + + +def product(input): + result = 1 + + for value in input: + result = result * value + + return result + + +class TestShuffleChannelMKLDNNDetectPass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + input_shape = program_config.inputs['input_data'].shape + first_reshape2_shape = program_config.ops[0].attrs['shape'] + transpose2_axis = program_config.ops[1].attrs['axis'] + second_reshape2_shape = program_config.ops[2].attrs['shape'] + + shape_prod = product(input_shape) + img_h = input_shape[-2] + img_w = input_shape[-1] + + if shape_prod != product(first_reshape2_shape) or shape_prod != product( + second_reshape2_shape): + return False + if len(input_shape) != 4 or len(first_reshape2_shape) != 5 or len( + second_reshape2_shape) != 4: + return False + if transpose2_axis != [0, 2, 1, 3, 4]: + return False + if first_reshape2_shape[-1] != img_w or first_reshape2_shape[ + -2] != img_h: + return False + if second_reshape2_shape[-1] != img_w or second_reshape2_shape[ + -2] != img_h: + return False + + return True + + def sample_program_config(self, draw): + input_shape = draw(st.sampled_from([[128, 32, 32]])) + first_reshape2_shape = draw( + st.sampled_from([[2, 64, 32, 32], [8, 16, 32, 32]])) + transpose2_axis = draw(st.sampled_from([[0, 2, 1, 3, 4], [0, 2, 1, 3]])) + second_reshape2_shape = draw( + st.sampled_from([[128, 32, 32], [128, 31, 32]])) + batch_size = draw(st.integers(min_value=1, max_value=10)) + + input_shape.insert(0, batch_size) + first_reshape2_shape.insert(0, batch_size) + second_reshape2_shape.insert(0, batch_size) + + def generate_input(): + return np.random.random(input_shape).astype(np.float32) + + ops_config = [{ + "op_type": "reshape2", + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": { + "Out": ["first_reshape2_output"], + "XShape": ["first_reshape2_xshape"] + }, + "op_attrs": { + 'shape': first_reshape2_shape + }, + }, { + "op_type": "transpose2", + "op_inputs": { + "X": ["first_reshape2_output"] + }, + "op_outputs": { + "Out": ["transpose2_output"], + "XShape": ["transpose2_xshape"] + }, + "op_attrs": { + 'axis': transpose2_axis + }, + }, { + "op_type": "reshape2", + "op_inputs": { + "X": ["transpose2_output"], + }, + "op_outputs": { + "Out": ["output_data"], + "XShape": ["second_reshape2_xshape"] + }, + "op_attrs": { + 'shape': second_reshape2_shape + } + }] + + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig(data_gen=partial(generate_input)) + }, + outputs=["output_data"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ["shuffle_channel"], (1e-5, 1e-5) + + def test(self): + self.run_and_statis( + quant=False, passes=["shuffle_channel_mkldnn_detect_pass"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index a2441b28bf96d..13c72bedefa8e 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -15,6 +15,7 @@ from __future__ import print_function import os +import sys 
import unittest import warnings import numpy as np @@ -37,20 +38,22 @@ from paddle.fluid.op import Operator from paddle.fluid.executor import Executor from paddle.fluid.framework import Program, OpProtoHolder, Variable, _current_expected_place -from paddle.fluid.tests.unittests.testsuite import ( +from paddle.fluid import unique_name +from paddle.fluid.dygraph.dygraph_to_static.utils import parse_arg_and_kwargs + +sys.path.append(os.path.abspath(os.path.dirname(__file__))) +from testsuite import ( create_op, set_input, append_input_output, append_loss_ops, ) -from paddle.fluid import unique_name -from paddle.fluid.tests.unittests.white_list import ( +from white_list import ( op_accuracy_white_list, check_shape_white_list, compile_vs_runtime_white_list, no_check_set_white_list, op_threshold_white_list, no_grad_set_white_list, ) -from paddle.fluid.dygraph.dygraph_to_static.utils import parse_arg_and_kwargs # For switch new eager mode globally g_is_in_eager = _in_eager_without_dygraph_check() diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index 72240be41dd49..570551e82646f 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -52,6 +52,9 @@ def test_grad(self): class TestSigmoidDoubleGradCheck(unittest.TestCase): + def sigmoid_wrapper(self, x): + return fluid.layers.sigmoid(x[0]) + @prog_scope() def func(self, place): shape = [2, 3, 7, 9] @@ -64,6 +67,8 @@ def func(self, place): x_arr[np.abs(x_arr) < 0.005] = 0.002 gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.sigmoid_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() @@ -75,6 +80,9 @@ def test_grad(self): class TestTanhTripleGradCheck(unittest.TestCase): + def tanh_wrapper(self, x): + return paddle.tanh(x[0]) + @prog_scope() def func(self, place): shape = [2, 3, 7, 9] @@ -87,6 +95,8 @@ def func(self, place): x_arr[np.abs(x_arr) < 0.005] = 0.002 gradient_checker.triple_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.triple_grad_check_for_dygraph( + self.tanh_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() @@ -98,6 +108,9 @@ def test_grad(self): class TestTanhDoubleGradCheck(unittest.TestCase): + def tanh_wrapper(self, x): + return paddle.tanh(x[0]) + @prog_scope() def func(self, place): shape = [2, 3, 7, 9] @@ -110,6 +123,8 @@ def func(self, place): x_arr[np.abs(x_arr) < 0.005] = 0.002 gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.tanh_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() @@ -146,6 +161,9 @@ def test_grad(self): class TestLeakyReluDoubleGradCheck(unittest.TestCase): + def leaky_relu_wrapper(self, x): + return paddle.nn.functional.leaky_relu(x[0], negative_slope=0.2) + @prog_scope() def func(self, place): shape = [2, 3, 7, 9] @@ -162,6 +180,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.leaky_relu_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() @@ -173,6 +193,9 @@ def test_grad(self): class TestELUDoubleGradCheck(unittest.TestCase): + def elu_wrapper(self, x): + return 
paddle.nn.functional.elu(x[0], alpha=0.2) + @prog_scope() def func(self, place): shape = [2, 4, 4, 4] @@ -189,6 +212,8 @@ def func(self, place): x_arr = np.random.uniform(-1, 1, shape).astype(dtype) gradient_checker.double_grad_check( [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.elu_wrapper, [x], y, x_init=x_arr, place=place) def test_grad(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_assign_pos_op.py b/python/paddle/fluid/tests/unittests/test_assign_pos_op.py index 72924f242d211..46761063b8af2 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_pos_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_pos_op.py @@ -24,6 +24,7 @@ from paddle.fluid import compiler, Program, program_guard from paddle.fluid.backward import append_backward from paddle.distributed.models.moe import utils +from paddle.fluid.framework import _test_eager_guard def assign_pos(x, _cum_count): @@ -117,7 +118,7 @@ def test_api_static(self): fetch_list=[out]) assert_allclose(res[0], self.out, self.cum_count) - def test_api_dygraph(self): + def func_api_dygraph(self): paddle.disable_static() x = paddle.to_tensor(self.x) cum_count = paddle.to_tensor(self.cum_count).astype(x.dtype) @@ -125,6 +126,11 @@ def test_api_dygraph(self): out = utils._assign_pos(x, cum_count) assert_allclose(out.numpy(), self.out, self.cum_count) + def test_api_dygraph(self): + with _test_eager_guard(): + self.func_api_dygraph() + self.func_api_dygraph() + if __name__ == '__main__': paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_cond.py b/python/paddle/fluid/tests/unittests/test_cond.py index 0470a2df35f68..d9cb0ccf48209 100644 --- a/python/paddle/fluid/tests/unittests/test_cond.py +++ b/python/paddle/fluid/tests/unittests/test_cond.py @@ -235,12 +235,13 @@ def test_extremely_simple_net_with_op_in_condition(self): place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( ) else fluid.CPUPlace() exe = fluid.Executor(place) - ret = exe.run(main_program, fetch_list=[out, a.grad_name, b.grad_name]) + ret = exe.run(main_program, + fetch_list=[out, b, a.grad_name, b.grad_name]) # Note: fill_constant has a loss of precision, so you have to assertEqual # with values that don't lose precision in floating-point numbers. 
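+ # Fetching b as well allows comparing out against the computed value of b
+ # below, instead of against a hard-coded floating-point literal.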
- self.assertEqual(ret[0][0], 1.25) - self.assertEqual(ret[1][0], 0.0) - self.assertEqual(ret[2][0], 1.0) + self.assertEqual(ret[0][0], ret[1][0]) + self.assertEqual(ret[2][0], 0.0) + self.assertEqual(ret[3][0], 1.0) class TestCondNestedControlFlow(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 6a9f7a47f66cc..fdb93e1f1afdd 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -172,9 +172,9 @@ def test_check_grad_no_input(self): def create_test_cudnn_bf16_class(parent): @unittest.skipIf( - not core.is_compiled_with_cuda() or core.cudnn_version() < 8100, - "core is not compiled with CUDA and cudnn version need larger than 8.1.0" - ) + not core.is_compiled_with_cuda() or + not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or bfloat16 is not supported") class TestConv2DCUDNNBF16(parent): def get_numeric_grad(self, place, check_name): scope = core.Scope() diff --git a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py index 6976019210283..6033b809f218d 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py @@ -108,7 +108,6 @@ def test_generator_randint_dygraph(self): if core.is_compiled_with_cuda(): print(">>>>>>> randint dygraph >>>>>>>") - self.assertTrue(np.allclose(x1_np, x2_np)) self.assertTrue(np.allclose(x_np, x3_np)) def test_gen_TruncatedNormal_initializer(self): diff --git a/python/paddle/fluid/tests/unittests/test_dataloader_autotune.py b/python/paddle/fluid/tests/unittests/test_dataloader_autotune.py index a140bb5c79c93..7348783bd6748 100755 --- a/python/paddle/fluid/tests/unittests/test_dataloader_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_dataloader_autotune.py @@ -15,12 +15,14 @@ from __future__ import print_function import unittest import numpy as np - +import tempfile +import warnings +import json import paddle import paddle.nn as nn from paddle.io import Dataset, DataLoader, BatchSampler, SequenceSampler -from paddle.fluid.reader import set_autotune_config import sys +import os class RandomDataset(Dataset): @@ -51,12 +53,21 @@ def setUp(self): self.dataset = RandomDataset(10) def test_dataloader_use_autotune(self): - set_autotune_config(True, 1) + paddle.incubate.autotune.set_config( + config={"dataloader": { + "enable": True, + "tuning_steps": 1, + }}) loader = DataLoader( self.dataset, batch_size=self.batch_size, num_workers=0) def test_dataloader_disable_autotune(self): - set_autotune_config(False) + config = {"dataloader": {"enable": False, "tuning_steps": 1}} + tfile = tempfile.NamedTemporaryFile(mode="w+", delete=False) + json.dump(config, tfile) + tfile.close() + paddle.incubate.autotune.set_config(tfile.name) + os.remove(tfile.name) loader = DataLoader( self.dataset, batch_size=self.batch_size, num_workers=2) if (sys.platform == 'darwin' or sys.platform == 'win32'): @@ -65,12 +76,28 @@ def test_dataloader_disable_autotune(self): self.assertEqual(loader.num_workers, 2) def test_distributer_batch_sampler_autotune(self): - set_autotune_config(True, 1) + paddle.incubate.autotune.set_config( + config={"dataloader": { + "enable": True, + "tuning_steps": 1, + }}) batch_sampler = paddle.io.DistributedBatchSampler( self.dataset, batch_size=self.batch_size) loader = DataLoader( self.dataset, 
batch_sampler=batch_sampler, num_workers=2) +class TestAutoTuneAPI(unittest.TestCase): + def test_set_config_warnings(self): + with warnings.catch_warnings(record=True) as w: + config = {"kernel": {"enable": 1, "tuning_range": True}} + tfile = tempfile.NamedTemporaryFile(mode="w+", delete=False) + json.dump(config, tfile) + tfile.close() + paddle.incubate.autotune.set_config(tfile.name) + os.remove(tfile.name) + self.assertTrue(len(w) == 2) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py index 6a32a68db1be8..348945b73e1a4 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_dataset.py @@ -481,11 +481,9 @@ def test_in_memory_dataset_run_2(self): dataset._set_fleet_send_sleep_seconds(2) dataset.preload_into_memory() dataset.wait_preload_done() - dataset.release_memory() dataset.preload_into_memory(1) dataset.wait_preload_done() dataset.dataset.merge_by_lineid() - dataset.release_memory() dataset._set_merge_by_lineid(30) dataset._set_parse_ins_id(False) dataset.load_into_memory() diff --git a/python/paddle/fluid/tests/unittests/test_diff_op.py b/python/paddle/fluid/tests/unittests/test_diff_op.py index 99a46bfd9584d..b435975452009 100644 --- a/python/paddle/fluid/tests/unittests/test_diff_op.py +++ b/python/paddle/fluid/tests/unittests/test_diff_op.py @@ -19,8 +19,7 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.fluid.core as core -from paddle.fluid.framework import _enable_legacy_dygraph -_enable_legacy_dygraph() +from paddle.fluid.framework import _test_eager_guard class TestDiffOp(unittest.TestCase): @@ -55,7 +54,7 @@ def setUp(self): if core.is_compiled_with_cuda(): self.places.append(paddle.CUDAPlace(0)) - def test_dygraph(self): + def func_dygraph(self): for place in self.places: paddle.disable_static() x = paddle.to_tensor(self.input, place=place) @@ -71,6 +70,13 @@ def test_dygraph(self): append=self.append) self.assertTrue((out.numpy() == self.output).all(), True) + def test_dygraph(self): + with _test_eager_guard(): + self.setUp() + self.func_dygraph() + self.setUp() + self.func_dygraph() + def test_static(self): paddle.enable_static() places = [fluid.CPUPlace()] @@ -110,7 +116,7 @@ def test_static(self): fetch_list=[out]) self.assertTrue((fetches[0] == self.output).all(), True) - def test_grad(self): + def func_grad(self): for place in self.places: x = paddle.to_tensor(self.input, place=place, stop_gradient=False) if self.prepend is not None: @@ -129,6 +135,13 @@ def test_grad(self): except: raise RuntimeError("Check Diff Gradient Failed") + def test_grad(self): + with _test_eager_guard(): + self.setUp() + self.func_grad() + self.setUp() + self.func_grad() + class TestDiffOpAxis(TestDiffOp): def set_args(self): diff --git a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py index af99529adfa74..315580dd31ad7 100644 --- a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py +++ b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py @@ -34,7 +34,9 @@ def remove_file_if_exists(file_name): shutil.rmtree(file_name) -def run_test(clip_after_allreduce=True, max_global_norm=-1.0): +def run_test(clip_after_allreduce=True, + max_global_norm=-1.0, + gradient_merge_steps=1): if not paddle.is_compiled_with_cuda(): return if 
os.name == 'nt': @@ -55,6 +57,7 @@ def run_test(clip_after_allreduce=True, max_global_norm=-1.0): os.environ['CLIP_AFTER_ALLREDUCE'] = str(clip_after_allreduce) os.environ['MAX_GLOBAL_NORM'] = str(max_global_norm) + os.environ['GRADIENT_MERGE_STEPS'] = str(gradient_merge_steps) touch_file_env = 'SUCCESS_TOUCH_FILE' touch_file_name = 'distributed_fused_lamb_touch_file_{}'.format(os.getpid()) diff --git a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py new file mode 100644 index 0000000000000..1822b77d0d0e5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py @@ -0,0 +1,28 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from test_distributed_fused_lamb_op_with_clip import run_test +import unittest + + +class TestDistributedFusedLambGradientMerge(unittest.TestCase): + def test_gm(self): + run_test( + clip_after_allreduce=True, + max_global_norm=-1.0, + gradient_merge_steps=2) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py index 20abeaec7268c..e8d4fc260b87a 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -23,7 +23,6 @@ import paddle.fluid as fluid from paddle.fluid import Program, program_guard from paddle.fluid.framework import _test_eager_guard, _enable_legacy_dygraph -_enable_legacy_dygraph() import os from paddle import _C_ops @@ -979,6 +978,7 @@ def test_backward_downscale_in_infer_eager(self): ), self.cal_grad_downscale_in_infer(mask.numpy()))) def test_backward_upscale_train(self): + _enable_legacy_dygraph() for place in self.places: with fluid.dygraph.guard(place): @@ -1010,6 +1010,7 @@ def test_backward_upscale_train_eager(self): ), self.cal_grad_upscale_train(mask.numpy(), prob))) def test_backward_upscale_train_2(self): + _enable_legacy_dygraph() for place in self.places: with fluid.dygraph.guard(place): @@ -1025,6 +1026,23 @@ def test_backward_upscale_train_2(self): np.allclose(input.gradient( ), self.cal_grad_upscale_train(mask.numpy(), prob))) + def test_backward_upscale_train_2_eager(self): + for place in self.places: + with fluid.dygraph.guard(place): + with _test_eager_guard(): + + prob = 0.3 + input = paddle.uniform([40, 40], dtype="float32") + input.stop_gradient = False + out, mask = _C_ops.final_state_dropout( + input, None, 0.3, False, "upscale_in_train", 0, False) + + out.backward() + + self.assertTrue( + np.allclose(input.gradient( + ), self.cal_grad_upscale_train(mask.numpy(), prob))) + class TestRandomValue(unittest.TestCase): def test_fixed_random_number(self): diff --git a/python/paddle/fluid/tests/unittests/test_eigh_op.py b/python/paddle/fluid/tests/unittests/test_eigh_op.py index 
9c9cd883313a2..2abbcc98a6b7e 100644 --- a/python/paddle/fluid/tests/unittests/test_eigh_op.py +++ b/python/paddle/fluid/tests/unittests/test_eigh_op.py @@ -19,8 +19,6 @@ import paddle from op_test import OpTest from gradient_checker import grad_check -from paddle.fluid.framework import _enable_legacy_dygraph -_enable_legacy_dygraph() def valid_eigh_result(A, eigh_value, eigh_vector, uplo): diff --git a/python/paddle/fluid/tests/unittests/test_einsum_v2.py b/python/paddle/fluid/tests/unittests/test_einsum_v2.py new file mode 100644 index 0000000000000..63acaf6396913 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_einsum_v2.py @@ -0,0 +1,468 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import contextlib +import unittest +import paddle +from paddle.fluid import core + +import os +os.environ['FLAGS_new_einsum'] = "1" + + +def error_trans(func, *args, **kargs): + """ + Translate a C++ exception into a Python exception, + because einsum_v2 raises different exceptions than einsum_v1. + """ + try: + out = func(*args, **kargs) + except ValueError as e: + if "Same label have different shapes" in str(e): + raise AssertionError("Invalid operands: label i " + "corresponds to non-broadcastable dimensions.") + + +class TestErrors(unittest.TestCase): + def setUp(self): + pass + + def test_diagonalize_errors(self): + a = np.arange(4 * 3 * 4 * 4).reshape(4, 3, 4, 4).astype('float') + a = paddle.to_tensor(a) + with self.assertRaisesRegex(AssertionError, + ('Duplicate labels are not supported.')): + paddle.einsum('...ii->...i', a) + with self.assertRaisesRegex(AssertionError, + ('Duplicate labels are not supported.')): + paddle.einsum('i...i', a) + with self.assertRaisesRegex(AssertionError, + ('Duplicate labels are not supported.')): + paddle.einsum('i...i->i...', a) + + def test_param_errors(self): + a = np.arange(4 * 3 * 4 * 4).reshape(4, 3, 4, 4).astype('float') + a = paddle.to_tensor(a) + with self.assertRaisesRegex( + AssertionError, + ("Required at least one operand in Einsum API, but received 0 ")): + paddle.einsum('ijk') + with self.assertRaisesRegex(AssertionError, ( + 'Invalid equation: multiple `->` were found.')): + paddle.einsum('i -> j -> k', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: the number of operands is 2, " + "but found 3 segments in the label equation.")): + paddle.einsum('i,j,k', a, a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: the number of operands is 2, " + "but found 1 segments in the label equation.")): + paddle.einsum('ij -> k', a, a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: the number of operands is 1, " + "but found 2 segments in the label equation.")): + paddle.einsum('i, -> k', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: the label string '' misses dimensions.")): + paddle.einsum('->', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: the label
string 'i' misses dimensions.")): + paddle.einsum('i', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: _ is not a valid label, " + "which should be letters.")): + paddle.einsum('i_', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: `.` is found outside of an ellipsis.")): + paddle.einsum('i..j', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: `.` is found outside of an ellipsis.")): + paddle.einsum('...k...', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: missing ellipsis in output labels.")): + paddle.einsum('i...->i', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid equation: duplicate output labels are found.")): + paddle.einsum('i...->i...i', a) + with self.assertRaisesRegex(AssertionError, ( + "Invalid operands: label i " + "corresponds to non-broadcastable dimensions.")): + error_trans(paddle.einsum, 'ij...,ji...', a, a) + + +class TestEinsum(unittest.TestCase): + @classmethod + def setUpClass(cls): + np.random.seed(12345) + + cls.TEST_SAMPLES = { + "a": np.random.rand(1, 1), + "b": np.random.rand(1), + "x": np.random.rand(5), + "y": np.random.rand(7), + "A": np.random.rand(4, 5), + "B": np.random.rand(2, 5), + "C": np.random.rand(3, 7), + "D": np.random.rand(3, 4, 5), + "E": np.random.rand(3, 5, 2), + "F": np.random.rand(2, 4, 5, 3), + "G": np.random.rand(4, 2, 5), + "H": np.random.rand(3, 2, 4), + "I": np.random.rand(2, 2), + "J": np.random.rand(1, 3, 5), + "K": np.random.rand(1, 2, 3, 4), + } + + def _get_place(self, force_to_use_cpu=False): + if force_to_use_cpu: + return core.CPUPlace() + else: + if core.is_compiled_with_cuda(): + return core.CUDAPlace(0) + return core.CPUPlace() + + def check_output_equal(self, actual, expect, rtol=1.e-5, atol=1.e-8): + error_msg = 'Output has diff at place:{}. 
\nExpect: {} \nBut Got: {} in class {}' + self.assertTrue( + np.allclose( + actual, expect, rtol=rtol, atol=atol), + error_msg.format(paddle.get_device(), expect, actual, + self.__class__.__name__)) + + def setUp(self): + self.sample = {"paradigm": "i->", "data": ["x"]} + + def test_forward(self): + operands = [ + TestEinsum.TEST_SAMPLES[operand] for operand in self.sample["data"] + ] + expected_result = np.einsum(self.sample["paradigm"], *operands) + equation = self.sample["paradigm"] + + with paddle.fluid.dygraph.guard( + self._get_place(force_to_use_cpu=False)): + pd_operands = [paddle.to_tensor(operand) for operand in operands] + result = paddle.einsum(equation, *pd_operands) + self.check_output_equal(result.numpy(), expected_result) + + with paddle.fluid.dygraph.guard(self._get_place(force_to_use_cpu=True)): + pd_operands = [paddle.to_tensor(operand) for operand in operands] + result = paddle.einsum(equation, *pd_operands) + self.check_output_equal(result.numpy(), expected_result) + + +class TestEinsumVectorDot(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "i,i->", "data": ["x", "x"]} + + +class TestEinsumVectorMul(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "i,i->i", "data": ["x", "x"]} + + +class TestEinsumVectorOuter(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "i,j->ij", "data": ["x", "y"]} + + +class TestEinsumMatrixTranspose(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij->ji", "data": ["A"]} + + +class TestEinsumMatrixRowSum(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij->j", "data": ["A"]} + + +class TestEinsumMatrixColSum(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij->i", "data": ["A"]} + + +class TestEinsumMatrixEleMul(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,ij->ij", "data": ["A", "A"]} + + +class TestEinsumDegenerateMatrixVecMul(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,j", "data": ["a", "b"]} + + +class TestEinsumMatrixVecMul(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,j->i", "data": ["A", "x"]} + + +class TestEinsumMatrixMul(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,kj->ik", "data": ["A", "B"]} + + +class TestEinsumMatrixOuter(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,kl->ijkl", "data": ["A", "C"]} + + +class TestEinsumTensorBMM(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "bij,bjk->bik", "data": ["D", "E"]} + + +class TestEinsumTensorContract1(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijk,jk->i", "data": ["D", "A"]} + + +class TestEinsumTensorContract2(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijk,lk->ijl", "data": ["D", "B"]} + + +class TestEinsumTensorContract3(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "abcd,dfg->abcfg", "data": ["F", "D"]} + + +class TestEinsumTensorContract4(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijk,jk->ik", "data": ["D", "A"]} + + +class TestEinsumTensorContract5(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijk,jk->ij", "data": ["D", "A"]} + + +class TestEinsumTensorContract6(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ik, ijk->j", "data": ["A", "G"]} + + +class TestEinsumTensorContract7(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijk, ik->jk", "data": ["G", "A"]} + + +class TestEinsumEllipsis1(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "i...->...", "data": ["G"]} + + 
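Each of the cases above and below reuses TestEinsum.test_forward unchanged and only swaps the sample, so every paradigm is validated against np.einsum on GPU (when compiled with CUDA) and on CPU. Adding a case is a three-line subclass; a hypothetical extra one, not part of this diff, might look like:

    class TestEinsumTensorVecMul(TestEinsum):  # hypothetical illustration
        def setUp(self):
            # contracts the last axis of sample "D" (shape (3, 4, 5)) with the
            # length-5 vector sample "x"
            self.sample = {"paradigm": "ijk,k->ij", "data": ["D", "x"]}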
+class TestEinsumEllipsis2(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ij,...i->j...", "data": ["A", "H"]} + + +class TestEinsumEllipsis3(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "k...,jk", "data": ["F", "I"]} + + +class TestEinsumTestEinsumBilinear(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "bn,anm,bm->ba", "data": ["B", "E", "I"]} + + +class TestEinsumTestEinsumOthers1(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijkl, lmn->kmn", "data": ["F", "H"]} + + +class TestEinsumTestEinsumOthers2(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "ijkl, lmn->ijn", "data": ["F", "H"]} + + +class TestEinsumBatch1(TestEinsum): + def setUp(self): + self.sample = {"paradigm": "blq,bhlk->bhlqk", "data": ["J", "K"]} + + +class TestNumpyTests(unittest.TestCase): + def setUp(self): + pass + + def _get_place(self, force_to_use_cpu=False): + if force_to_use_cpu: + return core.CPUPlace() + else: + if core.is_compiled_with_cuda(): + return core.CUDAPlace(0) + return core.CPUPlace() + + def check_output_equal(self, actual, expect, rtol=1.e-5, atol=1.e-8): + error_msg = 'Output has diff at place:{}. \nExpect: {} \nBut Got: {} in class {}' + self.assertTrue( + np.allclose( + actual, expect, rtol=rtol, atol=atol), + error_msg.format(paddle.get_device(), expect, actual, + self.__class__.__name__)) + + def check_output(self, eqn, *ops): + expect = np.einsum(eqn, *ops) + with paddle.fluid.dygraph.guard( + self._get_place(force_to_use_cpu=False)): + pd_operands = [paddle.to_tensor(op) for op in ops] + actual = paddle.einsum(eqn, *pd_operands) + self.check_output_equal(actual.numpy(), expect) + + def test_sums(self): + for n in range(1, 17): + a = np.arange(n).astype('float') + self.check_output("i->", a) + + for n in range(1, 17): + a = np.arange(2 * 3 * n).reshape(2, 3, n).astype('float') + self.check_output("...i->...", a) + + for n in range(1, 17): + a = np.arange(2 * n).reshape(2, n).astype('float') + self.check_output("i...->...", a) + + for n in range(1, 17): + a = np.arange(2 * 3 * n).reshape(2, 3, n).astype('float') + self.check_output("i...->...", a) + + for n in range(1, 17): + a = np.arange(3 * n).reshape(3, n).astype('float') + b = np.arange(2 * 3 * n).reshape(2, 3, n).astype('float') + self.check_output("..., ...", a, b) + + for n in range(1, 17): + a = np.arange(2 * 3 * n).reshape(2, 3, n).astype('float') + b = np.arange(n).astype('float') + self.check_output("...i, ...i", a, b) + + for n in range(1, 11): + a = np.arange(n * 3 * 2).reshape(n, 3, 2).astype('float') + b = np.arange(n).astype('float') + self.check_output("i..., i...", a, b) + + for n in range(1, 17): + a = (np.arange(3) + 1).astype('float') + b = (np.arange(n) + 1).astype('float') + self.check_output("i,j", a, b) + + for n in range(1, 17): + a = np.arange(4 * n).reshape(4, n).astype('float') + b = np.arange(n).astype('float') + self.check_output("ij, j", a, b) + + for n in range(1, 17): + a = np.arange(4 * n).reshape(4, n).astype('float') + b = np.arange(n).astype('float') + self.check_output("ji,j", a.T, b.T) + + for n in range(1, 17): + a = np.arange(4 * n).reshape(4, n).astype('float') + b = np.arange(n * 6).reshape(n, 6).astype('float') + self.check_output("ij,jk", a, b) + + a = np.arange(12).reshape(3, 4).astype('float') + b = np.arange(20).reshape(4, 5).astype('float') + c = np.arange(30).reshape(5, 6).astype('float') + self.check_output("ij,jk,kl", a, b, c) + + a = np.arange(60).reshape(3, 4, 5).astype('float') + b = np.arange(24).reshape(4, 3, 
2).astype('float') + self.check_output("ijk, jil -> kl", a, b) + + for n in range(1, 25): + a = np.arange(n).astype('float') + self.check_output("...,...", a, a) + self.check_output("i,i", a, a) + + # TODO(@xiongkun): explicit broadcast in EinsumOp is not supported, it's not recommended to use einsum like this. + #p = np.ones((10, 2)).astype('float') + #q = np.ones((1, 2)).astype('float') + #self.check_output('ij,ij->j', p, q) + + # TODO(@xiongkun): explicit-label-broadcast in EinsumOp is not supported, it's not recommended to use einsum like this. + #x = np.array([2., 3.]).astype('float') + #y = np.array([4.]).astype('float') + #self.check_output("i, i", x, y) + + # TODO(@xiongkun): explicit-label-broadcast in EinsumOp is not supported, it's not recommended to use einsum like this. + #p = np.ones((1, 5)) / 2 + #q = np.ones((5, 5)) / 2 + #self.check_output("...ij,...jk->...ik", p, p) + #self.check_output("...ij,...jk->...ik", p, q) + + x = np.eye(2).astype('float') + y = np.ones(2).astype('float') + self.check_output("ji,i->", x, y) + self.check_output("i,ij->", y, x) + self.check_output("ij,i->", x, y) + + def test_large_nops(self): + pass + # TODO(@xiongkun): explicit broadcast in EinsumOp is not supported, it's not recommended to use einsum like this. + #a = np.arange(4 * 3 * 1 * 4).reshape(4, 3, 1, 4).astype('float') + #self.check_output('a...b,b...c,c...d', a, a, a) + #self.check_output('a...b,b...c,c...a', a, a, a) + #self.check_output('a...b,b...c,c...a', a, a, a) + #self.check_output('...ab,...ba,...ab,...ab', a, a, a, a) + + def test_static_graph(self): + paddle.enable_static() + fluid = paddle.fluid + if fluid.core.is_compiled_with_cuda(): + self.place = fluid.CUDAPlace(0) + else: + self.place = fluid.CPUPlace() + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + a = paddle.static.data( + name='a', shape=[3, None, None, None], dtype='float') + b = paddle.static.data( + name='b', shape=[2, None, None, None], dtype='float') + c = paddle.static.data( + name='c', shape=[None, None, 2, None], dtype='float') + d = paddle.static.data( + name='d', shape=[None, None, 5], dtype='float') + e = paddle.static.data( + name='e', shape=[None, 2, None], dtype='float') + + outs = [] + outs.append(paddle.einsum("ibnd,jbnd->bnij", a, b)) + outs.append(paddle.einsum('...ik, ...j', c, d)) + outs.append(paddle.einsum('...kj, ...ik', d, e)) + outs.append(paddle.einsum('ijk..., ikj', c, e)) + outs.append(paddle.einsum('ijk..., ikj->...ij', c, e)) + exe = fluid.Executor(self.place) + exe.run(startup) + a = np.arange(72).reshape(3, 2, 3, 4).astype('float') + b = np.arange(48).reshape(2, 2, 3, 4).astype('float') + c = np.arange(48).reshape(2, 3, 2, 4).astype('float') + d = np.arange(30).reshape(2, 3, 5).astype('float') + e = np.arange(12).reshape(2, 2, 3).astype('float') + feeds = {'a': a, 'b': b, 'c': c, 'd': d, 'e': e} + actual = exe.run(main, feed=feeds, fetch_list=[outs]) + expect = [] + expect.append(np.einsum("ibnd,jbnd->bnij", a, b)) + expect.append(np.einsum('...ik, ...j', c, d)) + expect.append(np.einsum('...kj, ...ik', d, e)) + expect.append(np.einsum('ijk..., ikj', c, e)) + expect.append(np.einsum('ijk..., ikj->...ij', c, e)) + for a, e in zip(actual, expect): + self.check_output_equal(a, e) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index d50241e58dea3..27dbd3752b550 100644 ---
a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -60,9 +60,9 @@ def init_dtype(self): pass -@unittest.skipIf( - not core.is_compiled_with_cuda() or core.cudnn_version() < 8100, - "core is not compiled with CUDA and cudnn version need larger than 8.1.0") +@unittest.skipIf(not core.is_compiled_with_cuda() or + not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and not support the bfloat16") class TestElementwiseDivOpBF16(OpTest): def setUp(self): self.op_type = "elementwise_div" diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py index 8f6f9851c7006..ccfed61185f0c 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py @@ -139,6 +139,9 @@ def test_grad(self): class TestElementwiseSubDoubleGradCheck(unittest.TestCase): + def subtract_wrapper(self, x): + return paddle.subtract(x[0], x[1]) + @prog_scope() def func(self, place): # the shape of input variable should be clearly specified, not inlcude -1. @@ -156,6 +159,11 @@ def func(self, place): gradient_checker.double_grad_check( [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.subtract_wrapper, [x, y], + out, + x_init=[x_arr, y_arr], + place=place) def test_grad(self): paddle.enable_static() @@ -195,6 +203,9 @@ def test_grad(self): class TestElementwiseDivDoubleGradCheck(unittest.TestCase): + def divide_wrapper(self, x): + return paddle.divide(x[0], x[1]) + @prog_scope() def func(self, place): # the shape of input variable should be clearly specified, not inlcude -1. 
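The wrapper methods added in these hunks follow the gradient checker's dygraph convention: double_grad_check_for_dygraph hands the wrapped callable its inputs as a single list, which is why the wrappers unpack x[0] and x[1] and replay the op in eager mode so its double gradient can be compared with the static-graph result. A minimal sketch of the same pattern for another binary op (multiply here is purely illustrative, not part of this diff):

    import unittest

    import paddle

    class TestElementwiseMulDoubleGradCheck(unittest.TestCase):  # hypothetical
        def multiply_wrapper(self, x):
            # x arrives as the list of input tensors built by the checker
            return paddle.multiply(x[0], x[1])

        # inside func() it would be registered exactly like subtract_wrapper:
        # gradient_checker.double_grad_check_for_dygraph(
        #     self.multiply_wrapper, [x, y], out, x_init=[x_arr, y_arr],
        #     place=place)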
@@ -213,6 +224,12 @@ def func(self, place): gradient_checker.double_grad_check( [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps, atol=1e-3) + gradient_checker.double_grad_check_for_dygraph( + self.divide_wrapper, [x, y], + out, + x_init=[x_arr, y_arr], + place=place, + atol=1e-3) def test_grad(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index 18620f55367f6..d200b77eea83f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -919,7 +919,7 @@ def train(layer, loader, loss_fn, opt): # load_inference_model paddle.enable_static() - exe = paddle.static.Executor(paddle.CPUPlace()) + exe = paddle.static.Executor() [inference_program, feed_target_names, fetch_targets] = ( paddle.static.load_inference_model(path, exe)) tensor_img = x @@ -927,8 +927,8 @@ def train(layer, loader, loss_fn, opt): feed={feed_target_names[0]: tensor_img}, fetch_list=fetch_targets) print("pred.numpy()", pred.numpy()) - print("results", results) - self.assertTrue(np.allclose(pred.numpy(), results, atol=1.e-5)) + print("result", results[0]) + self.assertTrue(np.array_equal(pred.numpy(), results[0])) paddle.disable_static() def test_inference_save_load(self): @@ -1254,18 +1254,17 @@ def train(self, enable_amp=True, amp_level='O1'): def test_bf16(self): def func_isinstance(): - if fluid.core.is_compiled_with_cuda(): - cudnn_version = paddle.device.get_cudnn_version() - if cudnn_version is not None and cudnn_version >= 8100: - out_fp32 = self.train(enable_amp=False) - out_bf16_O1 = self.train(enable_amp=True, amp_level='O1') - out_bf16_O2 = self.train(enable_amp=True, amp_level='O2') - self.assertTrue( - np.allclose( - out_fp32, out_bf16_O1, rtol=1.e-3, atol=1.e-1)) - self.assertTrue( - np.allclose( - out_fp32, out_bf16_O2, rtol=1.e-3, atol=1.e-1)) + if fluid.core.is_compiled_with_cuda( + ) and fluid.core.is_bfloat16_supported(paddle.CUDAPlace(0)): + out_fp32 = self.train(enable_amp=False) + out_bf16_O1 = self.train(enable_amp=True, amp_level='O1') + out_bf16_O2 = self.train(enable_amp=True, amp_level='O2') + self.assertTrue( + np.allclose( + out_fp32, out_bf16_O1, rtol=1.e-3, atol=1.e-1)) + self.assertTrue( + np.allclose( + out_fp32, out_bf16_O2, rtol=1.e-3, atol=1.e-1)) with _test_eager_guard(): func_isinstance() diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 3a9387082e680..52137b22a790c 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -1037,11 +1037,11 @@ def func_dirac(self): block = start_prog.global_block() self.assertEqual(len(block.ops), self.num_ops) self.assertEqual(block.ops[0].type, 'fill_constant') - self.assertEqual(block.ops[1].type, 'reshape') + self.assertEqual(block.ops[1].type, 'reshape2') self.assertEqual(block.ops[2].type, 'assign_value') self.assertEqual(block.ops[3].type, 'assign_value') self.assertEqual(block.ops[4].type, 'scatter') - self.assertEqual(block.ops[5].type, 'reshape') + self.assertEqual(block.ops[5].type, 'reshape2') exe = paddle.static.Executor() exe.run(start_prog) diff --git a/python/paddle/fluid/tests/unittests/test_label_smooth_functional.py b/python/paddle/fluid/tests/unittests/test_label_smooth_functional.py index 
83c8ced79b1e8..54f5e64fda4b6 100644 --- a/python/paddle/fluid/tests/unittests/test_label_smooth_functional.py +++ b/python/paddle/fluid/tests/unittests/test_label_smooth_functional.py @@ -19,8 +19,6 @@ import paddle.nn.functional as F import paddle.fluid.initializer as I import unittest -from paddle.fluid.framework import _enable_legacy_dygraph -_enable_legacy_dygraph() class LabelSmoothTestCase(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_layout_autotune.py b/python/paddle/fluid/tests/unittests/test_layout_autotune.py index c71ff4381028d..a1440f8587ab6 100644 --- a/python/paddle/fluid/tests/unittests/test_layout_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_layout_autotune.py @@ -16,6 +16,10 @@ import unittest import numpy import paddle.nn.functional as F +import tempfile +import warnings +import json +import os class SimpleNet(paddle.nn.Layer): @@ -41,10 +45,18 @@ def forward(self, image): class LayoutAutoTune(unittest.TestCase): def use_autoune(self): if paddle.is_compiled_with_cuda(): - paddle.fluid.core.enable_layout_autotune() + paddle.incubate.autotune.set_config( + config={"layout": { + "enable": True + }}) return paddle.fluid.core.use_layout_autotune() else: - paddle.fluid.core.disable_layout_autotune() + config = {"layout": {"enable": False}} + tfile = tempfile.NamedTemporaryFile(mode="w+", delete=False) + json.dump(config, tfile) + tfile.close() + paddle.incubate.autotune.set_config(tfile.name) + os.remove(tfile.name) return paddle.fluid.core.use_layout_autotune() def train(self, data_format): @@ -103,7 +115,6 @@ def test_transpose_op_transposer(self): def test_flatten_op_transposer(self): if not self.use_autoune(): return - paddle.fluid.core.enable_layout_autotune() conv = paddle.nn.Conv2D(3, 8, (3, 3)) flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2) data = paddle.rand([1, 3, 16, 14]) @@ -119,5 +130,20 @@ def test_flatten_op_transposer(self): self.assertEqual(out.shape, [1, 112, 12]) +class TestAutoTuneAPI(unittest.TestCase): + def test_set_config_warnings(self): + with warnings.catch_warnings(record=True) as w: + config = {"layout": {"enable": 1}} + # On linux, we can open the file again to read the content + # without closing the file, but on windows system, there is + # no permission to open it again without closing it. 
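+ # Hence the pattern used by all of these tests: create the file with
+ # delete=False, dump the json config, close the handle, let set_config
+ # reopen the file by its name, and finally os.remove it by hand.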
+ tfile = tempfile.NamedTemporaryFile(mode="w+", delete=False) + json.dump(config, tfile) + tfile.close() + paddle.incubate.autotune.set_config(tfile.name) + os.remove(tfile.name) + self.assertTrue(len(w) == 1) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lbfgs.py b/python/paddle/fluid/tests/unittests/test_lbfgs.py index 2cad4822b28b1..bb3818747601f 100644 --- a/python/paddle/fluid/tests/unittests/test_lbfgs.py +++ b/python/paddle/fluid/tests/unittests/test_lbfgs.py @@ -21,9 +21,6 @@ from paddle.incubate.optimizer.functional.lbfgs import minimize_lbfgs -from paddle.fluid.framework import _enable_legacy_dygraph -_enable_legacy_dygraph() - np.random.seed(123) diff --git a/python/paddle/fluid/tests/unittests/test_limit_by_capacity_op.py b/python/paddle/fluid/tests/unittests/test_limit_by_capacity_op.py index e5ec67d41f7ef..d273185ad185f 100644 --- a/python/paddle/fluid/tests/unittests/test_limit_by_capacity_op.py +++ b/python/paddle/fluid/tests/unittests/test_limit_by_capacity_op.py @@ -17,6 +17,7 @@ import numpy as np from paddle.distributed.models.moe import utils from paddle.fluid import core +from paddle.fluid.framework import _test_eager_guard def limit_by_capacity(expert_count, _capacity, n_worker): @@ -77,7 +78,7 @@ def test_static_api(self): assert all_close(self.out, res[0], self.n_worker) - def test_dygraph_api(self): + def func_dygraph_api(self): paddle.disable_static(self.place) capacity = paddle.to_tensor(self.capacity) expert_count_tensor = paddle.to_tensor(self.expert_count) @@ -85,6 +86,11 @@ def test_dygraph_api(self): self.n_worker) assert all_close(self.out, out.numpy(), self.n_worker) + def test_dygraph_api(self): + with _test_eager_guard(): + self.func_dygraph_api() + self.func_dygraph_api() + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index 492f300e3b848..3e06b69278d34 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -385,9 +385,9 @@ def test_check_grad(self): def create_test_bf16_class(parent, atol=0.01): @unittest.skipIf( - not core.is_compiled_with_cuda() or core.cudnn_version() < 8100, - "core is not compiled with CUDA and cudnn version need larger than 8.1.0" - ) + not core.is_compiled_with_cuda() or + not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and not support the bfloat16") class TestMatMulOpBf16Case(parent): def get_numeric_grad(self, place, check_name): scope = core.Scope() diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf.py b/python/paddle/fluid/tests/unittests/test_nan_inf.py index 9b11f6711afc1..84559048a2b8a 100644 --- a/python/paddle/fluid/tests/unittests/test_nan_inf.py +++ b/python/paddle/fluid/tests/unittests/test_nan_inf.py @@ -20,8 +20,6 @@ import sys import subprocess import paddle -from paddle.fluid.framework import _enable_legacy_dygraph -_enable_legacy_dygraph() paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py index 49fe397644dc6..1452b869d4f8b 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py @@ -43,6 +43,7 @@ def func(self, place): [x], z, x_init=x_arr, atol=atol, place=place, eps=eps) def 
test_grad(self): + paddle.enable_static() places = [fluid.CPUPlace()] if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) @@ -77,6 +78,14 @@ def init_test(self): self.data_layout = 'NCHW' self.use_global_stats = False self.shape = [2, 3, 4, 5] + self.channel_index = 1 + + def batch_norm_wrapper(self, x): + batch_norm = paddle.nn.BatchNorm2D( + self.shape[self.channel_index], + data_format=self.data_layout, + use_global_stats=self.use_global_stats) + return batch_norm(x[0]) @prog_scope() def func(self, place): @@ -94,8 +103,15 @@ def func(self, place): x_arr = np.random.uniform(-1, 1, self.shape).astype(dtype) gradient_checker.double_grad_check( [x], z, x_init=x_arr, atol=atol, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.batch_norm_wrapper, [x], + z, + x_init=x_arr, + atol=atol, + place=place) def test_grad(self): + paddle.enable_static() places = [fluid.CPUPlace()] if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) @@ -108,6 +124,7 @@ def init_test(self): self.data_layout = 'NHWC' self.use_global_stats = False self.shape = [2, 3, 4, 5] + self.channel_index = 3 class TestBatchNormDoubleGradCheckCase2(TestBatchNormDoubleGradCheck): @@ -115,6 +132,7 @@ def init_test(self): self.data_layout = 'NCHW' self.use_global_stats = True self.shape = [2, 3, 4, 5] + self.channel_index = 1 class TestBatchNormDoubleGradCheckCase3(TestBatchNormDoubleGradCheck): @@ -122,6 +140,7 @@ def init_test(self): self.data_layout = 'NHWC' self.use_global_stats = True self.shape = [2, 3, 4, 5] + self.channel_index = 3 class TestBatchNormDoubleGradCheckCase4(TestBatchNormDoubleGradCheck): @@ -129,6 +148,14 @@ def init_test(self): self.data_layout = 'NCHW' self.use_global_stats = False self.shape = [2, 2, 3, 4, 5] + self.channel_index = 1 + + def batch_norm_wrapper(self, x): + batch_norm = paddle.nn.BatchNorm3D( + self.shape[self.channel_index], + data_format=self.data_layout, + use_global_stats=self.use_global_stats) + return batch_norm(x[0]) class TestBatchNormDoubleGradCheckCase5(TestBatchNormDoubleGradCheck): @@ -165,8 +192,8 @@ def init_test(self): self.data_layout = 'NCHW' self.use_global_stats = True self.shape = [2, 3, 4, 5] + self.channel_index = 1 if __name__ == "__main__": - paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_number_count_op.py b/python/paddle/fluid/tests/unittests/test_number_count_op.py index 9eb89dfeb0e8d..bb09b8c6512f7 100644 --- a/python/paddle/fluid/tests/unittests/test_number_count_op.py +++ b/python/paddle/fluid/tests/unittests/test_number_count_op.py @@ -24,6 +24,7 @@ from paddle.fluid import compiler, Program, program_guard from paddle.fluid.backward import append_backward from paddle.distributed.models.moe import utils +from paddle.fluid.framework import _test_eager_guard def count(x, upper_num): @@ -68,12 +69,17 @@ def test_api_static(self): res = exe.run(feed={'x': self.x}, fetch_list=[out]) assert np.allclose(res, self.out) - def test_api_dygraph(self): + def func_api_dygraph(self): paddle.disable_static() x = paddle.to_tensor(self.x) out = utils._number_count(x, self.upper_num) assert np.allclose(out.numpy(), self.out) + def test_api_dygraph(self): + with _test_eager_guard(): + self.func_api_dygraph() + self.func_api_dygraph() + if __name__ == '__main__': paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_prune_gate_by_capacity_op.py b/python/paddle/fluid/tests/unittests/test_prune_gate_by_capacity_op.py index d9d110f45ff79..8a641a6b4faf9 100644 --- 
a/python/paddle/fluid/tests/unittests/test_prune_gate_by_capacity_op.py +++ b/python/paddle/fluid/tests/unittests/test_prune_gate_by_capacity_op.py @@ -17,6 +17,7 @@ import numpy as np from paddle.distributed.models.moe import utils from paddle.fluid import core +from paddle.fluid.framework import _test_eager_guard def count(x, upper_num): @@ -102,7 +103,7 @@ def test_static_api(self): fetch_list=out) assert_allclose(res[0], self.out, self.n_expert) - def test_dygraph_api(self): + def func_dygraph_api(self): paddle.disable_static(self.place) gate_idx_tensor = paddle.to_tensor(self.gate_idx) expert_count_tensor = paddle.to_tensor(self.expert_count) @@ -110,6 +111,11 @@ def test_dygraph_api(self): gate_idx_tensor, expert_count_tensor, self.n_expert, self.n_worker) assert_allclose(out.numpy(), self.out, self.n_expert) + def test_dygraph_api(self): + with _test_eager_guard(): + self.func_dygraph_api() + self.func_dygraph_api() + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") diff --git a/python/paddle/fluid/tests/unittests/test_random_routing_op.py b/python/paddle/fluid/tests/unittests/test_random_routing_op.py index dc8f6f5fcec15..e4bb7c5ca5fd8 100644 --- a/python/paddle/fluid/tests/unittests/test_random_routing_op.py +++ b/python/paddle/fluid/tests/unittests/test_random_routing_op.py @@ -24,6 +24,7 @@ from paddle.fluid import compiler, Program, program_guard from paddle.fluid.backward import append_backward from paddle.distributed.models.moe import utils +from paddle.fluid.framework import _test_eager_guard def random_routing(topk_idx, topk_value, prob, topk=2): @@ -55,7 +56,7 @@ def init(self): self.prob).astype(self.dtype) self.place = paddle.CUDAPlace(0) - def test_api_dygraph(self): + def func_api_dygraph(self): paddle.disable_static() x = paddle.to_tensor(self.x) value = paddle.to_tensor(self.topk_value) @@ -63,6 +64,11 @@ def test_api_dygraph(self): out = utils._random_routing(x, value, prob) assert np.allclose(out.numpy(), self.out) + def test_api_dygraph(self): + with _test_eager_guard(): + self.func_api_dygraph() + self.func_api_dygraph() + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") diff --git a/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py b/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py index a1a3849f7191b..8d65a4c4444d4 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py @@ -19,6 +19,7 @@ import paddle.fluid.core as core from paddle import _C_ops from paddle.fluid.framework import _test_eager_guard +import copy class TestMaxPool3DFunc(unittest.TestCase): @@ -44,23 +45,28 @@ def setUp(self): def test(self): with _test_eager_guard(): self.setUp() + self.dense_x.stop_gradient = False sparse_x = self.dense_x.to_sparse_coo(4) - out = paddle.sparse.functional.max_pool3d( + sparse_out = paddle.sparse.functional.max_pool3d( sparse_x, self.kernel_sizes, stride=self.strides, padding=self.paddings) - out = out.to_dense() + out = sparse_out.to_dense() + out.backward(out) + dense_x = copy.deepcopy(self.dense_x) dense_out = paddle.nn.functional.max_pool3d( - self.dense_x, + dense_x, self.kernel_sizes, stride=self.strides, padding=self.paddings, data_format='NDHWC') + dense_out.backward(dense_out) + #compare with dense - assert np.allclose(dense_out.flatten().numpy(), - out.flatten().numpy()) + assert np.allclose(dense_out.numpy(), out.numpy()) + assert np.allclose(dense_x.grad.numpy(), 
self.dense_x.grad.numpy()) class TestStride(TestMaxPool3DFunc): diff --git a/python/paddle/fluid/tests/unittests/test_switch_autotune.py b/python/paddle/fluid/tests/unittests/test_switch_autotune.py index 1775272aac69d..0049a922b9166 100644 --- a/python/paddle/fluid/tests/unittests/test_switch_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_switch_autotune.py @@ -15,6 +15,10 @@ import paddle import unittest import numpy as np +import tempfile +import warnings +import json +import os class SimpleNet(paddle.nn.Layer): @@ -73,10 +77,13 @@ def get_expected_res(self, step_id, enable_autotune): return expected_res def test_autotune(self): - paddle.fluid.core.disable_autotune() + paddle.incubate.autotune.set_config( + config={"kernel": { + "enable": False + }}) self.assertEqual(self.get_flags("FLAGS_use_autotune"), False) - paddle.fluid.core.enable_autotune() + paddle.incubate.autotune.set_config(config={"kernel": {"enable": True}}) self.assertEqual(self.get_flags("FLAGS_use_autotune"), True) def check_status(self, expected_res): @@ -93,10 +100,16 @@ class TestDygraphAutoTuneStatus(TestAutoTune): def run_program(self, enable_autotune): self.set_flags(enable_autotune) if enable_autotune: - paddle.fluid.core.enable_autotune() + paddle.incubate.autotune.set_config( + config={"kernel": { + "enable": True, + "tuning_range": [1, 2] + }}) else: - paddle.fluid.core.disable_autotune() - paddle.fluid.core.set_autotune_range(1, 2) + paddle.incubate.autotune.set_config( + config={"kernel": { + "enable": False + }}) x_var = paddle.uniform((1, 1, 8, 8), dtype='float32', min=-1., max=1.) net = SimpleNet() for i in range(3): @@ -141,10 +154,18 @@ def run_program(self, enable_autotune): self.set_flags(enable_autotune) if enable_autotune: - paddle.fluid.core.enable_autotune() + config = {"kernel": {"enable": True, "tuning_range": [1, 2]}} + tfile = tempfile.NamedTemporaryFile(mode="w+", delete=False) + json.dump(config, tfile) + tfile.close() + paddle.incubate.autotune.set_config(tfile.name) + os.remove(tfile.name) else: - paddle.fluid.core.disable_autotune() - paddle.fluid.core.set_autotune_range(1, 2) + paddle.incubate.autotune.set_config( + config={"kernel": { + "enable": False, + "tuning_range": [1, 2] + }}) for i in range(3): exe.run(program=main_program, feed={'X': x}, fetch_list=[loss]) @@ -166,5 +187,22 @@ def test_disable_autotune(self): self.func_disable_autotune() +class TestAutoTuneAPI(unittest.TestCase): + def test_set_config_warnings(self): + with warnings.catch_warnings(record=True) as w: + config = {"kernel": {"enable": 1, "tuning_range": 1}} + tfile = tempfile.NamedTemporaryFile(mode="w+", delete=False) + json.dump(config, tfile) + tfile.close() + paddle.incubate.autotune.set_config(tfile.name) + os.remove(tfile.name) + self.assertTrue(len(w) == 2) + + def test_set_config_attr(self): + paddle.incubate.autotune.set_config(config=None) + self.assertEqual( + paddle.get_flags("FLAGS_use_autotune")["FLAGS_use_autotune"], True) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 4f836d94b34eb..c1891d24b88c9 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -34,6 +34,10 @@ from paddle.fluid.dygraph.jit import _SaveLoadConfig from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX +try: + from collections.abc import Iterable +except: + from 
collections import Iterable __all__ = [] @@ -424,7 +428,7 @@ def _parse_every_object(obj, condition_func, convert_func): elif type(obj) == set: return set(_parse_every_object(list(obj), condition_func, convert_func)) else: - if isinstance(obj, collections.Iterable) and not isinstance( + if isinstance(obj, Iterable) and not isinstance( obj, (str, np.ndarray, core.VarBase, core.eager.Tensor, core.LoDTensor)): raise NotImplementedError( diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index d8cc322a66e27..ff7a167f1a670 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -29,6 +29,7 @@ from .tensor import segment_min from .passes import fuse_resnet_unit_pass import paddle.incubate.autograd +import paddle.incubate.autotune from . import nn #noqa: F401 diff --git a/python/paddle/incubate/autotune.py b/python/paddle/incubate/autotune.py new file mode 100644 index 0000000000000..e98a23bc52d65 --- /dev/null +++ b/python/paddle/incubate/autotune.py @@ -0,0 +1,157 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import json +import warnings +from paddle.fluid import core + +__all__ = ['set_config'] + + +def set_config(config=None): + r""" + Set the configuration for kernel, layout and dataloader auto-tuning. + + 1. kernel: When it is enabled, an exhaustive search method will be used to select + and cache the best algorithm for the operator in the tuning iteration. Tuning + parameters are as follows: + + - enable(bool): Whether to enable kernel tuning. + - tuning_range(list): Start and end iteration for auto-tuning. Default: [1, 10]. + + 2. layout: When it is enabled, the best data layout such as NCHW or NHWC will be + determined based on the device and data type. When the original layout setting is + not the best, layout transformation will be automatically performed to improve model + performance. Layout auto-tuning only supports dygraph mode currently. Tuning + parameters are as follows: + + - enable(bool): Whether to enable layout tuning. + + 3. dataloader: When it is enabled, the best num_workers will be selected to replace + the original dataloader setting. Tuning parameters are as follows: + + - enable(bool): Whether to enable dataloader tuning. + + Args: + config (dict|str|None, optional): Configuration for auto-tuning. If it is a + dictionary, the key is the tuning type, and the value is a dictionary + of the corresponding tuning parameters. If it is a string, it specifies + the path of a json file, and the tuning configuration will be read from + that file. Default: None, auto-tuning for kernel, layout and + dataloader will be enabled. + + Examples: + .. code-block:: python + :name: auto-tuning + + import paddle + import json + + # config is a dict.
+ config = { + "kernel": { + "enable": True, + "tuning_range": [1, 5], + }, + "layout": { + "enable": True, + }, + "dataloader": { + "enable": True, + } + } + paddle.incubate.autotune.set_config(config) + + # config is the path of json file. + config_json = json.dumps(config) + with open('config.json', 'w') as json_file: + json_file.write(config_json) + paddle.incubate.autotune.set_config('config.json') + + """ + if config is None: + core.enable_autotune() + core.enable_layout_autotune() + paddle.fluid.reader.set_autotune_config(use_autotune=True) + return + + config_dict = {} + if isinstance(config, dict): + config_dict = config + elif isinstance(config, str): + try: + with open(config, 'r') as filehandle: + config_dict = json.load(filehandle) + except Exception as e: + print('Load config error: {}'.format(e)) + warnings.warn("Use default configuration for auto-tuning.") + + if "kernel" in config_dict: + kernel_config = config_dict["kernel"] + if "enable" in kernel_config: + if isinstance(kernel_config['enable'], bool): + if kernel_config['enable']: + core.enable_autotune() + else: + core.disable_autotune() + else: + warnings.warn( + "The auto-tuning configuration of the kernel is incorrect." + "The `enable` should be bool. Use default parameter instead." + ) + if "tuning_range" in kernel_config: + if isinstance(kernel_config['tuning_range'], list): + tuning_range = kernel_config['tuning_range'] + assert len(tuning_range) == 2 + core.set_autotune_range(tuning_range[0], tuning_range[1]) + else: + warnings.warn( + "The auto-tuning configuration of the kernel is incorrect." + "The `tuning_range` should be list. Use default parameter instead." + ) + if "layout" in config_dict: + layout_config = config_dict["layout"] + if "enable" in layout_config: + if isinstance(layout_config['enable'], bool): + if layout_config['enable']: + core.enable_layout_autotune() + else: + core.disable_layout_autotune() + else: + warnings.warn( + "The auto-tuning configuration of the layout is incorrect." + "The `enable` should be bool. Use default parameter instead." + ) + if "dataloader" in config_dict: + dataloader_config = config_dict["dataloader"] + use_autoune = False + if "enable" in dataloader_config: + if isinstance(dataloader_config['enable'], bool): + use_autoune = dataloader_config['enable'] + else: + warnings.warn( + "The auto-tuning configuration of the dataloader is incorrect." + "The `enable` should be bool. Use default parameter instead." + ) + if "tuning_steps" in dataloader_config: + if isinstance(dataloader_config['tuning_steps'], int): + paddle.fluid.reader.set_autotune_config( + use_autoune, dataloader_config['tuning_steps']) + else: + warnings.warn( + "The auto-tuning configuration of the dataloader is incorrect." + "The `tuning_steps` should be int. Use default parameter instead." 
+ ) + paddle.fluid.reader.set_autotune_config(use_autoune) diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py index 74b5398230dee..4d40a477ffc07 100644 --- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py +++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py @@ -38,6 +38,7 @@ def __init__(self, is_grad_scaled_by_nranks=True, alignment=128, use_master_param_norm=True, + gradient_accumulation_steps=1, name=None): assert not framework._non_static_mode( ), "DistributedFusedLamb does not support dygraph mode" @@ -63,6 +64,9 @@ def __init__(self, self._scale = None self._ring_id = 0 self._use_master_param_norm = use_master_param_norm + self._gradient_accumulation_steps = gradient_accumulation_steps + assert self._gradient_accumulation_steps >= 1 + self.helper = LayerHelper('distributed_fused_lamb') self._supports_check_nan_inf = True # very import flag for AMP @@ -73,8 +77,19 @@ def __init__(self, dtype=core.VarDesc.VarType.BOOL) self._step = None + if self._gradient_accumulation_steps > 1: + self._stop_update = main_block.create_var( + name=unique_name.generate('stop_update'), + shape=[1], + dtype=core.VarDesc.VarType.BOOL) + else: + self._stop_update = None + self._param_to_master_param = {} + def _get_stop_update_var(self): + return self._stop_update if self._stop_update is not None else False + def _set_step(self, step): self._step = step @@ -194,6 +209,20 @@ def _apply_gradients_impl(self, params_grads): param_order = self._create_persistable_var('param_order', dtype='int32') param_order.is_distributed = True + if self._gradient_accumulation_steps > 1: + fp32_acc_fused_grad = [ + self._create_persistable_var('fp32_acc_fused_grad') + ] + fp16_acc_fused_grad = [ + self._create_persistable_var( + 'fp16_acc_fused_grad', dtype='float16') + ] + acc_step = [self._create_persistable_var('acc_step', dtype='int64')] + else: + fp32_acc_fused_grad = [] + fp16_acc_fused_grad = [] + acc_step = [] + step = self._get_or_create_step() rank = get_rank() @@ -298,6 +327,11 @@ def _apply_gradients_impl(self, params_grads): 'ParamOut': params, 'GradOut': grads, 'FoundInf': [self._found_inf], + 'FP32AccFusedGrad': fp32_acc_fused_grad, + 'FP16AccFusedGrad': fp16_acc_fused_grad, + 'AccStep': acc_step, + 'StopUpdate': self._stop_update + if self._stop_update is not None else [], 'Step': [step], }, attrs={ @@ -311,5 +345,6 @@ def _apply_gradients_impl(self, params_grads): 'ring_id': self._ring_id, 'use_master_param_norm': self._use_master_param_norm, 'is_grad_scaled_by_nranks': self._is_grad_scaled_by_nranks, + 'acc_steps': self._gradient_accumulation_steps, }) return [lamb_op] diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 34acbfbf75463..e64efda7b33bf 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -112,7 +112,10 @@ def elu(x, alpha=1.0, name=None): # [ 1. 
15.6 ]] """ - if in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_elu(x, alpha) + + if _in_legacy_dygraph(): return _C_ops.elu(x, 'alpha', alpha) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'elu') diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 907fd4e6252c6..fe37b8fb97c3d 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -1633,14 +1633,14 @@ def label_smooth(label, prior_dist=None, epsilon=0.1, name=None): #[[[0.03333334 0.93333334 0.03333334] # [0.93333334 0.03333334 0.93333334]]] """ + if epsilon > 1. or epsilon < 0.: + raise ValueError("The value of epsilon must be between 0 and 1.") + if in_dygraph_mode(): return _C_ops.final_state_label_smooth(label, prior_dist, float(epsilon)) - if epsilon > 1. or epsilon < 0.: - raise ValueError("The value of epsilon must be between 0 and 1.") - - if paddle.in_dynamic_mode(): + elif paddle.in_dynamic_mode(): return _C_ops.label_smooth(label, prior_dist, 'epsilon', float(epsilon)) check_variable_and_dtype(label, 'label', ['float32', 'float64'], diff --git a/python/paddle/nn/initializer/dirac.py b/python/paddle/nn/initializer/dirac.py index c7cb1052d2f78..9c84b01ecb9af 100644 --- a/python/paddle/nn/initializer/dirac.py +++ b/python/paddle/nn/initializer/dirac.py @@ -168,14 +168,22 @@ def __call__(self, var, block=None): idx_list.append(offset) if framework.in_dygraph_mode(): with fluid.dygraph.no_grad(): - tmp_out = _C_ops.reshape(out_var, 'shape', [-1]) + tmp_out, _ = _C_ops.reshape2(out_var, None, 'shape', [-1]) tmp_out._share_underline_tensor_to(out_var) else: + x_shape = block.create_var( + name=unique_name.generate(".".join([out_var.name, "XShape"])), + dtype=out_var.dtype, + shape=out_var.shape, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=True) block.append_op( - type="reshape", + type="reshape2", inputs={"X": out_var}, attrs={'shape': [-1]}, - outputs={"Out": out_var}, + outputs={"Out": out_var, + "XShape": x_shape}, stop_gradient=True) index_tensor = block.create_var( @@ -229,7 +237,8 @@ def __call__(self, var, block=None): tmp_out = _C_ops.final_state_scatter(out_var, index_tensor, value_tensor, True) tmp_out._share_underline_tensor_to(out_var) - tmp_reshape_out = _C_ops.reshape(out_var, 'shape', origin_shape) + tmp_reshape_out, _ = _C_ops.reshape2(out_var, None, 'shape', + origin_shape) tmp_reshape_out._share_underline_tensor_to(out_var) if var.dtype != VarDesc.VarType.FP32: tmp_cast_out = _C_ops.cast(out_var, 'in_dtype', @@ -248,11 +257,19 @@ def __call__(self, var, block=None): attrs={'overwrite': True}, outputs={"Out": out_var}, stop_gradient=True) + x_shape = block.create_var( + name=unique_name.generate(".".join([out_var.name, "XShape"])), + dtype=out_var.dtype, + shape=out_var.shape, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=True) block.append_op( - type="reshape", + type="reshape2", inputs={"X": out_var}, attrs={'shape': origin_shape}, - outputs={"Out": out_var}, + outputs={"Out": out_var, + "XShape": x_shape}, stop_gradient=True) if var.dtype != VarDesc.VarType.FP32: block.append_op( diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index b5daa290456e3..ae6e37a02751d 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -37,6 +37,10 @@ from paddle.framework import core from paddle.static import default_startup_program from paddle.static import program_guard +try: + from 
collections.abc import Sequence +except: + from collections import Sequence __all__ = [] @@ -197,7 +201,7 @@ def _is_shape_sequence(seq): # TODO: Add check for the illegal if isinstance(seq, dict): return True - return (isinstance(seq, collections.Sequence) and + return (isinstance(seq, Sequence) and not isinstance(seq, six.string_types)) class Shape(object): diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index dd11477532d24..713a611f9f39a 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -24,6 +24,10 @@ from paddle import _C_ops from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype from ..fluid.layer_helper import LayerHelper +from ..fluid.framework import _non_static_mode, in_dygraph_mode, _in_legacy_dygraph +import collections +import string +import opt_einsum from paddle.common_ops_import import dygraph_only @@ -664,7 +668,138 @@ def plan_einsum(operands, g_view, g_shape, g_supports, g_count, n_bcast): return plan +def preprocess(equation, *operands): + """ + check equation / raise error, default right labels generation + """ + equation = equation.replace(" ", "") + nop = len(operands) + assert nop > 0, "Required at least one operand in Einsum API, but received %s " % nop + + # Part the equation to left hand side and right hand side + lhs, *rhs = equation.lower().split('->') + assert len(rhs) < 2, "Invalid equation: multiple `->` were found." + + labels = parse_labels(lhs, operands) + # Note, we distinguish between 'ij->' and 'ij' by setting rhs to '' and None + rhs = rhs[0] if rhs else None + if rhs is None: + rhs = rhs_inference(lhs) + + assert len(lhs.split(',')) == len(operands), ( + f"Invalid equation: the number of operands is {len(operands)}, " + f"but found {len(lhs.split(','))} segments in the label equation.") + + assert not ('...' in lhs and '...' not in rhs + ), f'Invalid equation: missing ellipsis in output labels.' + + assert not (len(list(filter(has_duplicated_labels, lhs.split(',')))) > 0 + ), f'Duplicate labels are not supported.' + + assert not has_duplicated_labels( + rhs), f'Invalid equation: duplicate output labels are found.' + + return lhs, rhs, labels + + +def parse_fake_shape(equation, operands, labels): + """ + this shape is just used for operands planning. may differ with the original shape. + for example: + ... is replaced by 1 + -1 is replaced by 1 + Results + ------- + list of shape + """ + shaped = collections.namedtuple('shaped', ['shape']) + + def fake_shape(label, op): + assert len(op.shape) == len( + label + ), "length of shape and length of label must be the same, but received %d != %d" % ( + len(op.shape), len(label)) + fakes = [s for i, (l, s) in enumerate(zip(label, op.shape)) if l != '.'] + fakes = list(map(abs, fakes)) # make -1 -> 1 + if '.' in label: + fakes.insert(label.index('.'), 1) + return shaped(fakes) + + out = list(map(fake_shape, labels, operands)) + return out + + +def rhs_inference(lhs): + def is_free(key): + return cnt.get(key) == 1 and key not in ['.', ','] + + cnt = collections.Counter(lhs) + rhs = "..." if '...' in lhs else "" + rhs = rhs + "".join(filter(is_free, sorted(cnt.elements()))) + return rhs + + +def gen_equation_for_opteinsum(lhs, rhs): + """ + 1. gen rhs if rhs is None + 2. '...' 
-> 'A' + """ + + def get_used_label(counter): + used = set(counter.elements()) + for c in string.ascii_lowercase: + if c not in used: return c + raise ValueError( + "You have used all `a` - `z`; no unused letter can be found for einsum optimization" + ) + + cnt = collections.Counter(lhs) + broadcast_label = get_used_label(cnt) + if rhs is None: + rhs = rhs_inference(lhs) + lhs = lhs.replace("...", broadcast_label) + rhs = rhs.replace("...", broadcast_label) + return lhs + "->" + rhs, broadcast_label + + def einsum_v2(equation, *operands): + """ + einsum v2 implementation. + 1. Implement C++ EinsumOp. + 2. V2 creates the EinsumOp to do the calculation, so only a little verification work is done in Python. + 3. V2 uses opt_einsum.contract_path to optimize the multi-operand einsum. + """ + n_op = len(operands) + lhs, rhs, labels = preprocess(equation, *operands) + + if n_op <= 2: + return gen_einsum_op(lhs + '->' + rhs, *operands) + + shapes = parse_fake_shape(lhs, operands, labels) + opt_equation, broadcast_label = gen_equation_for_opteinsum(lhs, rhs) + _, cons = opt_einsum.contract_path(opt_equation, *shapes, einsum_call=True) + var_list = list(operands) + for path in cons: + (a, b), _, eq, *__ = path + assert a > b, "Assume the first var_idx is smaller than the second_idx. opt_einsum can guarantee it." + var_s = [var_list.pop(a), var_list.pop(b)] + eq = eq.replace(broadcast_label, "...") + var_list.append(gen_einsum_op(eq, *var_s)) + assert len( + var_list + ) == 1, "There must be exactly one element left in the list, but received %d." % len( + var_list) + return var_list[0] + + +def gen_einsum_op(equation, *operands): + """ + EinsumOp Python Interface: + """ + assert len(operands) <= 2, "Only support two operands in EinsumOp." + if in_dygraph_mode(): + return _C_ops.final_state_einsum(operands, equation) + if _in_legacy_dygraph(): # dygraph return _C_ops.einsum(operands, 'equation', equation) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index b7b08af9e60bc..83501b0399492 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -4260,18 +4260,19 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): ends_2 = [dim_len] attrs_2 += ('ends', ends_2) if in_dygraph_mode(): - input_back = input_front = _C_ops.final_state_slice(new_input, axes, starts_2, ends_2, infer_flags, + input_back = _C_ops.final_state_slice(new_input, axes, starts_2, ends_2, infer_flags, []) else: input_back = _C_ops.slice(new_input, None, None, None, None, 'axes', axes, \ 'infer_flags', infer_flags, *attrs_2) if x.dtype == paddle.bool: - op = getattr(_C_ops, "logical_xor") - out = op(input_back, input_front) + if in_dygraph_mode(): + return _C_ops.final_state_logical_xor(input_back, input_front) + else: + return _C_ops.logical_xor(input_back, input_front) else: - out = elementwise_sub(input_back, input_front, axis=axis) - return out + return elementwise_sub(input_back, input_front, axis=axis) else: check_variable_and_dtype(x, 'x', ['float32', 'float64', 'bool', 'int32', 'int64'], 'diff') diff --git a/python/paddle/tests/test_transforms.py b/python/paddle/tests/test_transforms.py index 974943a99d8b4..242680bc7c738 100644 --- a/python/paddle/tests/test_transforms.py +++ b/python/paddle/tests/test_transforms.py @@ -175,6 +175,12 @@ def test_random_crop(self): trans_random_crop_pad = transforms.RandomCrop((224, 256), 2, True) img = trans_random_crop_pad(img) + def test_erase(self): + trans = transforms.Compose([ + transforms.RandomErasing(), transforms.RandomErasing(value="random") + ]) + 
diff --git a/python/paddle/tests/test_transforms.py b/python/paddle/tests/test_transforms.py index 974943a99d8b4..242680bc7c738 100644 --- a/python/paddle/tests/test_transforms.py +++ b/python/paddle/tests/test_transforms.py @@ -175,6 +175,12 @@ def test_random_crop(self): trans_random_crop_pad = transforms.RandomCrop((224, 256), 2, True) img = trans_random_crop_pad(img) + def test_erase(self): + trans = transforms.Compose([ + transforms.RandomErasing(), transforms.RandomErasing(value="random") + ]) + self.do_transform(trans) + def test_grayscale(self): trans = transforms.Compose([transforms.Grayscale()]) self.do_transform(trans) @@ -299,6 +305,24 @@ def test_exception(self): with self.assertRaises(NotImplementedError): transform = transforms.BrightnessTransform('0.1', keys='a') + with self.assertRaises(Exception): + transform = transforms.RandomErasing(scale=0.5) + + with self.assertRaises(Exception): + transform = transforms.RandomErasing(ratio=0.8) + + with self.assertRaises(Exception): + transform = transforms.RandomErasing(scale=(10, 0.4)) + + with self.assertRaises(Exception): + transform = transforms.RandomErasing(ratio=(3.3, 0.3)) + + with self.assertRaises(Exception): + transform = transforms.RandomErasing(prob=1.5) + + with self.assertRaises(Exception): + transform = transforms.RandomErasing(value="0") + def test_info(self): str(transforms.Compose([transforms.Resize((224, 224))])) str(transforms.Compose([transforms.Resize((224, 224))])) @@ -355,6 +379,10 @@ def test_normalize(self): trans = transforms.Compose([normalize]) self.do_transform(trans) + def test_color_jitter(self): + trans = transforms.Compose([transforms.ColorJitter(1.1, 2.2, 0.8, 0.1)]) + self.do_transform(trans) + def test_pad(self): trans = transforms.Compose([transforms.Pad(2)]) self.do_transform(trans) @@ -398,6 +426,13 @@ def test_random_crop(self): trans_random_crop_pad = transforms.RandomCrop((224, 256), 2, True) img = trans_random_crop_pad(img) + def test_erase(self): + trans = transforms.Compose([ + transforms.RandomErasing(value=(0.5, )), + transforms.RandomErasing(value="random") + ]) + self.do_transform(trans) + def test_exception(self): trans = transforms.Compose([transforms.Resize(-1)]) @@ -562,6 +597,59 @@ def test_center_crop(self): tensor_cropped_img.numpy().transpose((1, 2, 0)), decimal=4) + def test_color_jitter_sub_function(self): + np.random.seed(555) + np_img = (np.random.rand(28, 28, 3) * 255).astype('uint8') + pil_img = Image.fromarray(np_img) + tensor_img = F.to_tensor(np_img) + np_img = pil_img + + np_img_gray = (np.random.rand(28, 28, 1) * 255).astype('uint8') + tensor_img_gray = F.to_tensor(np_img_gray) + + places = ['cpu'] + if paddle.device.is_compiled_with_cuda(): + places.append('gpu') + + def test_adjust_brightness(np_img, tensor_img): + result_cv2 = np.array(F.adjust_brightness(np_img, 1.2)) + result_tensor = F.adjust_brightness(tensor_img, 1.2).numpy() + result_tensor = np.transpose(result_tensor * 255, + (1, 2, 0)).astype('uint8') + np.testing.assert_equal(result_cv2, result_tensor) + + # For adjust_contrast / adjust_saturation / adjust_hue, the implementations differ + # between PIL and Tensor, so the results cannot match exactly. 
+ + def test_adjust_contrast(np_img, tensor_img): + result_pil = np.array(F.adjust_contrast(np_img, 0.36)) + result_tensor = F.adjust_contrast(tensor_img, 0.36).numpy() + result_tensor = np.transpose(result_tensor * 255, (1, 2, 0)) + diff = np.max(np.abs(result_tensor - result_pil)) + self.assertTrue(diff < 1.1) + + def test_adjust_saturation(np_img, tensor_img): + result_pil = np.array(F.adjust_saturation(np_img, 1.0)) + result_tensor = F.adjust_saturation(tensor_img, 1.0).numpy() + result_tensor = np.transpose(result_tensor * 255., (1, 2, 0)) + diff = np.max(np.abs(result_tensor - result_pil)) + self.assertTrue(diff < 1.1) + + def test_adjust_hue(np_img, tensor_img): + result_pil = np.array(F.adjust_hue(np_img, 0.45)) + result_tensor = F.adjust_hue(tensor_img, 0.45).numpy() + result_tensor = np.transpose(result_tensor * 255, (1, 2, 0)) + diff = np.max(np.abs(result_tensor - result_pil)) + self.assertTrue(diff <= 16.0) + + for place in places: + paddle.set_device(place) + + test_adjust_brightness(np_img, tensor_img) + test_adjust_contrast(np_img, tensor_img) + test_adjust_saturation(np_img, tensor_img) + test_adjust_hue(np_img, tensor_img) + def test_pad(self): np_img = (np.random.rand(28, 24, 3) * 255).astype('uint8') pil_img = Image.fromarray(np_img) @@ -637,6 +725,47 @@ def test_to_tensor(self): pil_img = Image.fromarray(np_img).convert('YCbCr') pil_tensor = F.to_tensor(pil_img) + def test_erase(self): + np_img = (np.random.rand(28, 28, 3) * 255).astype('uint8') + pil_img = Image.fromarray(np_img).convert('RGB') + + expected = np_img.copy() + expected[10:15, 10:15, :] = 0 + + F.erase(np_img, 10, 10, 5, 5, 0, inplace=True) + np.testing.assert_equal(np_img, expected) + + pil_result = F.erase(pil_img, 10, 10, 5, 5, 0) + np.testing.assert_equal(np.array(pil_result), expected) + + np_data = np.random.rand(3, 28, 28).astype('float32') + places = ['cpu'] + if paddle.device.is_compiled_with_cuda(): + places.append('gpu') + for place in places: + paddle.set_device(place) + tensor_img = paddle.to_tensor(np_data) + expected_tensor = tensor_img.clone() + expected_tensor[:, 10:15, 10:15] = paddle.to_tensor([0.88]) + + tensor_result = F.erase(tensor_img, 10, 10, 5, 5, + paddle.to_tensor([0.88])) + np.testing.assert_equal(tensor_result.numpy(), + expected_tensor.numpy()) + + def test_erase_backward(self): + img = paddle.randn((3, 14, 14), dtype=np.float32) + img.stop_gradient = False + erased = F.erase( + img, 3, 3, 5, 5, paddle.ones( + (1, 1, 1), dtype='float32')) + loss = erased.sum() + loss.backward() + + expected_grad = np.ones((3, 14, 14), dtype=np.float32) + expected_grad[:, 3:8, 3:8] = 0. 
+ np.testing.assert_equal(img.grad.numpy(), expected_grad) + def test_image_load(self): fake_img = Image.fromarray((np.random.random((32, 32, 3)) * 255).astype( 'uint8')) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index d401e7c5190fe..f078aae9bb6b1 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -466,6 +466,7 @@ func : DeformableConvInferMeta kernel : func : deformable_conv + data_type : x optional : mask backward : deformable_conv_grad @@ -546,6 +547,7 @@ func : DropoutInferMeta kernel : func : dropout + data_type : x optional : seed_tensor backward : dropout_grad @@ -559,6 +561,16 @@ func : eigh backward : eigh_grad +- api : einsum + args : (Tensor[] x, str equation) + output : Tensor + infer_meta : + func : EinsumInferMeta + param : [x, equation] + kernel : + func : einsum + backward : einsum_grad + - api : elementwise_pow args : (Tensor x, Tensor y) output : Tensor(out) @@ -1065,6 +1077,7 @@ func : LayerNormInferMeta kernel : func : layer_norm + data_type : x backward : layer_norm_grad optional : scale, bias @@ -1608,6 +1621,7 @@ func : PsroiPoolInferMeta kernel : func : psroi_pool + data_type : x optional : boxes_num backward : psroi_pool_grad @@ -1713,6 +1727,7 @@ func : RoiAlignInferMeta kernel : func : roi_align + data_type : x optional : boxes_num backward : roi_align_grad @@ -1723,6 +1738,7 @@ func : RoiPoolInferMeta kernel : func : roi_pool + data_type : x optional : boxes_num intermediate : arg_max backward : roi_pool_grad diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 3b47470139b90..e044447f87c22 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -152,6 +152,18 @@ kernel : func : atanh_grad +- backward_api : batch_norm_double_grad + forward : batch_norm_grad (Tensor x, Tensor scale, Tensor bias, Tensor out_mean, Tensor out_variance, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor grad_out, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(grad_x), Tensor(grad_scale), Tensor(grad_bias) + args : (Tensor x, Tensor scale, Tensor out_mean, Tensor out_variance, Tensor saved_mean, Tensor saved_variance, Tensor grad_out, Tensor grad_x_grad, Tensor grad_scale_grad, Tensor grad_bias_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + output : Tensor(x_grad), Tensor(scale_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [x, scale, x] + kernel : + func : batch_norm_grad_grad + data_type : x + optional : out_mean, out_variance + - backward_api : batch_norm_grad forward : batch_norm (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) args : (Tensor x, Tensor scale, Tensor bias, Tensor mean_out, Tensor variance_out, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor out_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) @@ -163,6 +175,7 @@ func : batch_norm_grad 
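The `*_double_grad` and `*_triple_grad` entries introduced in backward.yaml (batch_norm_double_grad above, and divide, elu, subtract and tanh below) register higher-order kernels with the code generator. As a rough sketch of what these entries serve at the Python level, here is a standard dygraph double-backward pattern (not code from this patch; tanh is used only as an example):

```python
import paddle

x = paddle.randn([3])
x.stop_gradient = False
y = paddle.tanh(x)

# first-order gradient, keeping the graph so it can be differentiated again
dx = paddle.grad(y.sum(), x, create_graph=True)[0]

# second-order gradient; entries such as tanh_double_grad back this call
ddx = paddle.grad(dx.sum(), x)[0]
print(ddx.shape)  # [3]
```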
data_type : out_grad optional : mean_out, variance_out, reserve_space + backward : batch_norm_double_grad - backward_api : bce_loss_grad forward : bce_loss (Tensor input, Tensor label) -> Tensor(out) @@ -362,6 +375,7 @@ func : DeformableConvGradInferMeta kernel : func : deformable_conv_grad + data_type : x optional : mask - backward_api : depthwise_conv2d_transpose_grad @@ -414,6 +428,18 @@ kernel : func : dist_grad +- backward_api : divide_double_grad + forward : divide_grad (Tensor x, Tensor y, Tensor out, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) + args : (Tensor y, Tensor out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) + output : Tensor(y_grad), Tensor(out_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [y, grad_x, grad_x] + kernel : + func : divide_double_grad + data_type : out + optional : grad_x_grad, grad_y_grad + - backward_api : divide_grad forward : divide (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, int axis = -1) @@ -423,6 +449,7 @@ param : [x, y] kernel : func : divide_grad + backward : divide_double_grad - backward_api : dropout_grad forward : dropout (Tensor x, Tensor seed_tensor, float p, bool is_test, str mode, int seed, bool fix_seed) -> Tensor(out), Tensor(mask) @@ -444,6 +471,19 @@ param : [out_v] kernel : func : eigh_grad + data_type : out_v + data_transform: + skip_transform : out_w, out_w_grad + +- backward_api : einsum_grad + forward : einsum (Tensor[] x, str equation) -> Tensor(out) + args : (Tensor[] x, Tensor out_grad, str equation) + output : Tensor[](x_grad){x.size()} + infer_meta : + func : UnchangedMultiInferMeta + param : [x] + kernel : + func : einsum_grad - backward_api : elementwise_pow_grad forward : elementwise_pow(Tensor x, Tensor y) -> Tensor(out) @@ -455,6 +495,16 @@ kernel : func : elementwise_pow_grad +- backward_api : elu_double_grad + forward : elu_grad (Tensor x, Tensor out, Tensor grad_out, float alpha)-> Tensor(grad_x) + args : (Tensor x, Tensor grad_out, Tensor grad_x_grad, float alpha) + output : Tensor(x_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, x] + kernel : + func : elu_double_grad + - backward_api : elu_grad forward : elu (Tensor x, float alpha) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, float alpha) @@ -464,6 +514,7 @@ param : [x] kernel : func : elu_grad + backward : elu_double_grad - backward_api : erf_grad forward : erf (Tensor x) -> Tensor(out) @@ -633,6 +684,7 @@ param : [x] kernel : func : graph_send_recv_grad + data_type : out_grad optional: out, dst_count - backward_api : gumbel_softmax_grad @@ -1287,6 +1339,7 @@ param : [x] kernel : func : psroi_pool_grad + data_type : x optional : boxes_num # output is optional @@ -1381,6 +1434,7 @@ param : [x] kernel : func : roi_align_grad + data_type : boxes optional : boxes_num - backward_api : roi_pool_grad @@ -1392,6 +1446,7 @@ param : [x] kernel : func : roi_pool_grad + data_type : x optional : boxes_num - backward_api : roll_grad @@ -1498,7 +1553,7 @@ func : UnchangedInferMeta param : [x] kernel : - func : sigmoid_cross_entropy_with_logits_grad + func : sigmoid_cross_entropy_with_logits_grad - backward_api : sigmoid_double_grad forward : sigmoid_grad (Tensor out, Tensor fwd_grad_out) -> Tensor(grad_x) @@ -1654,6 +1709,18 @@ func : strided_slice_grad no_need_buffer : x +- backward_api : subtract_double_grad + forward : subtract_grad (Tensor x, Tensor y, Tensor grad_out, int 
axis = -1) -> Tensor(grad_x), Tensor(grad_y) + args : (Tensor y, Tensor grad_out, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) + output : Tensor(grad_out_grad) + infer_meta : + func : UnchangedInferMeta + param : [grad_out] + kernel : + func : subtract_double_grad + optional : grad_x_grad, grad_y_grad + no_need_buffer : y, grad_out + - backward_api : subtract_grad forward : subtract (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1) @@ -1664,6 +1731,7 @@ kernel : func : subtract_grad no_need_buffer : x, y + backward : subtract_double_grad - backward_api : sum_double_grad forward : sum_grad (Tensor x, Tensor grad_out, int64_t[] dims, bool keep_dim, bool reduce_all=false) -> Tensor(grad_x) @@ -1720,6 +1788,17 @@ kernel : func : tan_grad +- backward_api : tanh_double_grad + forward : tanh_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) + args : (Tensor out, Tensor grad_out, Tensor grad_x_grad) + output : Tensor(out_grad), Tensor(grad_out_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [out, out] + kernel : + func : tanh_double_grad + backward : tanh_triple_grad + - backward_api : tanh_grad forward : tanh (Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) @@ -1729,6 +1808,7 @@ param : [out] kernel : func : tanh_grad + backward : tanh_double_grad - backward_api : tanh_shrink_grad forward : tanh_shrink (Tensor x) -> Tensor(out) @@ -1740,6 +1820,16 @@ kernel : func : tanh_shrink_grad +- backward_api : tanh_triple_grad + forward : tanh_double_grad (Tensor out, Tensor grad_out_forward, Tensor grad_x_grad_forward) -> Tensor(grad_out_new), Tensor(grad_out_grad) + args : (Tensor out, Tensor grad_out_forward, Tensor grad_x_grad_forward, Tensor grad_out_new_grad, Tensor grad_out_grad_grad) + output : Tensor(out_grad), Tensor(grad_out_forward_grad), Tensor(grad_x_grad_forward_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [out, out, grad_x_grad_forward] + kernel : + func : tanh_triple_grad + - backward_api : thresholded_relu_grad forward : thresholded_relu (Tensor x, float threshold) -> Tensor(out) args : (Tensor x, Tensor out_grad, float threshold) diff --git a/python/paddle/vision/transforms/__init__.py b/python/paddle/vision/transforms/__init__.py index 413f09f78699e..b255e663e6876 100644 --- a/python/paddle/vision/transforms/__init__.py +++ b/python/paddle/vision/transforms/__init__.py @@ -31,6 +31,7 @@ from .transforms import RandomRotation # noqa: F401 from .transforms import Grayscale # noqa: F401 from .transforms import ToTensor # noqa: F401 +from .transforms import RandomErasing # noqa: F401 from .functional import to_tensor # noqa: F401 from .functional import hflip # noqa: F401 from .functional import vflip # noqa: F401 @@ -44,6 +45,7 @@ from .functional import adjust_contrast # noqa: F401 from .functional import adjust_hue # noqa: F401 from .functional import normalize # noqa: F401 +from .functional import erase # noqa: F401 __all__ = [ #noqa 'BaseTransform', @@ -65,6 +67,7 @@ 'RandomRotation', 'Grayscale', 'ToTensor', + 'RandomErasing', 'to_tensor', 'hflip', 'vflip', @@ -77,5 +80,6 @@ 'adjust_brightness', 'adjust_contrast', 'adjust_hue', - 'normalize' + 'normalize', + 'erase', ] diff --git a/python/paddle/vision/transforms/functional.py b/python/paddle/vision/transforms/functional.py index 8caab964bf87b..5a8c2cc09f884 100644 --- a/python/paddle/vision/transforms/functional.py +++ b/python/paddle/vision/transforms/functional.py @@ -370,13 +370,13 @@ def adjust_brightness(img, 
brightness_factor): """Adjusts brightness of an Image. Args: - img (PIL.Image|np.array): Image to be adjusted. + img (PIL.Image|np.array|paddle.Tensor): Image to be adjusted. brightness_factor (float): How much to adjust the brightness. Can be any non negative number. 0 gives a black image, 1 gives the original image while 2 increases the brightness by a factor of 2. Returns: - PIL.Image or np.array: Brightness adjusted image. + PIL.Image|np.array|paddle.Tensor: Brightness adjusted image. Examples: .. code-block:: python @@ -392,28 +392,31 @@ def adjust_brightness(img, brightness_factor): converted_img = F.adjust_brightness(fake_img, 0.4) print(converted_img.size) """ - if not (_is_pil_image(img) or _is_numpy_image(img)): + if not (_is_pil_image(img) or _is_numpy_image(img) or + _is_tensor_image(img)): raise TypeError( - 'img should be PIL Image or ndarray with dim=[2 or 3]. Got {}'. + 'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'. format(type(img))) if _is_pil_image(img): return F_pil.adjust_brightness(img, brightness_factor) - else: + elif _is_numpy_image(img): return F_cv2.adjust_brightness(img, brightness_factor) + else: + return F_t.adjust_brightness(img, brightness_factor) def adjust_contrast(img, contrast_factor): """Adjusts contrast of an Image. Args: - img (PIL.Image|np.array): Image to be adjusted. + img (PIL.Image|np.array|paddle.Tensor): Image to be adjusted. contrast_factor (float): How much to adjust the contrast. Can be any non negative number. 0 gives a solid gray image, 1 gives the original image while 2 increases the contrast by a factor of 2. Returns: - PIL.Image or np.array: Contrast adjusted image. + PIL.Image|np.array|paddle.Tensor: Contrast adjusted image. Examples: .. code-block:: python @@ -429,28 +432,31 @@ def adjust_contrast(img, contrast_factor): converted_img = F.adjust_contrast(fake_img, 0.4) print(converted_img.size) """ - if not (_is_pil_image(img) or _is_numpy_image(img)): + if not (_is_pil_image(img) or _is_numpy_image(img) or + _is_tensor_image(img)): raise TypeError( - 'img should be PIL Image or ndarray with dim=[2 or 3]. Got {}'. + 'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'. format(type(img))) if _is_pil_image(img): return F_pil.adjust_contrast(img, contrast_factor) - else: + elif _is_numpy_image(img): return F_cv2.adjust_contrast(img, contrast_factor) + else: + return F_t.adjust_contrast(img, contrast_factor) def adjust_saturation(img, saturation_factor): """Adjusts color saturation of an image. Args: - img (PIL.Image|np.array): Image to be adjusted. + img (PIL.Image|np.array|paddle.Tensor): Image to be adjusted. saturation_factor (float): How much to adjust the saturation. 0 will give a black and white image, 1 will give the original image while 2 will enhance the saturation by a factor of 2. Returns: - PIL.Image or np.array: Saturation adjusted image. + PIL.Image|np.array|paddle.Tensor: Saturation adjusted image. Examples: .. code-block:: python @@ -467,15 +473,18 @@ def adjust_saturation(img, saturation_factor): print(converted_img.size) """ - if not (_is_pil_image(img) or _is_numpy_image(img)): + if not (_is_pil_image(img) or _is_numpy_image(img) or + _is_tensor_image(img)): raise TypeError( - 'img should be PIL Image or ndarray with dim=[2 or 3]. Got {}'. + 'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'. 
format(type(img))) if _is_pil_image(img): return F_pil.adjust_saturation(img, saturation_factor) - else: + elif _is_numpy_image(img): return F_cv2.adjust_saturation(img, saturation_factor) + else: + return F_t.adjust_saturation(img, saturation_factor) def adjust_hue(img, hue_factor): @@ -489,7 +498,7 @@ def adjust_hue(img, hue_factor): interval `[-0.5, 0.5]`. Args: - img (PIL.Image|np.array): Image to be adjusted. + img (PIL.Image|np.array|paddle.Tensor): Image to be adjusted. hue_factor (float): How much to shift the hue channel. Should be in [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in HSV space in positive and negative direction respectively. @@ -497,7 +506,7 @@ with complementary colors while 0 gives the original image. Returns: - PIL.Image or np.array: Hue adjusted image. + PIL.Image|np.array|paddle.Tensor: Hue adjusted image. Examples: .. code-block:: python @@ -514,15 +523,18 @@ print(converted_img.size) """ - if not (_is_pil_image(img) or _is_numpy_image(img)): + if not (_is_pil_image(img) or _is_numpy_image(img) or + _is_tensor_image(img)): raise TypeError( - 'img should be PIL Image or ndarray with dim=[2 or 3]. Got {}'. + 'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'. format(type(img))) if _is_pil_image(img): return F_pil.adjust_hue(img, hue_factor) - else: + elif _is_numpy_image(img): return F_cv2.adjust_hue(img, hue_factor) + else: + return F_t.adjust_hue(img, hue_factor) def rotate(img, @@ -677,3 +689,39 @@ def normalize(img, mean, std, data_format='CHW', to_rgb=False): img = np.array(img).astype(np.float32) return F_cv2.normalize(img, mean, std, data_format, to_rgb) + + +def erase(img, i, j, h, w, v, inplace=False): + """Erase the pixels of a selected area in the input image with a given value. + + Args: + img (paddle.Tensor | np.array | PIL.Image): The input image. + For Tensor input, the shape should be (C, H, W). For np.array input, + the shape should be (H, W, C). + i (int): y coordinate of the top-left point of the erased region. + j (int): x coordinate of the top-left point of the erased region. + h (int): Height of the erased region. + w (int): Width of the erased region. + v (paddle.Tensor | np.array): value used to replace the pixels in the erased region. It + should be np.array when img is np.array or PIL.Image. + inplace (bool, optional): Whether this transform is inplace. Default: False. + + Returns: + paddle.Tensor | np.array | PIL.Image: Erased image. The type is the same as the input image. + + Examples: + .. 
code-block:: python + + import paddle + + fake_img = paddle.randn((3, 10, 10)).astype(paddle.float32) + values = paddle.zeros((1,1,1), dtype=paddle.float32) + result = paddle.vision.transforms.erase(fake_img, 4, 4, 3, 3, values) + + """ + if _is_tensor_image(img): + return F_t.erase(img, i, j, h, w, v, inplace=inplace) + elif _is_pil_image(img): + return F_pil.erase(img, i, j, h, w, v, inplace=inplace) + else: + return F_cv2.erase(img, i, j, h, w, v, inplace=inplace) diff --git a/python/paddle/vision/transforms/functional_cv2.py b/python/paddle/vision/transforms/functional_cv2.py index 38b50898be606..8343a8c340ffb 100644 --- a/python/paddle/vision/transforms/functional_cv2.py +++ b/python/paddle/vision/transforms/functional_cv2.py @@ -564,3 +564,26 @@ def normalize(img, mean, std, data_format='CHW', to_rgb=False): img = (img - mean) / std return img + + +def erase(img, i, j, h, w, v, inplace=False): + """Erase the pixels of a selected area in the input image array with a given value. + + Args: + img (np.array): The input image array, whose shape is (H, W, C). + i (int): y coordinate of the top-left point of the erased region. + j (int): x coordinate of the top-left point of the erased region. + h (int): Height of the erased region. + w (int): Width of the erased region. + v (np.array): value used to replace the pixels in the erased region. + inplace (bool, optional): Whether this transform is inplace. Default: False. + + Returns: + np.array: Erased image. + + """ + if not inplace: + img = img.copy() + + img[i:i + h, j:j + w, ...] = v + return img diff --git a/python/paddle/vision/transforms/functional_pil.py b/python/paddle/vision/transforms/functional_pil.py index b3ff37d7ea3bb..71f7759f11b66 100644 --- a/python/paddle/vision/transforms/functional_pil.py +++ b/python/paddle/vision/transforms/functional_pil.py @@ -32,14 +32,25 @@ Sequence = collections.abc.Sequence Iterable = collections.abc.Iterable -_pil_interp_from_str = { - 'nearest': Image.NEAREST, - 'bilinear': Image.BILINEAR, - 'bicubic': Image.BICUBIC, - 'box': Image.BOX, - 'lanczos': Image.LANCZOS, - 'hamming': Image.HAMMING -} +try: + # PIL version >= "9.1.0" + _pil_interp_from_str = { + 'nearest': Image.Resampling.NEAREST, + 'bilinear': Image.Resampling.BILINEAR, + 'bicubic': Image.Resampling.BICUBIC, + 'box': Image.Resampling.BOX, + 'lanczos': Image.Resampling.LANCZOS, + 'hamming': Image.Resampling.HAMMING + } +except AttributeError: + _pil_interp_from_str = { + 'nearest': Image.NEAREST, + 'bilinear': Image.BILINEAR, + 'bicubic': Image.BICUBIC, + 'box': Image.BOX, + 'lanczos': Image.LANCZOS, + 'hamming': Image.HAMMING + } __all__ = [] @@ -469,3 +480,26 @@ def to_grayscale(img, num_output_channels=1): raise ValueError('num_output_channels should be either 1 or 3') return img + + +def erase(img, i, j, h, w, v, inplace=False): + """Erase the pixels of a selected area in the input image with a given value. The PIL + format does not support in-place modification. + + Args: + img (PIL.Image): The input image; after conversion to np.array its shape is (H, W, C). + i (int): y coordinate of the top-left point of the erased region. + j (int): x coordinate of the top-left point of the erased region. + h (int): Height of the erased region. + w (int): Width of the erased region. + v (np.array): value used to replace the pixels in the erased region. + inplace (bool, optional): Whether this transform is inplace. Default: False. + + Returns: + PIL.Image: Erased image. + + """ + np_img = np.array(img, dtype=np.uint8) + np_img[i:i + h, j:j + w, ...] = v + img = Image.fromarray(np_img, 'RGB') + return img
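The try/except above guards against Pillow 9.1.0 moving the resampling constants under `Image.Resampling`. For reference, an equivalent one-line probe (an alternative pattern, not what the patch uses) would be:

```python
from PIL import Image

# on Pillow >= 9.1 this resolves to the Image.Resampling enum;
# on older versions it falls back to the Image module itself
_resampling = getattr(Image, "Resampling", Image)
print(_resampling.BILINEAR)
```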
diff --git a/python/paddle/vision/transforms/functional_tensor.py b/python/paddle/vision/transforms/functional_tensor.py index 5e5cf465425ed..2e276883cd376 100644 --- a/python/paddle/vision/transforms/functional_tensor.py +++ b/python/paddle/vision/transforms/functional_tensor.py @@ -86,6 +86,68 @@ def _get_image_size(img, data_format): _get_image_h_axis(data_format)] + +def _rgb_to_hsv(img): + """Convert an image Tensor from RGB to HSV. This implementation is based on Pillow ( + https://github.com/python-pillow/Pillow/blob/main/src/libImaging/Convert.c) + """ + maxc = img.max(axis=-3) + minc = img.min(axis=-3) + + is_equal = paddle.equal(maxc, minc) + one_divisor = paddle.ones_like(maxc) + c_delta = maxc - minc + # s is 0 when maxc == minc; set the divisor to 1 to avoid division by zero. + s = c_delta / paddle.where(is_equal, one_divisor, maxc) + + r, g, b = img.unbind(axis=-3) + c_delta_divisor = paddle.where(is_equal, one_divisor, c_delta) + # when maxc == minc, r == g == b holds; set the divisor to 1 to avoid division by zero. + rc = (maxc - r) / c_delta_divisor + gc = (maxc - g) / c_delta_divisor + bc = (maxc - b) / c_delta_divisor + + hr = (maxc == r).astype(maxc.dtype) * (bc - gc) + hg = ((maxc == g) & (maxc != r)).astype(maxc.dtype) * (rc - bc + 2.0) + hb = ((maxc != r) & (maxc != g)).astype(maxc.dtype) * (gc - rc + 4.0) + h = (hr + hg + hb) / 6.0 + 1.0 + h = h - h.trunc() + return paddle.stack([h, s, maxc], axis=-3) + + +def _hsv_to_rgb(img): + """Convert an image Tensor from HSV to RGB. + """ + h, s, v = img.unbind(axis=-3) + f = h * 6.0 + i = paddle.floor(f) + f = f - i + i = i.astype(paddle.int32) % 6 + + p = paddle.clip(v * (1.0 - s), 0.0, 1.0) + q = paddle.clip(v * (1.0 - s * f), 0.0, 1.0) + t = paddle.clip(v * (1.0 - s * (1.0 - f)), 0.0, 1.0) + + mask = paddle.equal( + i.unsqueeze(axis=-3), + paddle.arange( + 6, dtype=i.dtype).reshape((-1, 1, 1))).astype(img.dtype) + matrix = paddle.stack( + [ + paddle.stack( + [v, q, p, p, t, v], axis=-3), paddle.stack( + [t, v, v, q, p, p], axis=-3), paddle.stack( + [p, p, t, v, v, q], axis=-3) + ], + axis=-4) + return paddle.einsum("...ijk, ...xijk -> ...xjk", mask, matrix) + + +def _blend_images(img1, img2, ratio): + max_value = 1.0 if paddle.is_floating_point(img1) else 255.0 + return paddle.lerp(img2, img1, float(ratio)).clip( + 0, max_value).astype(img1.dtype) + + def normalize(img, mean, std, data_format='CHW'): """Normalizes a tensor image given mean and standard deviation. @@ -354,6 +416,30 @@ def crop(img, top, left, height, width, data_format='CHW'): return img[top:top + height, left:left + width, :] +def erase(img, i, j, h, w, v, inplace=False): + """Erase the pixels of a selected area in the input Tensor image with a given value. + + Args: + img (paddle.Tensor): The input Tensor image. + i (int): y coordinate of the top-left point of the erased region. + j (int): x coordinate of the top-left point of the erased region. + h (int): Height of the erased region. + w (int): Width of the erased region. + v (paddle.Tensor): value used to replace the pixels in the erased region. + inplace (bool, optional): Whether this transform is inplace. Default: False. + + Returns: + paddle.Tensor: Erased image. + + """ + _assert_image_tensor(img, 'CHW') + if not inplace: + img = img.clone() + + img[..., i:i + h, j:j + w] = v + return img + + def center_crop(img, output_size, data_format='CHW'): """Crops the given paddle.Tensor Image and resize it to desired size. 
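The tensor-path erase above clones the input unless `inplace=True`. A small usage sketch through the public functional API added by this patch (shapes are illustrative):

```python
import paddle
import paddle.vision.transforms.functional as F

img = paddle.rand([3, 8, 8])
v = paddle.zeros([1, 1, 1], dtype='float32')

out = F.erase(img, 2, 2, 3, 3, v)          # img itself is left untouched
F.erase(img, 2, 2, 3, 3, v, inplace=True)  # writes into img directly
```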
@@ -514,3 +600,127 @@ def resize(img, size, interpolation='bilinear', data_format='CHW'): data_format='N' + data_format.upper()) return img.squeeze(0) + + +def adjust_brightness(img, brightness_factor): + """Adjusts brightness of an image. + + Args: + img (paddle.Tensor): Image to be adjusted. + brightness_factor (float): How much to adjust the brightness. Can be + any non-negative number. 0 gives a black image, 1 gives the + original image while 2 increases the brightness by a factor of 2. + + Returns: + paddle.Tensor: Brightness adjusted image. + + """ + _assert_image_tensor(img, 'CHW') + assert brightness_factor >= 0, "brightness_factor should be non-negative." + assert _get_image_num_channels( + img, 'CHW') in [1, 3], "channels of input should be either 1 or 3." + + extreme_target = paddle.zeros_like(img, img.dtype) + return _blend_images(img, extreme_target, brightness_factor) + + +def adjust_contrast(img, contrast_factor): + """Adjusts contrast of an image. + + Args: + img (paddle.Tensor): Image to be adjusted. + contrast_factor (float): How much to adjust the contrast. Can be any + non-negative number. 0 gives a solid gray image, 1 gives the + original image while 2 increases the contrast by a factor of 2. + + Returns: + paddle.Tensor: Contrast adjusted image. + + """ + _assert_image_tensor(img, 'CHW') + assert contrast_factor >= 0, "contrast_factor should be non-negative." + + channels = _get_image_num_channels(img, 'CHW') + dtype = img.dtype if paddle.is_floating_point(img) else paddle.float32 + if channels == 1: + extreme_target = paddle.mean( + img.astype(dtype), axis=(-3, -2, -1), keepdim=True) + elif channels == 3: + extreme_target = paddle.mean( + to_grayscale(img).astype(dtype), axis=(-3, -2, -1), keepdim=True) + else: + raise ValueError("channels of input should be either 1 or 3.") + + return _blend_images(img, extreme_target, contrast_factor) + + +def adjust_saturation(img, saturation_factor): + """Adjusts color saturation of an image. + + Args: + img (paddle.Tensor): Image to be adjusted. + saturation_factor (float): How much to adjust the saturation. 0 will + give a black and white image, 1 will give the original image while + 2 will enhance the saturation by a factor of 2. + + Returns: + paddle.Tensor: Saturation adjusted image. + + """ + _assert_image_tensor(img, 'CHW') + assert saturation_factor >= 0, "saturation_factor should be non-negative." + channels = _get_image_num_channels(img, 'CHW') + if channels == 1: + return img + elif channels == 3: + extreme_target = to_grayscale(img) + else: + raise ValueError("channels of input should be either 1 or 3.") + + return _blend_images(img, extreme_target, saturation_factor) + +
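`_blend_images` reduces brightness, contrast and saturation to a single lerp toward an "extreme" target; for brightness that target is a black image. A quick numerical check of this equivalence (illustrative values, assuming a float CHW tensor in [0, 1]):

```python
import paddle
import paddle.vision.transforms.functional as F

img = paddle.rand([3, 4, 4])

bright = F.adjust_brightness(img, 1.5)

# the same blend by hand: lerp from a black image toward img with ratio 1.5
manual = paddle.lerp(paddle.zeros_like(img), img, 1.5).clip(0.0, 1.0)
assert paddle.allclose(bright, manual)
```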
+def adjust_hue(img, hue_factor): + """Adjusts hue of an image. + + The image hue is adjusted by converting the image to HSV and + cyclically shifting the intensities in the hue channel (H). + The image is then converted back to the original image mode. + + `hue_factor` is the amount of shift in H channel and must be in the + interval `[-0.5, 0.5]`. + + Args: + img (paddle.Tensor): Image to be adjusted. + hue_factor (float): How much to shift the hue channel. Should be in + [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in + HSV space in positive and negative direction respectively. + 0 means no shift. Therefore, both -0.5 and 0.5 will give an image + with complementary colors while 0 gives the original image. + + Returns: + paddle.Tensor: Hue adjusted image. + + """ + _assert_image_tensor(img, 'CHW') + assert -0.5 <= hue_factor <= 0.5, "hue_factor should be in range [-0.5, 0.5]" + channels = _get_image_num_channels(img, 'CHW') + if channels == 1: + return img + elif channels == 3: + dtype = img.dtype + if dtype == paddle.uint8: + img = img.astype(paddle.float32) / 255.0 + + img_hsv = _rgb_to_hsv(img) + h, s, v = img_hsv.unbind(axis=-3) + h = (h + hue_factor) + h = h - h.floor() + img_adjusted = _hsv_to_rgb(paddle.stack([h, s, v], axis=-3)) + + if dtype == paddle.uint8: + img_adjusted = (img_adjusted * 255.0).astype(dtype) + else: + raise ValueError("channels of input should be either 1 or 3.") + + return img_adjusted diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index a22f8a2ab4049..828a0d9b0936d 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -25,6 +25,7 @@ import warnings import traceback +import paddle from paddle.utils import try_import from . import functional as F @@ -1342,3 +1343,143 @@ def _apply_image(self, img): PIL Image: Randomly grayscaled image. """ return F.to_grayscale(img, self.num_output_channels) + + +class RandomErasing(BaseTransform): + """Erase the pixels in a randomly selected rectangular region. + + Args: + prob (float, optional): Probability of the input data being erased. Default: 0.5. + scale (sequence, optional): The proportional range of the erased area to the input image. + Default: (0.02, 0.33). + ratio (sequence, optional): Aspect ratio range of the erased area. Default: (0.3, 3.3). + value (int|float|sequence|str, optional): The value each pixel in the erased area will be replaced with. + If value is a single number, all pixels will be erased with this value. + If value is a sequence with length 3, the R, G, B channels will be erased + respectively. If value is set to "random", each pixel will be erased with + random values. Default: 0. + inplace (bool, optional): Whether this transform is inplace. Default: False. + keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. + + Shape: + - img(paddle.Tensor | np.array | PIL.Image): The input image. For Tensor input, the shape should be (C, H, W). + For np.array input, the shape should be (H, W, C). + - output(paddle.Tensor | np.array | PIL.Image): A randomly erased image. + + Returns: + A callable object of RandomErasing. + + Examples: + + .. code-block:: python + + import paddle + + fake_img = paddle.randn((3, 10, 10)).astype(paddle.float32) + transform = paddle.vision.transforms.RandomErasing() + result = transform(fake_img) + """
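The region geometry used by `_get_param` below samples an area fraction and an aspect ratio, then derives the box edges. In standalone NumPy terms (illustrative sizes):

```python
import numpy as np

h, w = 224, 224
scale, ratio = (0.02, 0.33), (0.3, 3.3)

# the area fraction is uniform; the aspect ratio is log-uniform, so that
# a ratio r and its reciprocal 1/r are equally likely
erase_area = np.random.uniform(*scale) * h * w
aspect = np.exp(np.random.uniform(*np.log(ratio)))

# edges chosen so that erase_h * erase_w ~= erase_area
# and erase_h / erase_w ~= aspect
erase_h = int(round(np.sqrt(erase_area * aspect)))
erase_w = int(round(np.sqrt(erase_area / aspect)))
```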
+ + def __init__(self, + prob=0.5, + scale=(0.02, 0.33), + ratio=(0.3, 3.3), + value=0, + inplace=False, + keys=None): + super(RandomErasing, self).__init__(keys) + assert isinstance(scale, + (tuple, list)), "scale should be a tuple or list" + assert (scale[0] >= 0 and scale[1] <= 1 and scale[0] <= scale[1] + ), "scale should be of kind (min, max) and in range [0, 1]" + assert isinstance(ratio, + (tuple, list)), "ratio should be a tuple or list" + assert (ratio[0] >= 0 and + ratio[0] <= ratio[1]), "ratio should be of kind (min, max)" + assert (prob >= 0 and + prob <= 1), "The probability should be in range [0, 1]" + assert isinstance( + value, (numbers.Number, str, tuple, + list)), "value should be a number, tuple, list or str" + if isinstance(value, str) and value != "random": + raise ValueError("value must be 'random' when type is str") + + self.prob = prob + self.scale = scale + self.ratio = ratio + self.value = value + self.inplace = inplace + + def _get_param(self, img, scale, ratio, value): + """Get parameters for ``erase`` for a random erasing. + + Args: + img (paddle.Tensor | np.array | PIL.Image): Image to be erased. + scale (sequence, optional): The proportional range of the erased area to the input image. + ratio (sequence, optional): Aspect ratio range of the erased area. + value (sequence | None): The value each pixel in the erased area will be replaced with. + If value is a sequence with length 3, the R, G, B channels will be erased + respectively. If value is None, each pixel will be erased with random values. + + Returns: + tuple: params (i, j, h, w, v) to be passed to ``erase`` for random erase. + """ + if F._is_pil_image(img): + shape = np.asarray(img).astype(np.uint8).shape + h, w, c = shape[-3], shape[-2], shape[-1] + elif F._is_numpy_image(img): + h, w, c = img.shape[-3], img.shape[-2], img.shape[-1] + elif F._is_tensor_image(img): + c, h, w = img.shape[-3], img.shape[-2], img.shape[-1] + + img_area = h * w + log_ratio = np.log(ratio) + for _ in range(10): + erase_area = np.random.uniform(*scale) * img_area + aspect_ratio = np.exp(np.random.uniform(*log_ratio)) + erase_h = int(round(np.sqrt(erase_area * aspect_ratio))) + erase_w = int(round(np.sqrt(erase_area / aspect_ratio))) + if erase_h >= h or erase_w >= w: + continue + if F._is_tensor_image(img): + if value is None: + v = paddle.normal( + shape=[c, erase_h, erase_w]).astype(img.dtype) + else: + v = paddle.to_tensor(value, dtype=img.dtype)[:, None, None] + else: + if value is None: + v = np.random.normal(size=[erase_h, erase_w, c]) * 255 + else: + v = np.array(value)[None, None, :] + top = np.random.randint(0, h - erase_h + 1) + left = np.random.randint(0, w - erase_w + 1) + + return top, left, erase_h, erase_w, v + + return 0, 0, h, w, img + + def _apply_image(self, img): + """ + Args: + img (paddle.Tensor | np.array | PIL.Image): Image to be erased. + + Returns: + output (paddle.Tensor | np.array | PIL.Image): A randomly erased image. + """ + + if random.random() < self.prob: + if isinstance(self.value, numbers.Number): + value = [self.value] + elif isinstance(self.value, str): + value = None + else: + value = self.value + if value is not None and not (len(value) == 1 or len(value) == 3): + raise ValueError( + "Value should be a single number or a sequence whose length equals the image's channel number." 
+ ) + top, left, erase_h, erase_w, v = self._get_param(img, self.scale, + self.ratio, value) + return F.erase(img, top, left, erase_h, erase_w, v, self.inplace) + return img diff --git a/python/requirements.txt b/python/requirements.txt index 5f2b788a81a0a..e7fc6cd651cb0 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -6,3 +6,4 @@ six decorator astor paddle_bfloat==0.1.2 +opt_einsum==3.3.0 diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index 45d4731ba1dba..630005bccbaf7 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -20,7 +20,7 @@ if [ -z ${BRANCH} ]; then fi PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../" && pwd )" -approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` +approval_line=`curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` failed_num=0 echo_list=() diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index b0800a9cd845e..b2d2e792c995b 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -71,7 +71,7 @@ API_FILES=("CMakeLists.txt" "paddle/fluid/eager/backward.h" ) -approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` +approval_line=`curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` git_files=`git diff --numstat upstream/$BRANCH| wc -l` git_count=`git diff --numstat upstream/$BRANCH| awk '{sum+=$1}END{print sum}'` failed_num=0 diff --git a/tools/check_ut.py b/tools/check_ut.py index f5fe4c687dd78..fa50f5cc81f13 100644 --- a/tools/check_ut.py +++ b/tools/check_ut.py @@ -24,7 +24,7 @@ class PRChecker(object): """ PR Checker. """ def __init__(self): - self.github = Github(os.getenv('GITHUB_API_TOKEN'), timeout=60) + self.github = Github(timeout=60) self.repo = None def check(self, filename, msg): diff --git a/tools/ci_op_benchmark.sh b/tools/ci_op_benchmark.sh index 8e84eccc083f2..878660cefaf21 100644 --- a/tools/ci_op_benchmark.sh +++ b/tools/ci_op_benchmark.sh @@ -307,7 +307,7 @@ function gpu_op_benchmark { # The PR will pass quickly when get approval from specific person. # Xreki 12538138, luotao1 6836917, ZzSean 32410583 set +x -approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000) +approval_line=$(curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000) if [ -n "${approval_line}" ]; then APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32410583 12538138 6836917) LOG "[INFO] current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" diff --git a/tools/coverage/cuda_clean.py b/tools/coverage/cuda_clean.py index 8c03edd078549..28142c869d04c 100644 --- a/tools/coverage/cuda_clean.py +++ b/tools/coverage/cuda_clean.py @@ -30,8 +30,7 @@ def get_pull(pull_id): Returns: github.PullRequest.PullRequest: The pull request. 
""" - token = os.getenv('GITHUB_API_TOKEN') - github = Github(token, timeout=60) + github = Github(timeout=60) repo = github.get_repo('PaddlePaddle/Paddle') pull = repo.get_pull(pull_id) diff --git a/tools/coverage/gcda_clean.py b/tools/coverage/gcda_clean.py index 12bd04a6907ea..33d9a8f6c78a3 100644 --- a/tools/coverage/gcda_clean.py +++ b/tools/coverage/gcda_clean.py @@ -32,8 +32,7 @@ def get_pull(pull_id): Returns: github.PullRequest.PullRequest """ - token = os.getenv('GITHUB_API_TOKEN') - github = Github(token, timeout=60) + github = Github(timeout=60) idx = 1 while idx < 4: try: diff --git a/tools/coverage/pull_request.py b/tools/coverage/pull_request.py index f3e88286ca965..20399f1c2e630 100644 --- a/tools/coverage/pull_request.py +++ b/tools/coverage/pull_request.py @@ -24,8 +24,6 @@ from github import Github -token = os.getenv('GITHUB_API_TOKEN') - def get_pull(pull_id): """ @@ -35,7 +33,7 @@ def get_pull(pull_id): Returns: github.PullRequest.PullRequest """ - github = Github(token, timeout=60) + github = Github(timeout=60) repo = github.get_repo('PaddlePaddle/Paddle') pull = repo.get_pull(pull_id) diff --git a/tools/dockerfile/Dockerfile.ipu b/tools/dockerfile/Dockerfile.ipu index 715bd34b908be..08536ae401fe1 100644 --- a/tools/dockerfile/Dockerfile.ipu +++ b/tools/dockerfile/Dockerfile.ipu @@ -1,10 +1,10 @@ # A image for building paddle binaries # build docker image -# docker build -t paddlepaddle/paddle:ipu-dev-2.3.0 -f tools/dockerfile/Dockerfile.ipu . +# docker build -t paddlepaddle/paddle:latest-dev-ipu -f tools/dockerfile/Dockerfile.ipu . # run a container -# docker run --ulimit memlock=-1:-1 --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/ --ipc=host --rm -it paddlepaddle/paddle:ipu-dev-2.3.0 bash +# docker run --ulimit memlock=-1:-1 --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/ --ipc=host --rm -it paddlepaddle/paddle:latest-dev-ipu bash FROM graphcore/poplar:2.3.0 MAINTAINER PaddlePaddle Authors diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index 6b90a656f0107..799f80f139c9c 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -35,7 +35,7 @@ class PRChecker(object): """ PR Checker. """ def __init__(self): - self.github = Github(os.getenv('GITHUB_API_TOKEN'), timeout=60) + self.github = Github(timeout=60) self.repo = self.github.get_repo('PaddlePaddle/Paddle') self.py_prog_oneline = re.compile('\d+\|\s*#.*') self.py_prog_multiline_a = re.compile('\d+\|\s*r?""".*?"""', re.DOTALL) diff --git a/tools/get_ut_mem_map.py b/tools/get_ut_mem_map.py new file mode 100644 index 0000000000000..daf80597d3ad0 --- /dev/null +++ b/tools/get_ut_mem_map.py @@ -0,0 +1,66 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import json +import sys + + +def get_ut_mem(rootPath): + case_dic = {} + for parent, dirs, files in os.walk(rootPath): + for f in files: + if f.endswith('$-gpu.log'): + continue + ut = f.replace('^', '').replace('$.log', '') + case_dic[ut] = {} + filename = os.path.join(parent, f) + with open(filename) as fi: + lines = fi.readlines() + mem_reserved1 = -1 + mem_nvidia1 = -1 + caseTime = -1 + for line in lines: + if '[Memory Usage (Byte)] gpu' in line: + mem_reserved = round( + float( + line.split('[max memory reserved] gpu')[1].split( + ':')[1].split('\\n')[0].strip()), 2) + if mem_reserved > mem_reserved1: + mem_reserved1 = mem_reserved + if 'MAX_GPU_MEMORY_USE=' in line: + mem_nvidia = round( + float( + line.split('MAX_GPU_MEMORY_USE=')[1].split('\\n')[0] + .strip()), 2) + if mem_nvidia > mem_nvidia1: + mem_nvidia1 = mem_nvidia + if 'Total Test time (real)' in line: + caseTime = float( + line.split('Total Test time (real) =')[1].split('sec')[ + 0].strip()) + if mem_reserved1 != -1: + case_dic[ut]['mem_reserved'] = mem_reserved1 + if mem_nvidia1 != -1: + case_dic[ut]['mem_nvidia'] = mem_nvidia1 + if caseTime != -1: + case_dic[ut]['time'] = caseTime + + ut_mem_map_file = "%s/pre_test/ut_mem_map.json" % rootPath + with open(ut_mem_map_file, "w") as f: + json.dump(case_dic, f) + + +if __name__ == "__main__": + rootPath = sys.argv[1] + get_ut_mem(rootPath) diff --git a/tools/test_ci_op_benchmark.sh b/tools/test_ci_op_benchmark.sh index bf70d8bc3a495..0b2fff045ff3c 100644 --- a/tools/test_ci_op_benchmark.sh +++ b/tools/test_ci_op_benchmark.sh @@ -319,7 +319,7 @@ function gpu_op_benchmark { # The PR will pass quickly when get approval from specific person. # Xreki 12538138, luotao1 6836917, ZzSean 32410583 set +x -approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000) +approval_line=$(curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000) if [ -n "${approval_line}" ]; then APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32410583 12538138 6836917) LOG "[INFO] current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" diff --git a/tools/test_runner.py b/tools/test_runner.py index 2d0c9c4a131c9..7ceed18634a87 100644 --- a/tools/test_runner.py +++ b/tools/test_runner.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import importlib +import paddle.fluid.core as core from six.moves import cStringIO sys.path.append(os.path.abspath(os.path.dirname(__file__))) @@ -28,6 +29,10 @@ def main(): sys.path.append(os.getcwd()) + if core.is_compiled_with_cuda() or core.is_compiled_with_rocm(): + if os.getenv('FLAGS_enable_gpu_memory_usage_log') is None: + os.environ['FLAGS_enable_gpu_memory_usage_log'] = 'true' + some_test_failed = False for module_name in sys.argv[1:]: flag_need_static_mode = False @@ -45,6 +50,7 @@ def main(): module = importlib.import_module(module_name) tests = test_loader.loadTestsFromModule(module) res = unittest.TextTestRunner(stream=buffer).run(tests) + if not res.wasSuccessful(): some_test_failed = True print( diff --git a/tools/windows/build_compile_environment.bat b/tools/windows/build_compile_environment.bat index e102552f87c2b..fb173442a3319 100644 --- a/tools/windows/build_compile_environment.bat +++ b/tools/windows/build_compile_environment.bat @@ -13,11 +13,11 @@ :: limitations under the License. 
:: :: =============================== -:: Build Paddle compile enviroment +:: Build Paddle compile environment :: =============================== :: Description: :: -:: Install compile enviroment for xly CI. +:: Install compile environment for xly CI. :: :: Include: :: 1. CMake 3.17.0 diff --git a/tools/windows/check_change_of_unittest.sh b/tools/windows/check_change_of_unittest.sh index 576f0e5d238ab..136e21e60415f 100644 --- a/tools/windows/check_change_of_unittest.sh +++ b/tools/windows/check_change_of_unittest.sh @@ -15,16 +15,15 @@ set -e set +x export PADDLE_ROOT="$(cd "$PWD/../" && pwd )" -GITHUB_API_TOKEN=$GITHUB_API_TOKEN GIT_PR_ID=$AGILE_PULL_ID BRANCH=$BRANCH -if [ "${GITHUB_API_TOKEN}" == "" ] || [ "${GIT_PR_ID}" == "" ];then +if [ "${GIT_PR_ID}" == "" ];then exit 0 fi unittest_spec_diff=$(cat $PADDLE_ROOT/deleted_ut | sed 's/^/ - /g') if [ "$unittest_spec_diff" != "" ]; then - approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` + approval_line=`curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 22165420 52485244 32428676 45041955` echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "FALSE" ]; then
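A recurring change across these CI scripts is dropping GITHUB_API_TOKEN and querying the GitHub API anonymously, both via curl and via PyGithub. In PyGithub terms the client construction becomes the following (a sketch; the pull request number is hypothetical), with the caveat that unauthenticated requests fall under GitHub's much lower anonymous rate limit:

```python
from github import Github

# anonymous client, matching the Github(timeout=60) calls in this patch
gh = Github(timeout=60)
repo = gh.get_repo('PaddlePaddle/Paddle')
pull = repo.get_pull(12345)  # hypothetical pull request id
print(pull.title)
```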