diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 75f4f19244494..4894d615c2a35 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -132,7 +132,11 @@ function(select_nvcc_arch_flags out_variable)
   elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
     set(cuda_arch_bin "75")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
-    set(cuda_arch_bin "80")
+    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.1) # CUDA 11.0
+      set(cuda_arch_bin "80")
+    elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.1+
+      set(cuda_arch_bin "80 86")
+    endif()
   elseif(${CUDA_ARCH_NAME} STREQUAL "All")
     set(cuda_arch_bin ${paddle_known_gpu_archs})
   elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
diff --git a/cmake/external/cinn.cmake b/cmake/external/cinn.cmake
index cd4e0157f2a32..004bf353d34e8 100644
--- a/cmake/external/cinn.cmake
+++ b/cmake/external/cinn.cmake
@@ -26,7 +26,7 @@ add_definitions(-w)
 ######################################
 include(ExternalProject)
 set(CINN_PREFIX_DIR ${THIRD_PARTY_PATH}/CINN)
-set(CINN_GIT_TAG 1fd85187b6c18da4dd51f22619d093ef08d61b01)
+set(CINN_GIT_TAG eedb801ca39bfc6b9621bc76c24a0bf98cb8404b)
 set(CINN_OPTIONAL_ARGS -DPY_VERSION=${PY_VERSION}
                        -DWITH_CUDA=${WITH_GPU}
                        -DWITH_CUDNN=${WITH_GPU}
@@ -85,4 +85,3 @@ add_library(cinn SHARED IMPORTED GLOBAL)
 set_target_properties(cinn PROPERTIES IMPORTED_LOCATION "${CINN_LIB_LOCATION}/${CINN_LIB_NAME}")
 include_directories(${CINN_INCLUDE_DIR})
 add_dependencies(cinn external_cinn)
-
diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake
index f1d206dd5e199..0031757467f37 100644
--- a/cmake/external/lite.cmake
+++ b/cmake/external/lite.cmake
@@ -50,7 +50,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
 
   if(NOT LITE_GIT_TAG)
-    set(LITE_GIT_TAG 4ab64daecc11fbf74fffdc6a4733f388472e7d5d)
+    set(LITE_GIT_TAG 81ef66554099800c143a0feff6e0a491b3b0d12e)
   endif()
 
   if(NOT CUDA_ARCH_NAME)
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 1b38f208716b3..af2be77d0a63d 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -248,15 +248,24 @@ copy(inference_lib_dist
 copy(inference_lib_dist
         SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/common/*.h
         DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common/)
+copy(inference_lib_dist
+        SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/core/macros.h
+        DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/core/)
+copy(inference_lib_dist
+        SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/core/visit_type.h
+        DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/core/)
 copy(inference_lib_dist
         SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/any.h
         DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
 copy(inference_lib_dist
         SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/optional.h
         DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
- copy(inference_lib_dist
+copy(inference_lib_dist
         SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/none.h
         DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
+copy(inference_lib_dist
+        SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/flat_hash_map.h
+        DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
 copy(inference_lib_dist
         SRCS ${PADDLE_SOURCE_DIR}/paddle/extension.h
         DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
diff --git a/cmake/neuware.cmake b/cmake/neuware.cmake
index 811c8d664a097..a371a0032d991 100644
--- a/cmake/neuware.cmake
+++ b/cmake/neuware.cmake
@@ -17,13 +17,16 @@ INCLUDE_DIRECTORIES(${NEUWARE_INCLUDE_DIR})
 set(CNNL_LIB ${NEUWARE_LIB_DIR}/libcnnl.so)
 set(CNRT_LIB ${NEUWARE_LIB_DIR}/libcnrt.so)
 set(CNDRV_LIB ${NEUWARE_LIB_DIR}/libcndrv.so)
+set(CNPAPI_LIB ${NEUWARE_LIB_DIR}/libcnpapi.so)
 
 generate_dummy_static_lib(LIB_NAME "neuware_lib" GENERATOR "neuware.cmake")
+set(NEUWARE_LIB_DEPS ${CNNL_LIB} ${CNRT_LIB} ${CNDRV_LIB} ${CNPAPI_LIB})
+
 if(WITH_CNCL)
   MESSAGE(STATUS "Compile with CNCL!")
   ADD_DEFINITIONS(-DPADDLE_WITH_CNCL)
   set(CNCL_LIB ${NEUWARE_LIB_DIR}/libcncl.so)
-  TARGET_LINK_LIBRARIES(neuware_lib ${CNCL_LIB} ${CNNL_LIB} ${CNRT_LIB} ${CNDRV_LIB})
-else()
-  TARGET_LINK_LIBRARIES(neuware_lib ${CNNL_LIB} ${CNRT_LIB} ${CNDRV_LIB})
+  list(APPEND NEUWARE_LIB_DEPS ${CNCL_LIB})
 endif()
+
+TARGET_LINK_LIBRARIES(neuware_lib ${NEUWARE_LIB_DEPS})
diff --git a/cmake/phi_header.cmake b/cmake/phi_header.cmake
index c9b7e465337dd..b23b4086b18f2 100644
--- a/cmake/phi_header.cmake
+++ b/cmake/phi_header.cmake
@@ -36,7 +36,8 @@ phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experiment
 phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext)
 phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/include)
 phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common)
+phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/core)
 
 # In order to be compatible with the original behavior, the header file name needs to be changed
 file(RENAME ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/extension.h
-            ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/ext_all.h)
\ No newline at end of file
+            ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/ext_all.h)
diff --git a/paddle/fluid/distributed/collective/ProcessGroup.cc b/paddle/fluid/distributed/collective/ProcessGroup.cc
index 6fec3a41e1047..e6d9975f75db6 100644
--- a/paddle/fluid/distributed/collective/ProcessGroup.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroup.cc
@@ -35,8 +35,9 @@ bool ProcessGroup::Task::Wait(std::chrono::milliseconds timeout) {
 
 void ProcessGroup::Task::Synchronize() {}
 
-ProcessGroup::ProcessGroup(int rank, int size, int gid)
-    : rank_(rank), size_(size), gid_(gid) {
+ProcessGroup::ProcessGroup(int rank, int size, const platform::Place& place,
+                           int gid)
+    : rank_(rank), size_(size), place_(place), gid_(gid) {
   if (gid != IGNORE_ID) {
     auto map = ProcessGroupMapFromGid::getInstance();
     map->insert(gid_, this);
diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h
index fbc9c1f476202..fca395c5f2bf7 100644
--- a/paddle/fluid/distributed/collective/ProcessGroup.h
+++ b/paddle/fluid/distributed/collective/ProcessGroup.h
@@ -69,7 +69,8 @@ class ProcessGroup {
     bool is_completed_ = false;
   };
 
-  explicit ProcessGroup(int rank, int size, int gid);
+  explicit ProcessGroup(int rank, int size, const platform::Place& place,
+                        int gid);
   virtual ~ProcessGroup() {}
 
   int GetRank() const { return rank_; }
@@ -145,6 +146,7 @@ class ProcessGroup {
  protected:
   const int rank_;
   const int size_;
+  const platform::Place place_;
   const int gid_;
 };
 
diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
index 6ddea74d95db6..824341c3cd97d 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
@@ -165,8 +165,9 @@ ProcessGroupGloo::GlooTask::GlooTask(
ProcessGroupGloo::ProcessGroupGloo( const std::shared_ptr& store, int rank, int world_size, - int gid, const std::shared_ptr options) - : ProcessGroup(rank, world_size, gid), + const platform::Place& place, int gid, + const std::shared_ptr options) + : ProcessGroup(rank, world_size, place, gid), _tag(0), _store(new GlooStore(store)) { _context = std::make_shared(rank, world_size); diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h index 335ca1bd17f2c..1eb8b47a09223 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -102,7 +102,8 @@ class ProcessGroupGloo : public ProcessGroup { explicit ProcessGroupGloo( const std::shared_ptr& store, int rank, - int world_size, int gid, std::shared_ptr options); + int world_size, const platform::Place& place, int gid, + std::shared_ptr options); ~ProcessGroupGloo() = default; diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc index 55ecdaaf6bfb7..9ed6c2198df4c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/distributed/collective/HCCLTools.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device/npu/hccl_helper.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/api/include/api.h" @@ -97,8 +98,11 @@ bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) { void ProcessGroupHCCL::HCCLTask::Synchronize() { Wait(kWaitTimeout); } ProcessGroupHCCL::ProcessGroupHCCL(const std::shared_ptr& store, - int rank, int size, int gid) - : ProcessGroup(rank, size, gid), store_(store) {} + int rank, int size, + const platform::Place& place, int gid) + : ProcessGroup(rank, size, place, gid), store_(store) { + platform::SetNPUDeviceId(place_.device); +} void ProcessGroupHCCL::BroadcastUniqueHCCLID( std::vector& hccl_ids) { // NOLINT diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h index f3d3fa2f8a72a..2f0ff6b9565ea 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h @@ -71,7 +71,7 @@ class ProcessGroupHCCL : public ProcessGroup { }; ProcessGroupHCCL(const std::shared_ptr& store, int rank, int size, - int gid); + const platform::Place& place, int gid); const std::string GetBackendName() const override { return std::string(HCCL_BACKEND_NAME); diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc index 354a8e23ae41f..ef57bb5ba232c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc @@ -44,13 +44,11 @@ bool ProcessGroupHeter::HeterTask::Wait(std::chrono::milliseconds timeout) { return true; } -ProcessGroupHeter::ProcessGroupHeter(const std::shared_ptr& store, - int rank, int size, int gid, - int local_rank, int local_size, - int gloo_rank, int gloo_size, - bool with_switch, - std::string switch_endpoint) - : ProcessGroup(rank, size, gid), +ProcessGroupHeter::ProcessGroupHeter( + const std::shared_ptr& store, int rank, int size, + const platform::Place& place, int gid, 
int local_rank, int local_size, + int gloo_rank, int gloo_size, bool with_switch, std::string switch_endpoint) + : ProcessGroup(rank, size, place, gid), store_(store), local_rank_(local_rank), local_size_(local_size), @@ -60,10 +58,10 @@ ProcessGroupHeter::ProcessGroupHeter(const std::shared_ptr& store, switch_endpoint_(switch_endpoint) { #if defined(PADDLE_WITH_NCCL) inner_pg_ = std::make_shared(store, local_rank, local_size, - IGNORE_ID); + place_, IGNORE_ID); #elif defined(PADDLE_WITH_ASCEND_CL) inner_pg_ = std::make_shared(store, local_rank, local_size, - IGNORE_ID); + place_, IGNORE_ID); #else PADDLE_THROW(platform::errors::Fatal( "ProcessGroupHeter only supports NCCL and HCCL now."); @@ -71,8 +69,8 @@ ProcessGroupHeter::ProcessGroupHeter(const std::shared_ptr& store, if (local_rank_ == 0 && !with_switch_) { auto opts = ProcessGroupGloo::GlooOptions::create(); opts->device = ProcessGroupGloo::createDefaultDevice(); - inter_pg_ = std::make_shared(store, gloo_rank_, - gloo_size_, IGNORE_ID, opts); + inter_pg_ = std::make_shared( + store, gloo_rank_, gloo_size_, place_, IGNORE_ID, opts); } } diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.h b/paddle/fluid/distributed/collective/ProcessGroupHeter.h index 05bacd93d7815..640acdfb6a23b 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.h +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.h @@ -81,9 +81,9 @@ class ProcessGroupHeter : public ProcessGroup { }; ProcessGroupHeter(const std::shared_ptr& store, int rank, int size, - int gid, int local_rank, int local_size, int gloo_rank, - int gloo_size, bool with_switch, - std::string switch_endpoints); + const platform::Place& place, int gid, int local_rank, + int local_size, int gloo_rank, int gloo_size, + bool with_switch, std::string switch_endpoints); const std::string GetBackendName() const override { return std::string(HETER_BACKEND_NAME); diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 30813b904df53..86cc5b5db7cd7 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" #include "paddle/fluid/distributed/collective/Common.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/api/include/api.h" @@ -103,8 +104,11 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { void ProcessGroupNCCL::NCCLTask::Synchronize() { Wait(kWaitTimeout); } ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr& store, - int rank, int size, int gid) - : ProcessGroup(rank, size, gid), store_(store) {} + int rank, int size, + const platform::Place& place, int gid) + : ProcessGroup(rank, size, place, gid), store_(store) { + platform::SetDeviceId(place_.device); +} void ProcessGroupNCCL::BroadcastUniqueNCCLID( std::vector& nccl_ids) { // NOLINT @@ -349,21 +353,8 @@ std::shared_ptr ProcessGroupNCCL::Broadcast( std::shared_ptr ProcessGroupNCCL::Barrier( const BarrierOptions& opts) { - std::vector places; - - if (!opts.place_ids.empty()) { - for (auto place_id : opts.place_ids) { - places.emplace_back(place_id); - } - } else if (!used_place_ids_.empty()) { - for (auto place_id : used_place_ids_) { - places.emplace_back(place_id); - } - } else { - auto numGPUs = GetSize(); - int place_id = 
static_cast(rank_ % numGPUs); - places.emplace_back(place_id); - } + // Only support single card single process + std::vector places = {place_}; std::vector barrierTensors; barrierTensors.reserve(places.size()); @@ -371,7 +362,7 @@ std::shared_ptr ProcessGroupNCCL::Barrier( platform::CUDADeviceGuard gpuGuard; for (auto& place : places) { gpuGuard.SetDeviceIndex(place.GetDeviceId()); - auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::GPUPlace()); + auto dt = full({1}, 0, phi::DataType::FLOAT32, place); barrierTensors.push_back( *std::dynamic_pointer_cast(dt.impl())); } diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index cca84285ef4de..4b6c3f4031354 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -77,7 +77,7 @@ class ProcessGroupNCCL : public ProcessGroup { }; ProcessGroupNCCL(const std::shared_ptr& store, int rank, int size, - int gid); + const platform::Place& place, int gid); const std::string GetBackendName() const override { return std::string(NCCL_BACKEND_NAME); diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 2d2a3b688fefe..53bae87c0020e 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -186,7 +186,13 @@ int64_t Carrier::GetRank(int64_t interceptor_id) const { } bool Carrier::Send(const InterceptorMessage& msg) { - int64_t src_id = (msg.src_id() == -1) ? msg.dst_id() : msg.src_id(); + int64_t src_id = msg.src_id(); + // TODO(liyurui): compatible solution, will be removed completely in the + // future + if (interceptor_id_to_rank_.find(src_id) == interceptor_id_to_rank_.end() && + src_id == SOURCE_ID) { + src_id = msg.dst_id(); + } int64_t dst_id = msg.dst_id(); int64_t src_rank = GetRank(src_id); int64_t dst_rank = GetRank(dst_id); diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index f49c84e6e5edc..fb907e3b5c29f 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -161,7 +161,7 @@ void ComputeInterceptor::ReplyCompletedToUpStream() { VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " Reply data_is_useless msg to " << up_id << " for step: " << step_; - if (up_id == -1) return; + if (is_source_ && up_id == -1) return; InterceptorMessage reply_msg; reply_msg.set_message_type(DATA_IS_USELESS); diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index c1408130b5e57..cacd55e02a5e2 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -168,7 +168,7 @@ bool DistModel::Init() { if (!PrepareFeedAndFetch()) { return false; } - if (!CommInit()) { + if (config_.nranks > 1 && !CommInit()) { return false; } if (!PrepareFleetExe()) { diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.h b/paddle/fluid/distributed/fleet_executor/interceptor.h index cb7ff2da89a9d..86ca7be7f44db 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor.h +++ b/paddle/fluid/distributed/fleet_executor/interceptor.h @@ -40,6 +40,9 @@ class TaskNode; class Carrier; class TaskLoop; +constexpr int64_t SOURCE_ID = -1; +constexpr int64_t SINK_ID = -2; + class 
Interceptor { public: using MsgHandle = std::function; diff --git a/paddle/fluid/distributed/fleet_executor/interceptor_message.proto b/paddle/fluid/distributed/fleet_executor/interceptor_message.proto index 7cf99e8741943..8508bc35f29be 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor_message.proto +++ b/paddle/fluid/distributed/fleet_executor/interceptor_message.proto @@ -27,8 +27,8 @@ enum MessageType { } message InterceptorMessage { - optional int64 src_id = 1 [ default = 0 ]; - optional int64 dst_id = 2 [ default = 0 ]; + optional sint64 src_id = 1 [ default = 0 ]; + optional sint64 dst_id = 2 [ default = 0 ]; optional MessageType message_type = 3 [ default = RESET ]; optional bool ctrl_message = 4 [ default = false ]; optional int64 scope_idx = 5 [ default = 0 ]; diff --git a/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc b/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc index af707c28acd9e..77fbb23a6c71b 100644 --- a/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/sink_interceptor.cc @@ -30,7 +30,7 @@ SinkInterceptor::SinkInterceptor(int64_t interceptor_id, TaskNode* node) void SinkInterceptor::StopCarrierIfComplete() { bool flag = true; for (const auto& up : upstream_step_) { - flag = flag & (up.second == max_run_times_); + flag = flag && (up.second == max_run_times_); } if (flag) { VLOG(3) << "Sink Interceptor is stopping carrier"; diff --git a/paddle/fluid/distributed/fleet_executor/task_node.cc b/paddle/fluid/distributed/fleet_executor/task_node.cc index 95e4c73305998..232317333ea11 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.cc +++ b/paddle/fluid/distributed/fleet_executor/task_node.cc @@ -74,6 +74,9 @@ void TaskNode::Init(bool use_feed_fetch_ops) { } } +TaskNode::TaskNode(int64_t rank, int64_t task_id, int64_t max_run_times) + : rank_(rank), task_id_(task_id), max_run_times_(max_run_times) {} + TaskNode::TaskNode(int32_t role, const std::vector& op_descs, int64_t rank, int64_t task_id, int64_t max_run_times, diff --git a/paddle/fluid/distributed/fleet_executor/task_node.h b/paddle/fluid/distributed/fleet_executor/task_node.h index 4764d4fd4af87..7dd4b5454567e 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.h +++ b/paddle/fluid/distributed/fleet_executor/task_node.h @@ -32,6 +32,7 @@ namespace distributed { class TaskNode final { public: using OperatorBase = paddle::framework::OperatorBase; + TaskNode(int64_t rank, int64_t task_id, int64_t max_run_times); TaskNode(int32_t role, int64_t rank, int64_t task_id, int64_t max_run_times, int64_t max_slot_nums); TaskNode(int32_t role, const std::vector& op_descs, diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc index ba039385a74ba..35857fc86b5e0 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc @@ -69,32 +69,42 @@ TEST(ComputeInterceptor, Compute) { std::string carrier_id = "0"; Carrier* carrier = GlobalMap::Create(carrier_id, carrier_id); - carrier->Init(0, {{0, 0}, {1, 0}}); + carrier->Init(0, {{SOURCE_ID, 0}, {0, 0}, {1, 0}, {SINK_ID, 0}}); MessageBus* msg_bus = GlobalVal::Create(); msg_bus->Init(0, {{0, "127.0.0.0:0"}}, ""); // FIXME: don't delete, otherwise interceptor will use undefined node + TaskNode* source = + new TaskNode(0, SOURCE_ID, 2); // 
rank, task_id, max_run_times TaskNode* node_a = new TaskNode(0, ops, 0, 0, 2, 0); // role, ops, rank, task_id TaskNode* node_b = new TaskNode(0, 0, 1, 2, 0); + TaskNode* sink = new TaskNode(0, SINK_ID, 2); - // a->b + // source->a->b->sink + source->AddDownstreamTask(0); + node_a->AddUpstreamTask(SOURCE_ID); node_a->AddDownstreamTask(1); node_b->AddUpstreamTask(0); + sink->AddUpstreamTask(1); + node_b->AddDownstreamTask(SINK_ID); + carrier->SetInterceptor( + SOURCE_ID, InterceptorFactory::Create("Source", SOURCE_ID, source)); auto* a = carrier->SetInterceptor( 0, InterceptorFactory::Create("Compute", 0, node_a)); carrier->SetInterceptor(1, InterceptorFactory::Create("Compute", 1, node_b)); + carrier->SetInterceptor(SINK_ID, + InterceptorFactory::Create("Sink", SINK_ID, sink)); a->SetPlace(place); a->SetMicroBatchScope(scopes); // start InterceptorMessage msg; - msg.set_message_type(DATA_IS_READY); - msg.set_src_id(-1); - msg.set_dst_id(0); + msg.set_message_type(START); + msg.set_dst_id(SOURCE_ID); carrier->EnqueueInterceptorMessage(msg); carrier->Wait(); diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc index 3860e9f4e137e..e909744a4b5d6 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc @@ -55,27 +55,39 @@ TEST(AmplifierInterceptor, Amplifier) { std::string carrier_id = "0"; Carrier* carrier = GlobalMap::Create(carrier_id, carrier_id); - carrier->Init(0, {{0, 0}, {1, 0}, {2, 0}, {3, 0}, {4, 0}, {5, 0}}); + carrier->Init(0, {{SOURCE_ID, 0}, + {0, 0}, + {1, 0}, + {2, 0}, + {3, 0}, + {4, 0}, + {5, 0}, + {SINK_ID, 0}}); MessageBus* msg_bus = GlobalVal::Create(); msg_bus->Init(0, {{0, "127.0.0.0:0"}}, "127.0.0.0:0"); int64_t micro_steps = 3; // NOTE: don't delete, otherwise interceptor will use undefined node + TaskNode* source = + new TaskNode(0, SOURCE_ID, micro_steps); // rank, task_id, max_run_times TaskNode* node_a = new TaskNode(0, 0, 0, 1, 0); // role, rank, task_id TaskNode* node_b = new TaskNode(0, 0, 1, 1, 0); TaskNode* node_c = new TaskNode(0, 0, 2, 1, 0); TaskNode* node_d = new TaskNode(0, 0, 3, 1, 0); TaskNode* node_e = new TaskNode(0, 0, 4, 1, 0); TaskNode* node_f = new TaskNode(0, 0, 5, 1, 0); + TaskNode* sink = new TaskNode(0, SINK_ID, micro_steps); - // a->b->c->d->e->f - LinkNodes({node_a, node_b, node_c, node_d, node_e, node_f}); + // source->a->b->c->d->e->f->sink + LinkNodes({source, node_a, node_b, node_c, node_d, node_e, node_f, sink}); // LR->b(1:3)->F->B->e(3:1)->U node_b->SetReplyUpPerSteps(micro_steps); node_e->SetSendDownPerSteps(micro_steps); + carrier->SetInterceptor( + SOURCE_ID, InterceptorFactory::Create("Source", SOURCE_ID, source)); carrier->SetInterceptor(0, InterceptorFactory::Create("Compute", 0, node_a)); carrier->SetInterceptor(1, InterceptorFactory::Create("Amplifier", 1, node_b)); @@ -84,12 +96,13 @@ TEST(AmplifierInterceptor, Amplifier) { carrier->SetInterceptor(4, InterceptorFactory::Create("Amplifier", 4, node_e)); carrier->SetInterceptor(5, InterceptorFactory::Create("Compute", 5, node_f)); + carrier->SetInterceptor(SINK_ID, + InterceptorFactory::Create("Sink", SINK_ID, sink)); // start InterceptorMessage msg; - msg.set_message_type(DATA_IS_READY); - msg.set_src_id(-1); - msg.set_dst_id(0); + msg.set_message_type(START); + msg.set_dst_id(SOURCE_ID); 
carrier->EnqueueInterceptorMessage(msg); carrier->Wait(); carrier->Release(); diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc index b510b68e4e2ed..0e57596bacbe6 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc @@ -73,39 +73,47 @@ TEST(AmplifierInterceptor, Amplifier) { std::string carrier_id = "0"; Carrier* carrier = GlobalMap::Create(carrier_id, carrier_id); - carrier->Init(0, {{0, 0}, {1, 0}, {2, 0}, {3, 0}}); + carrier->Init(0, + {{SOURCE_ID, 0}, {0, 0}, {1, 0}, {2, 0}, {3, 0}, {SINK_ID, 0}}); MessageBus* msg_bus = GlobalVal::Create(); msg_bus->Init(0, {{0, ""}}, ""); int64_t micro_steps = 6; // NOTE: don't delete, otherwise interceptor will use undefined node + TaskNode* source = + new TaskNode(0, SOURCE_ID, micro_steps); // rank, task_id, max_run_times TaskNode* node_a = new TaskNode(0, 0, 0, micro_steps, 0); // role, rank, task_id TaskNode* node_b = new TaskNode(0, 0, 1, 3, 0); TaskNode* node_c = new TaskNode(0, 0, 2, 3, 0); TaskNode* node_d = new TaskNode(0, 0, 3, micro_steps, 0); + TaskNode* sink = new TaskNode(0, SINK_ID, micro_steps); - // a->b->c->d + // source->a->b->c->d->sink // LR->F->B->U - LinkNodes({node_a, node_b, node_c, node_d}, {{{node_b, node_c}, 1}}); + LinkNodes({source, node_a, node_b, node_c, node_d, sink}, + {{{node_b, node_c}, 1}}); node_a->SetRunPerSteps(micro_steps); node_d->SetRunPerSteps(micro_steps); node_d->SetRunAtOffset(micro_steps - 1); + carrier->SetInterceptor( + SOURCE_ID, InterceptorFactory::Create("Source", SOURCE_ID, source)); carrier->SetInterceptor(0, InterceptorFactory::Create("Amplifier", 0, node_a)); carrier->SetInterceptor(1, InterceptorFactory::Create("Compute", 1, node_b)); carrier->SetInterceptor(2, InterceptorFactory::Create("Compute", 2, node_c)); carrier->SetInterceptor(3, InterceptorFactory::Create("Amplifier", 3, node_d)); + carrier->SetInterceptor(SINK_ID, + InterceptorFactory::Create("Sink", SINK_ID, sink)); // start InterceptorMessage msg; - msg.set_message_type(DATA_IS_READY); - msg.set_src_id(-1); - msg.set_dst_id(0); + msg.set_message_type(START); + msg.set_dst_id(SOURCE_ID); carrier->EnqueueInterceptorMessage(msg); carrier->Wait(); carrier->Release(); diff --git a/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc index 6b1a555e987a3..8ff908f90ec85 100644 --- a/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc @@ -39,10 +39,10 @@ class FakeInterceptor : public Interceptor { << std::endl; InterceptorMessage reply; reply.set_message_type(DATA_IS_USELESS); - Send(-1, reply); + Send(SOURCE_ID, reply); InterceptorMessage ready; ready.set_message_type(DATA_IS_READY); - Send(-2, ready); + Send(SINK_ID, ready); } else if (msg.message_type() == DATA_IS_USELESS) { std::cout << "FakeInterceptor remove result in scope " << msg.scope_idx() << std::endl; @@ -57,28 +57,31 @@ TEST(SourceInterceptor, Source) { std::string carrier_id = "0"; Carrier* carrier = GlobalMap::Create(carrier_id, carrier_id); - carrier->Init(0, {{-1, 0}, {0, 0}, {-2, 0}}); + carrier->Init(0, {{SOURCE_ID, 0}, {0, 0}, {SINK_ID, 0}}); MessageBus* msg_bus = GlobalVal::Create(); msg_bus->Init(0, {{0, "127.0.0.0:0"}}, 
""); // NOTE: don't delete, otherwise interceptor will use undefined node - TaskNode* source = new TaskNode(0, -1, 0, 3, 0); // role, rank, task_id - TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0); // role, rank, task_id - TaskNode* sink = new TaskNode(0, -2, 0, 3, 0); // role, rank, task_id + TaskNode* source = + new TaskNode(0, SOURCE_ID, 0, 3, 0); // role, rank, task_id + TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0); // role, rank, task_id + TaskNode* sink = new TaskNode(0, SINK_ID, 0, 3, 0); // role, rank, task_id source->AddDownstreamTask(0, 1); - node_a->AddUpstreamTask(-1, 1); - node_a->AddDownstreamTask(-2, 1); + node_a->AddUpstreamTask(SOURCE_ID, 1); + node_a->AddDownstreamTask(SINK_ID, 1); sink->AddUpstreamTask(0, 1); - carrier->SetInterceptor(-1, InterceptorFactory::Create("Source", -1, source)); + carrier->SetInterceptor( + SOURCE_ID, InterceptorFactory::Create("Source", SOURCE_ID, source)); carrier->SetInterceptor(0, std::make_unique(0, node_a)); - carrier->SetInterceptor(-2, InterceptorFactory::Create("Sink", -2, sink)); + carrier->SetInterceptor(SINK_ID, + InterceptorFactory::Create("Sink", SINK_ID, sink)); // start InterceptorMessage msg; msg.set_message_type(START); - msg.set_dst_id(-1); + msg.set_dst_id(SOURCE_ID); carrier->EnqueueInterceptorMessage(msg); carrier->Wait(); diff --git a/paddle/fluid/distributed/fleet_executor/test/source_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/source_interceptor_test.cc index cf49e97474af0..e9c0437c829d4 100644 --- a/paddle/fluid/distributed/fleet_executor/test/source_interceptor_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/source_interceptor_test.cc @@ -40,7 +40,7 @@ class FakeInterceptor : public Interceptor { << std::endl; InterceptorMessage reply; reply.set_message_type(DATA_IS_USELESS); - Send(-1, reply); + Send(SOURCE_ID, reply); step_++; if (step_ == node_->max_run_times()) { carrier_->WakeUp(); @@ -56,24 +56,26 @@ TEST(SourceInterceptor, Source) { std::string carrier_id = "0"; Carrier* carrier = GlobalMap::Create(carrier_id, carrier_id); - carrier->Init(0, {{-1, 0}, {0, 0}}); + carrier->Init(0, {{SOURCE_ID, 0}, {0, 0}}); MessageBus* msg_bus = GlobalVal::Create(); msg_bus->Init(0, {{0, "127.0.0.0:0"}}, ""); // NOTE: don't delete, otherwise interceptor will use undefined node - TaskNode* source = new TaskNode(0, -1, 0, 3, 0); // role, rank, task_id - TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0); // role, rank, task_id + TaskNode* source = + new TaskNode(0, SOURCE_ID, 0, 3, 0); // role, rank, task_id + TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0); // role, rank, task_id source->AddDownstreamTask(0, 1); - node_a->AddUpstreamTask(-1, 1); - carrier->SetInterceptor(-1, InterceptorFactory::Create("Source", -1, source)); + node_a->AddUpstreamTask(SOURCE_ID, 1); + carrier->SetInterceptor( + SOURCE_ID, InterceptorFactory::Create("Source", SOURCE_ID, source)); carrier->SetInterceptor(0, std::make_unique(0, node_a)); // start InterceptorMessage msg; msg.set_message_type(START); - msg.set_dst_id(-1); + msg.set_dst_id(SOURCE_ID); carrier->EnqueueInterceptorMessage(msg); carrier->Wait(); diff --git a/paddle/fluid/distributed/ps/README.md b/paddle/fluid/distributed/ps/README.md index afa6d60a4e0bb..728ab0ddae287 100755 --- a/paddle/fluid/distributed/ps/README.md +++ b/paddle/fluid/distributed/ps/README.md @@ -10,7 +10,7 @@ Table: for param storage and update ValueAccessor: for pull param and push gradient -----CtrCommonAccessor: pull/push value with show/click, float type 
------DownpourCtrDoubleAccessor: same as CtrCommonAccessor, other than show/click with double type +-----CtrDoubleAccessor: same as CtrCommonAccessor, other than show/click with double type -----SparseAccessor: used for common embedding, pull value without show/click, push value with show/click -----CommMergeAccessor: used for dense table only, for get param dim diff --git a/paddle/fluid/distributed/ps/table/CMakeLists.txt b/paddle/fluid/distributed/ps/table/CMakeLists.txt index aebe36b5e0496..bb6725b08425a 100644 --- a/paddle/fluid/distributed/ps/table/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/table/CMakeLists.txt @@ -42,8 +42,7 @@ set_source_files_properties(ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUT set_source_files_properties(sparse_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(memory_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(sparse_sgd_rule SRCS sparse_sgd_rule.cc DEPS ${TABLE_DEPS} ps_framework_proto) -cc_library(ctr_double_accessor SRCS ctr_double_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule) -cc_library(ctr_accessor SRCS ctr_accessor.cc sparse_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule) +cc_library(ctr_accessor SRCS ctr_accessor.cc ctr_double_accessor.cc sparse_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule) cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_proto ${TABLE_DEPS} fs afs_wrapper ctr_accessor common_table) set_source_files_properties(memory_sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index df55fe93be3d8..d7ceb4a18ea19 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -430,8 +430,9 @@ int32_t GraphTable::add_comm_edge(int64_t src_id, int64_t dst_id) { return -1; } size_t index = src_shard_id - shard_start; - extra_shards[index]->add_graph_node(src_id)->build_edges(false); - extra_shards[index]->add_neighbor(src_id, dst_id, 1.0); + VLOG(0) << "index add edge " << src_id << " " << dst_id; + shards[index]->add_graph_node(src_id)->build_edges(false); + shards[index]->add_neighbor(src_id, dst_id, 1.0); return 0; } int32_t GraphTable::add_graph_node(std::vector &id_list, diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_accessor.cc index 4446c8297c5b3..715abe270e52b 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.cc @@ -35,6 +35,10 @@ int CtrCommonAccessor::Initialize() { common_feature_value.embedx_sgd_dim = _embedx_sgd_rule->Dim(); _show_click_decay_rate = _config.ctr_accessor_param().show_click_decay_rate(); + if (_config.ctr_accessor_param().show_scale()) { + _show_scale = true; + } + InitAccessorInfo(); return 0; } @@ -233,6 +237,11 @@ int32_t CtrCommonAccessor::Update(float** update_values, push_click * _config.ctr_accessor_param().click_coeff(); update_value[common_feature_value.UnseenDaysIndex()] = 0; // TODO(zhaocaibei123): add configure show_scale + if (!_show_scale) { + push_show = 1; + } + VLOG(3) << "accessor show scale:" << _show_scale + << ", push_show:" << push_show; _embed_sgd_rule->UpdateValue( update_value + common_feature_value.EmbedWIndex(), update_value + common_feature_value.EmbedG2SumIndex(), diff --git 
a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc index 740b03a84e461..f0d9426343d7b 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc @@ -20,7 +20,7 @@ namespace paddle { namespace distributed { -int DownpourCtrDoubleAccessor::Initialize() { +int CtrDoubleAccessor::Initialize() { auto name = _config.embed_sgd_param().name(); _embed_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name); _embed_sgd_rule->LoadConfig(_config.embed_sgd_param(), 1); @@ -34,14 +34,18 @@ int DownpourCtrDoubleAccessor::Initialize() { _ssd_unseenday_threshold = _config.ctr_accessor_param().ssd_unseenday_threshold(); + if (_config.ctr_accessor_param().show_scale()) { + _show_scale = true; + } + InitAccessorInfo(); return 0; } -void DownpourCtrDoubleAccessor::InitAccessorInfo() { +void CtrDoubleAccessor::InitAccessorInfo() { auto embedx_dim = _config.embedx_dim(); - _accessor_info.dim = DownpourCtrDoubleFeatureValue::Dim(embedx_dim); - _accessor_info.size = DownpourCtrDoubleFeatureValue::Size(embedx_dim); + _accessor_info.dim = CtrDoubleFeatureValue::Dim(embedx_dim); + _accessor_info.size = CtrDoubleFeatureValue::Size(embedx_dim); _accessor_info.select_dim = 3 + embedx_dim; _accessor_info.select_size = _accessor_info.select_dim * sizeof(float); _accessor_info.update_dim = 4 + embedx_dim; @@ -49,7 +53,7 @@ void DownpourCtrDoubleAccessor::InitAccessorInfo() { _accessor_info.mf_size = (embedx_dim + 1) * sizeof(float); } -bool DownpourCtrDoubleAccessor::Shrink(float* value) { +bool CtrDoubleAccessor::Shrink(float* value) { // auto base_threshold = _config.ctr_accessor_param().base_threshold(); // auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); // auto delete_threshold = _config.ctr_accessor_param().delete_threshold(); @@ -59,38 +63,37 @@ bool DownpourCtrDoubleAccessor::Shrink(float* value) { _config.ctr_accessor_param().delete_after_unseen_days(); auto delete_threshold = _config.ctr_accessor_param().delete_threshold(); // time_decay first - DownpourCtrDoubleFeatureValue::Show(value) *= _show_click_decay_rate; - DownpourCtrDoubleFeatureValue::Click(value) *= _show_click_decay_rate; + CtrDoubleFeatureValue::Show(value) *= _show_click_decay_rate; + CtrDoubleFeatureValue::Click(value) *= _show_click_decay_rate; // shrink after - auto score = ShowClickScore(DownpourCtrDoubleFeatureValue::Show(value), - DownpourCtrDoubleFeatureValue::Click(value)); - auto unseen_days = DownpourCtrDoubleFeatureValue::UnseenDays(value); + auto score = ShowClickScore(CtrDoubleFeatureValue::Show(value), + CtrDoubleFeatureValue::Click(value)); + auto unseen_days = CtrDoubleFeatureValue::UnseenDays(value); if (score < delete_threshold || unseen_days > delete_after_unseen_days) { return true; } return false; } -bool DownpourCtrDoubleAccessor::save_ssd(float* value) { - if (DownpourCtrDoubleFeatureValue::UnseenDays(value) > - _ssd_unseenday_threshold) { +bool CtrDoubleAccessor::SaveSSD(float* value) { + if (CtrDoubleFeatureValue::UnseenDays(value) > _ssd_unseenday_threshold) { return true; } return false; } -// bool DownpourCtrDoubleAccessor::save_cache( +// bool CtrDoubleAccessor::save_cache( // float* value, int param, double global_cache_threshold) { // auto base_threshold = _config.ctr_accessor_param().base_threshold(); // auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); -// if (ShowClickScore(DownpourCtrDoubleFeatureValue::Show(value), -// 
DownpourCtrDoubleFeatureValue::Click(value)) >= base_threshold -// && DownpourCtrDoubleFeatureValue::UnseenDays(value) <= +// if (ShowClickScore(CtrDoubleFeatureValue::Show(value), +// CtrDoubleFeatureValue::Click(value)) >= base_threshold +// && CtrDoubleFeatureValue::UnseenDays(value) <= // delta_keep_days) { -// return DownpourCtrDoubleFeatureValue::Show(value) > +// return CtrDoubleFeatureValue::Show(value) > // global_cache_threshold; // } // return false; // } -bool DownpourCtrDoubleAccessor::Save(float* value, int param) { +bool CtrDoubleAccessor::Save(float* value, int param) { // auto base_threshold = _config.ctr_accessor_param().base_threshold(); // auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); // auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); @@ -109,14 +112,14 @@ bool DownpourCtrDoubleAccessor::Save(float* value, int param) { case 1: // save xbox base case 2: { - if (ShowClickScore(DownpourCtrDoubleFeatureValue::Show(value), - DownpourCtrDoubleFeatureValue::Click(value)) >= + if (ShowClickScore(CtrDoubleFeatureValue::Show(value), + CtrDoubleFeatureValue::Click(value)) >= base_threshold && - DownpourCtrDoubleFeatureValue::DeltaScore(value) >= delta_threshold && - DownpourCtrDoubleFeatureValue::UnseenDays(value) <= delta_keep_days) { + CtrDoubleFeatureValue::DeltaScore(value) >= delta_threshold && + CtrDoubleFeatureValue::UnseenDays(value) <= delta_keep_days) { // do this after save, because it must not be modified when retry if (param == 2) { - DownpourCtrDoubleFeatureValue::DeltaScore(value) = 0; + CtrDoubleFeatureValue::DeltaScore(value) = 0; } return true; } else { @@ -125,10 +128,10 @@ bool DownpourCtrDoubleAccessor::Save(float* value, int param) { } // already decayed in shrink case 3: { - // DownpourCtrFeatureValue::Show(value) *= _show_click_decay_rate; - // DownpourCtrFeatureValue::Click(value) *= _show_click_decay_rate; + // CtrDoubleFeatureValue::Show(value) *= _show_click_decay_rate; + // CtrDoubleFeatureValue::Click(value) *= _show_click_decay_rate; // do this after save, because it must not be modified when retry - // DownpourCtrDoubleFeatureValue::UnseenDays(value)++; + // CtrDoubleFeatureValue::UnseenDays(value)++; return true; } default: @@ -136,7 +139,7 @@ bool DownpourCtrDoubleAccessor::Save(float* value, int param) { }; } -void DownpourCtrDoubleAccessor::UpdateStatAfterSave(float* value, int param) { +void CtrDoubleAccessor::UpdateStatAfterSave(float* value, int param) { auto base_threshold = _config.ctr_accessor_param().base_threshold(); auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); @@ -145,17 +148,17 @@ void DownpourCtrDoubleAccessor::UpdateStatAfterSave(float* value, int param) { } switch (param) { case 1: { - if (ShowClickScore(DownpourCtrDoubleFeatureValue::Show(value), - DownpourCtrDoubleFeatureValue::Click(value)) >= + if (ShowClickScore(CtrDoubleFeatureValue::Show(value), + CtrDoubleFeatureValue::Click(value)) >= base_threshold && - DownpourCtrDoubleFeatureValue::DeltaScore(value) >= delta_threshold && - DownpourCtrDoubleFeatureValue::UnseenDays(value) <= delta_keep_days) { - DownpourCtrDoubleFeatureValue::DeltaScore(value) = 0; + CtrDoubleFeatureValue::DeltaScore(value) >= delta_threshold && + CtrDoubleFeatureValue::UnseenDays(value) <= delta_keep_days) { + CtrDoubleFeatureValue::DeltaScore(value) = 0; } } return; case 3: { - DownpourCtrDoubleFeatureValue::UnseenDays(value)++; + 
CtrDoubleFeatureValue::UnseenDays(value)++; } return; default: @@ -163,123 +166,125 @@ void DownpourCtrDoubleAccessor::UpdateStatAfterSave(float* value, int param) { }; } -int32_t DownpourCtrDoubleAccessor::Create(float** values, size_t num) { +int32_t CtrDoubleAccessor::Create(float** values, size_t num) { auto embedx_dim = _config.embedx_dim(); for (size_t value_item = 0; value_item < num; ++value_item) { float* value = values[value_item]; - value[DownpourCtrDoubleFeatureValue::UnseenDaysIndex()] = 0; - value[DownpourCtrDoubleFeatureValue::DeltaScoreIndex()] = 0; - *(double*)(value + DownpourCtrDoubleFeatureValue::ShowIndex()) = 0; - *(double*)(value + DownpourCtrDoubleFeatureValue::ClickIndex()) = 0; - value[DownpourCtrDoubleFeatureValue::SlotIndex()] = -1; + value[CtrDoubleFeatureValue::UnseenDaysIndex()] = 0; + value[CtrDoubleFeatureValue::DeltaScoreIndex()] = 0; + *(double*)(value + CtrDoubleFeatureValue::ShowIndex()) = 0; + *(double*)(value + CtrDoubleFeatureValue::ClickIndex()) = 0; + value[CtrDoubleFeatureValue::SlotIndex()] = -1; _embed_sgd_rule->InitValue( - value + DownpourCtrDoubleFeatureValue::EmbedWIndex(), - value + DownpourCtrDoubleFeatureValue::EmbedG2SumIndex()); + value + CtrDoubleFeatureValue::EmbedWIndex(), + value + CtrDoubleFeatureValue::EmbedG2SumIndex()); _embedx_sgd_rule->InitValue( - value + DownpourCtrDoubleFeatureValue::EmbedxWIndex(), - value + DownpourCtrDoubleFeatureValue::EmbedxG2SumIndex(), false); + value + CtrDoubleFeatureValue::EmbedxWIndex(), + value + CtrDoubleFeatureValue::EmbedxG2SumIndex(), false); } return 0; } -bool DownpourCtrDoubleAccessor::NeedExtendMF(float* value) { - auto show = - ((double*)(value + DownpourCtrDoubleFeatureValue::ShowIndex()))[0]; - auto click = - ((double*)(value + DownpourCtrDoubleFeatureValue::ClickIndex()))[0]; +bool CtrDoubleAccessor::NeedExtendMF(float* value) { + auto show = ((double*)(value + CtrDoubleFeatureValue::ShowIndex()))[0]; + auto click = ((double*)(value + CtrDoubleFeatureValue::ClickIndex()))[0]; // float score = (show - click) * _config.ctr_accessor_param().nonclk_coeff() auto score = (show - click) * _config.ctr_accessor_param().nonclk_coeff() + click * _config.ctr_accessor_param().click_coeff(); //+ click * _config.ctr_accessor_param().click_coeff(); return score >= _config.embedx_threshold(); } -// from DownpourCtrFeatureValue to DownpourCtrPullValue -int32_t DownpourCtrDoubleAccessor::Select(float** select_values, - const float** values, size_t num) { +// from CtrDoubleFeatureValue to CtrDoublePullValue +int32_t CtrDoubleAccessor::Select(float** select_values, const float** values, + size_t num) { auto embedx_dim = _config.embedx_dim(); for (size_t value_item = 0; value_item < num; ++value_item) { float* select_value = select_values[value_item]; float* value = const_cast(values[value_item]); - select_value[DownpourCtrDoublePullValue::ShowIndex()] = - (float)*(double*)(value + DownpourCtrDoubleFeatureValue::ShowIndex()); - select_value[DownpourCtrDoublePullValue::ClickIndex()] = - (float)*(double*)(value + DownpourCtrDoubleFeatureValue::ClickIndex()); - select_value[DownpourCtrDoublePullValue::EmbedWIndex()] = - value[DownpourCtrDoubleFeatureValue::EmbedWIndex()]; - memcpy(select_value + DownpourCtrDoublePullValue::EmbedxWIndex(), - value + DownpourCtrDoubleFeatureValue::EmbedxWIndex(), + select_value[CtrDoublePullValue::ShowIndex()] = + (float)*(double*)(value + CtrDoubleFeatureValue::ShowIndex()); + select_value[CtrDoublePullValue::ClickIndex()] = + (float)*(double*)(value + 
CtrDoubleFeatureValue::ClickIndex()); + select_value[CtrDoublePullValue::EmbedWIndex()] = + value[CtrDoubleFeatureValue::EmbedWIndex()]; + memcpy(select_value + CtrDoublePullValue::EmbedxWIndex(), + value + CtrDoubleFeatureValue::EmbedxWIndex(), embedx_dim * sizeof(float)); } return 0; } -// from DownpourCtrPushValue to DownpourCtrPushValue +// from CtrDoublePushValue to CtrDoublePushValue // first dim: item // second dim: field num -int32_t DownpourCtrDoubleAccessor::Merge(float** update_values, - const float** other_update_values, - size_t num) { +int32_t CtrDoubleAccessor::Merge(float** update_values, + const float** other_update_values, + size_t num) { auto embedx_dim = _config.embedx_dim(); - size_t total_dim = DownpourCtrDoublePushValue::Dim(embedx_dim); + size_t total_dim = CtrDoublePushValue::Dim(embedx_dim); for (size_t value_item = 0; value_item < num; ++value_item) { float* update_value = update_values[value_item]; const float* other_update_value = other_update_values[value_item]; - /**(double*)(update_value + DownpourCtrDoublePushValue::ShowIndex()) += - *(double*)(other_update_value + DownpourCtrDoublePushValue::ShowIndex()); - *(double*)(update_value + DownpourCtrDoublePushValue::ClickIndex()) += - *(double*)(other_update_value + DownpourCtrDoublePushValue::ClickIndex()); + /**(double*)(update_value + CtrDoublePushValue::ShowIndex()) += + *(double*)(other_update_value + CtrDoublePushValue::ShowIndex()); + *(double*)(update_value + CtrDoublePushValue::ClickIndex()) += + *(double*)(other_update_value + CtrDoublePushValue::ClickIndex()); for (auto i = 3u; i < total_dim; ++i) { update_value[i] += other_update_value[i]; }*/ for (auto i = 0u; i < total_dim; ++i) { - if (i != DownpourCtrDoublePushValue::SlotIndex()) { + if (i != CtrDoublePushValue::SlotIndex()) { update_value[i] += other_update_value[i]; } } } return 0; } -// from DownpourCtrPushValue to DownpourCtrFeatureValue +// from CtrDoublePushValue to CtrDoubleFeatureValue // first dim: item // second dim: field num -int32_t DownpourCtrDoubleAccessor::Update(float** update_values, - const float** push_values, - size_t num) { +int32_t CtrDoubleAccessor::Update(float** update_values, + const float** push_values, size_t num) { auto embedx_dim = _config.embedx_dim(); for (size_t value_item = 0; value_item < num; ++value_item) { float* update_value = update_values[value_item]; const float* push_value = push_values[value_item]; - float push_show = push_value[DownpourCtrDoublePushValue::ShowIndex()]; - float push_click = push_value[DownpourCtrDoublePushValue::ClickIndex()]; - float slot = push_value[DownpourCtrDoublePushValue::SlotIndex()]; - *(double*)(update_value + DownpourCtrDoubleFeatureValue::ShowIndex()) += + float push_show = push_value[CtrDoublePushValue::ShowIndex()]; + float push_click = push_value[CtrDoublePushValue::ClickIndex()]; + float slot = push_value[CtrDoublePushValue::SlotIndex()]; + *(double*)(update_value + CtrDoubleFeatureValue::ShowIndex()) += (double)push_show; - *(double*)(update_value + DownpourCtrDoubleFeatureValue::ClickIndex()) += + *(double*)(update_value + CtrDoubleFeatureValue::ClickIndex()) += (double)push_click; - update_value[DownpourCtrDoubleFeatureValue::SlotIndex()] = slot; - update_value[DownpourCtrDoubleFeatureValue::DeltaScoreIndex()] += + update_value[CtrDoubleFeatureValue::SlotIndex()] = slot; + update_value[CtrDoubleFeatureValue::DeltaScoreIndex()] += (push_show - push_click) * _config.ctr_accessor_param().nonclk_coeff() + push_click * _config.ctr_accessor_param().click_coeff(); 
//(push_show - push_click) * _config.ctr_accessor_param().nonclk_coeff() + // push_click * _config.ctr_accessor_param().click_coeff(); - update_value[DownpourCtrDoubleFeatureValue::UnseenDaysIndex()] = 0; + update_value[CtrDoubleFeatureValue::UnseenDaysIndex()] = 0; + if (!_show_scale) { + push_show = 1; + } + VLOG(3) << "accessor show scale:" << _show_scale + << ", push_show:" << push_show; _embed_sgd_rule->UpdateValue( - update_value + DownpourCtrDoubleFeatureValue::EmbedWIndex(), - update_value + DownpourCtrDoubleFeatureValue::EmbedG2SumIndex(), - push_value + DownpourCtrDoublePushValue::EmbedGIndex(), push_show); + update_value + CtrDoubleFeatureValue::EmbedWIndex(), + update_value + CtrDoubleFeatureValue::EmbedG2SumIndex(), + push_value + CtrDoublePushValue::EmbedGIndex(), push_show); _embedx_sgd_rule->UpdateValue( - update_value + DownpourCtrDoubleFeatureValue::EmbedxWIndex(), - update_value + DownpourCtrDoubleFeatureValue::EmbedxG2SumIndex(), - push_value + DownpourCtrDoublePushValue::EmbedxGIndex(), push_show); + update_value + CtrDoubleFeatureValue::EmbedxWIndex(), + update_value + CtrDoubleFeatureValue::EmbedxG2SumIndex(), + push_value + CtrDoublePushValue::EmbedxGIndex(), push_show); } return 0; } -bool DownpourCtrDoubleAccessor::CreateValue(int stage, const float* value) { +bool CtrDoubleAccessor::CreateValue(int stage, const float* value) { // stage == 0, pull // stage == 1, push if (stage == 0) { return true; } else if (stage == 1) { - auto show = DownpourCtrDoublePushValue::Show(const_cast(value)); - auto click = DownpourCtrDoublePushValue::Click(const_cast(value)); + auto show = CtrDoublePushValue::Show(const_cast(value)); + auto click = CtrDoublePushValue::Click(const_cast(value)); auto score = ShowClickScore(show, click); if (score <= 0) { return false; @@ -293,23 +298,22 @@ bool DownpourCtrDoubleAccessor::CreateValue(int stage, const float* value) { return true; } } -double DownpourCtrDoubleAccessor::ShowClickScore(double show, double click) { +double CtrDoubleAccessor::ShowClickScore(double show, double click) { // auto nonclk_coeff = _config.ctr_accessor_param().nonclk_coeff(); // auto click_coeff = _config.ctr_accessor_param().click_coeff(); auto nonclk_coeff = _config.ctr_accessor_param().nonclk_coeff(); auto click_coeff = _config.ctr_accessor_param().click_coeff(); return (show - click) * nonclk_coeff + click * click_coeff; } -std::string DownpourCtrDoubleAccessor::ParseToString(const float* v, - int param_size) { +std::string CtrDoubleAccessor::ParseToString(const float* v, int param_size) { thread_local std::ostringstream os; os.clear(); os.str(""); os << v[0] << " " << v[1] << " " << (float)((double*)(v + 2))[0] << " " << (float)((double*)(v + 4))[0] << " " << v[6] << " " << v[7] << " " << v[8]; - auto show = DownpourCtrDoubleFeatureValue::Show(const_cast(v)); - auto click = DownpourCtrDoubleFeatureValue::Click(const_cast(v)); + auto show = CtrDoubleFeatureValue::Show(const_cast(v)); + auto click = CtrDoubleFeatureValue::Click(const_cast(v)); auto score = ShowClickScore(show, click); if (score >= _config.embedx_threshold() && param_size > 9) { os << " " << v[9]; @@ -319,23 +323,22 @@ std::string DownpourCtrDoubleAccessor::ParseToString(const float* v, } return os.str(); } -int DownpourCtrDoubleAccessor::ParseFromString(const std::string& str, - float* value) { +int CtrDoubleAccessor::ParseFromString(const std::string& str, float* value) { int embedx_dim = _config.embedx_dim(); float data_buff[_accessor_info.dim + 2]; float* data_buff_ptr = data_buff; 
_embedx_sgd_rule->InitValue( - data_buff_ptr + DownpourCtrDoubleFeatureValue::EmbedxWIndex(), - data_buff_ptr + DownpourCtrDoubleFeatureValue::EmbedxG2SumIndex()); + data_buff_ptr + CtrDoubleFeatureValue::EmbedxWIndex(), + data_buff_ptr + CtrDoubleFeatureValue::EmbedxG2SumIndex()); auto str_len = paddle::string::str_to_float(str.data(), data_buff_ptr); CHECK(str_len >= 6) << "expect more than 6 real:" << str_len; - int show_index = DownpourCtrDoubleFeatureValue::ShowIndex(); - int click_index = DownpourCtrDoubleFeatureValue::ClickIndex(); - int embed_w_index = DownpourCtrDoubleFeatureValue::EmbedWIndex(); + int show_index = CtrDoubleFeatureValue::ShowIndex(); + int click_index = CtrDoubleFeatureValue::ClickIndex(); + int embed_w_index = CtrDoubleFeatureValue::EmbedWIndex(); // no slot, embedx int value_dim = _accessor_info.dim; - int embedx_g2sum_index = DownpourCtrDoubleFeatureValue::EmbedxG2SumIndex(); - value[DownpourCtrDoubleFeatureValue::SlotIndex()] = -1; + int embedx_g2sum_index = CtrDoubleFeatureValue::EmbedxG2SumIndex(); + value[CtrDoubleFeatureValue::SlotIndex()] = -1; // other case if (str_len == (value_dim - 1)) { // copy unseen_days..delta_score @@ -344,8 +347,8 @@ int DownpourCtrDoubleAccessor::ParseFromString(const std::string& str, *(double*)(value + show_index) = (double)data_buff_ptr[2]; *(double*)(value + click_index) = (double)data_buff_ptr[3]; // copy others - value[DownpourCtrDoubleFeatureValue::EmbedWIndex()] = data_buff_ptr[4]; - value[DownpourCtrDoubleFeatureValue::EmbedG2SumIndex()] = data_buff_ptr[5]; + value[CtrDoubleFeatureValue::EmbedWIndex()] = data_buff_ptr[4]; + value[CtrDoubleFeatureValue::EmbedG2SumIndex()] = data_buff_ptr[5]; memcpy(value + embedx_g2sum_index, data_buff_ptr + 6, (embedx_dim + 1) * sizeof(float)); } else { diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h index 3995903463637..c58602065036f 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h @@ -24,9 +24,9 @@ namespace paddle { namespace distributed { -class DownpourCtrDoubleAccessor : public ValueAccessor { +class CtrDoubleAccessor : public ValueAccessor { public: - struct DownpourCtrDoubleFeatureValue { + struct CtrDoubleFeatureValue { /* float unseen_days; float delta_score; @@ -45,60 +45,56 @@ class DownpourCtrDoubleAccessor : public ValueAccessor { } static int UnseenDaysIndex() { return 0; } static int DeltaScoreIndex() { - return DownpourCtrDoubleFeatureValue::UnseenDaysIndex() + 1; + return CtrDoubleFeatureValue::UnseenDaysIndex() + 1; } static int ShowIndex() { - return DownpourCtrDoubleFeatureValue::DeltaScoreIndex() + 1; + return CtrDoubleFeatureValue::DeltaScoreIndex() + 1; } // show is double - static int ClickIndex() { - return DownpourCtrDoubleFeatureValue::ShowIndex() + 2; - } + static int ClickIndex() { return CtrDoubleFeatureValue::ShowIndex() + 2; } // click is double - static int EmbedWIndex() { - return DownpourCtrDoubleFeatureValue::ClickIndex() + 2; - } + static int EmbedWIndex() { return CtrDoubleFeatureValue::ClickIndex() + 2; } static int EmbedG2SumIndex() { - return DownpourCtrDoubleFeatureValue::EmbedWIndex() + 1; + return CtrDoubleFeatureValue::EmbedWIndex() + 1; } static int SlotIndex() { - return DownpourCtrDoubleFeatureValue::EmbedG2SumIndex() + 1; + return CtrDoubleFeatureValue::EmbedG2SumIndex() + 1; } static int EmbedxG2SumIndex() { - return DownpourCtrDoubleFeatureValue::SlotIndex() + 1; + return 
CtrDoubleFeatureValue::SlotIndex() + 1; } static int EmbedxWIndex() { - return DownpourCtrDoubleFeatureValue::EmbedxG2SumIndex() + 1; + return CtrDoubleFeatureValue::EmbedxG2SumIndex() + 1; } static float& UnseenDays(float* val) { - return val[DownpourCtrDoubleFeatureValue::UnseenDaysIndex()]; + return val[CtrDoubleFeatureValue::UnseenDaysIndex()]; } static float& DeltaScore(float* val) { - return val[DownpourCtrDoubleFeatureValue::DeltaScoreIndex()]; + return val[CtrDoubleFeatureValue::DeltaScoreIndex()]; } static double& Show(float* val) { - return ((double*)(val + DownpourCtrDoubleFeatureValue::ShowIndex()))[0]; + return ((double*)(val + CtrDoubleFeatureValue::ShowIndex()))[0]; } static double& Click(float* val) { - return ((double*)(val + DownpourCtrDoubleFeatureValue::ClickIndex()))[0]; + return ((double*)(val + CtrDoubleFeatureValue::ClickIndex()))[0]; } static float& Slot(float* val) { - return val[DownpourCtrDoubleFeatureValue::SlotIndex()]; + return val[CtrDoubleFeatureValue::SlotIndex()]; } static float& EmbedW(float* val) { - return val[DownpourCtrDoubleFeatureValue::EmbedWIndex()]; + return val[CtrDoubleFeatureValue::EmbedWIndex()]; } static float& EmbedG2Sum(float* val) { - return val[DownpourCtrDoubleFeatureValue::EmbedG2SumIndex()]; + return val[CtrDoubleFeatureValue::EmbedG2SumIndex()]; } static float& EmbedxG2Sum(float* val) { - return val[DownpourCtrDoubleFeatureValue::EmbedxG2SumIndex()]; + return val[CtrDoubleFeatureValue::EmbedxG2SumIndex()]; } static float* EmbedxW(float* val) { - return (val + DownpourCtrDoubleFeatureValue::EmbedxWIndex()); + return (val + CtrDoubleFeatureValue::EmbedxWIndex()); } }; - struct DownpourCtrDoublePushValue { + struct CtrDoublePushValue { /* float slot; float show; @@ -110,35 +106,27 @@ class DownpourCtrDoubleAccessor : public ValueAccessor { static int DimSize(int dim, int embedx_dim) { return sizeof(float); } static int Size(int embedx_dim) { return Dim(embedx_dim) * sizeof(float); } static int SlotIndex() { return 0; } - static int ShowIndex() { - return DownpourCtrDoublePushValue::SlotIndex() + 1; - } - static int ClickIndex() { - return DownpourCtrDoublePushValue::ShowIndex() + 1; - } - static int EmbedGIndex() { - return DownpourCtrDoublePushValue::ClickIndex() + 1; - } - static int EmbedxGIndex() { - return DownpourCtrDoublePushValue::EmbedGIndex() + 1; - } + static int ShowIndex() { return CtrDoublePushValue::SlotIndex() + 1; } + static int ClickIndex() { return CtrDoublePushValue::ShowIndex() + 1; } + static int EmbedGIndex() { return CtrDoublePushValue::ClickIndex() + 1; } + static int EmbedxGIndex() { return CtrDoublePushValue::EmbedGIndex() + 1; } static float& Slot(float* val) { - return val[DownpourCtrDoublePushValue::SlotIndex()]; + return val[CtrDoublePushValue::SlotIndex()]; } static float& Show(float* val) { - return val[DownpourCtrDoublePushValue::ShowIndex()]; + return val[CtrDoublePushValue::ShowIndex()]; } static float& Click(float* val) { - return val[DownpourCtrDoublePushValue::ClickIndex()]; + return val[CtrDoublePushValue::ClickIndex()]; } static float& EmbedG(float* val) { - return val[DownpourCtrDoublePushValue::EmbedGIndex()]; + return val[CtrDoublePushValue::EmbedGIndex()]; } static float* EmbedxG(float* val) { - return val + DownpourCtrDoublePushValue::EmbedxGIndex(); + return val + CtrDoublePushValue::EmbedxGIndex(); } }; - struct DownpourCtrDoublePullValue { + struct CtrDoublePullValue { /* float show; float click; @@ -153,20 +141,20 @@ class DownpourCtrDoubleAccessor : public ValueAccessor { static int 
EmbedWIndex() { return 2; } static int EmbedxWIndex() { return 3; } static float& Show(float* val) { - return val[DownpourCtrDoublePullValue::ShowIndex()]; + return val[CtrDoublePullValue::ShowIndex()]; } static float& Click(float* val) { - return val[DownpourCtrDoublePullValue::ClickIndex()]; + return val[CtrDoublePullValue::ClickIndex()]; } static float& EmbedW(float* val) { - return val[DownpourCtrDoublePullValue::EmbedWIndex()]; + return val[CtrDoublePullValue::EmbedWIndex()]; } static float* EmbedxW(float* val) { - return val + DownpourCtrDoublePullValue::EmbedxWIndex(); + return val + CtrDoublePullValue::EmbedxWIndex(); } }; - DownpourCtrDoubleAccessor() {} - virtual ~DownpourCtrDoubleAccessor() {} + CtrDoubleAccessor() {} + virtual ~CtrDoubleAccessor() {} virtual int Initialize(); // 初始化AccessorInfo virtual void InitAccessorInfo(); @@ -182,7 +170,7 @@ class DownpourCtrDoubleAccessor : public ValueAccessor { // update delta_score and unseen_days after save virtual void UpdateStatAfterSave(float* value, int param) override; // 判断该value是否保存到ssd - virtual bool save_ssd(float* value); + virtual bool SaveSSD(float* value); // virtual bool save_cache(float* value, int param, double // global_cache_threshold) override; // keys不存在时,为values生成随机值 @@ -206,14 +194,14 @@ class DownpourCtrDoubleAccessor : public ValueAccessor { virtual float GetField(float* value, const std::string& name) override { CHECK(name == "show"); if (name == "show") { - return (float)DownpourCtrDoubleFeatureValue::Show(value); + return (float)CtrDoubleFeatureValue::Show(value); } return 0.0; } - // DEFINE_GET_INDEX(DownpourCtrDoubleFeatureValue, show) - // DEFINE_GET_INDEX(DownpourCtrDoubleFeatureValue, click) - // DEFINE_GET_INDEX(DownpourCtrDoubleFeatureValue, embed_w) - // DEFINE_GET_INDEX(DownpourCtrDoubleFeatureValue, embedx_w) + // DEFINE_GET_INDEX(CtrDoubleFeatureValue, show) + // DEFINE_GET_INDEX(CtrDoubleFeatureValue, click) + // DEFINE_GET_INDEX(CtrDoubleFeatureValue, embed_w) + // DEFINE_GET_INDEX(CtrDoubleFeatureValue, embedx_w) private: double ShowClickScore(double show, double click); @@ -222,6 +210,7 @@ class DownpourCtrDoubleAccessor : public ValueAccessor { SparseValueSGDRule* _embedx_sgd_rule; float _show_click_decay_rate; int32_t _ssd_unseenday_threshold; + bool _show_scale = false; }; } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/table.cc b/paddle/fluid/distributed/ps/table/table.cc index 307abbdf51e4a..333008482f167 100644 --- a/paddle/fluid/distributed/ps/table/table.cc +++ b/paddle/fluid/distributed/ps/table/table.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/distributed/ps/table/memory_dense_table.h" #include "paddle/fluid/distributed/ps/table/ctr_accessor.h" +#include "paddle/fluid/distributed/ps/table/ctr_double_accessor.h" #include "paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h" #include "paddle/fluid/distributed/ps/table/memory_sparse_table.h" #include "paddle/fluid/distributed/ps/table/sparse_accessor.h" @@ -39,6 +40,7 @@ REGISTER_PSCORE_CLASS(Table, MemorySparseTable); REGISTER_PSCORE_CLASS(Table, MemorySparseGeoTable); REGISTER_PSCORE_CLASS(ValueAccessor, CommMergeAccessor); REGISTER_PSCORE_CLASS(ValueAccessor, CtrCommonAccessor); +REGISTER_PSCORE_CLASS(ValueAccessor, CtrDoubleAccessor); REGISTER_PSCORE_CLASS(ValueAccessor, SparseAccessor); REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule); REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdamSGDRule); diff --git a/paddle/fluid/distributed/the_one_ps.proto 
b/paddle/fluid/distributed/the_one_ps.proto index 197acc1824217..32bf9eaa5aa06 100644 --- a/paddle/fluid/distributed/the_one_ps.proto +++ b/paddle/fluid/distributed/the_one_ps.proto @@ -153,6 +153,7 @@ message CtrAccessorParameter { // will be delete in shrink_model optional int32 ssd_unseenday_threshold = 9 [ default = 1 ]; // threshold to save ssd + optional bool show_scale = 10 [ default = true ]; } message TensorAccessorParameter { diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index ab8c28c33e78c..7769c5371baba 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -45,7 +45,7 @@ 'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t', 'size_t' : 'size_t', \ 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ 'str' : 'std::string', \ - 'Place' : 'paddle::experimental::Place', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ + 'Place' : 'paddle::Place', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ 'int64_t[]' : 'std::vector', 'int[]' : 'std::vector', 'Tensor' : 'Tensor', 'Tensor[]' : 'std::vector', diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index bd31de520750d..54c6e39283ec5 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -730,11 +730,10 @@ def GenerateNodeCreationCodes(self): is_optional = (name in optional_inputs) if is_fwd_input: - need_input_data = "false" if name in self.no_need_buffers else "true" if is_optional: set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()), true);" else: - set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name}, {need_input_data});" + set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name}, true);" set_input_tensor_wrappers_list.append(set_tensor_wrappers) else: if num_fwd_outputs > 1: @@ -1472,9 +1471,6 @@ def GenerateNodeDefinition(self, grad_node_creation_str): grad_node_name = GetGradNodeName(forward_api_name) - if len(grad_node_creation_str) == 0: - grad_node_creation_str = f"if(create_graph) VLOG(3) << \"Higher order grad node for {grad_node_name} has not been implemented yet.\";" - self.node_definition_str = GRAD_FUNCTION_TEMPLATE.format( grad_node_name, fill_zero_str, get_grad_in_args_str, grad_node_name, grad_function_call_str, get_outputs_str, inputs_autograd_meta_str, diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 8075b65b1945b..7ca5fc833ea8d 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -46,7 +46,7 @@ def SkipAPIGeneration(forward_api_name): "std::vector": "CastPyArg2Strings", "paddle::experimental::Scalar": "CastPyArg2Scalar", "paddle::experimental::IntArray": "CastPyArg2IntArray", - "paddle::experimental::Place": "CastPyArg2Place", + "paddle::Place": "CastPyArg2Place", 
"paddle::experimental::DataType": "CastPyArg2DataType", } @@ -100,6 +100,9 @@ def FindParsingFunctionFromAttributeType(atype): {} tstate = PyEval_SaveThread(); + + // Set Device ID +{} auto out = {}({}); @@ -118,6 +121,19 @@ def FindParsingFunctionFromAttributeType(atype): """ +FUNCTION_SET_DEVICE_TEMPLATE = \ +""" + {} + if (paddle::platform::is_gpu_place(place)) {{ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + phi::backends::gpu::SetDeviceId(place.device); + VLOG(1) <<"CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId() << " from " << (int)place.device; +#else + PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU if use CUDAPlace.")); +#endif + }} +""" FUNCTION_NAME_TEMPLATE = \ "{}{}{}" @@ -293,14 +309,23 @@ def GeneratePythonCFunction(self): "false") parse_attributes_str = "" + expected_place_str = "auto place = egr::Controller::Instance().GetExpectedPlace();\n" # Generate Python-C Attributes Parsing Logic for name, atype, _, pos in orig_forward_attrs_list: parsing_function_name = FindParsingFunctionFromAttributeType(atype) + # Used input argument place if specified from Python frontend. + if len(expected_place_str + ) != 0 and parsing_function_name == "CastPyArg2Place": + expected_place_str = "" + assert name == "place", "Only support 'place' as template argument name in FUNCTION_SET_DEVICE_TEMPLATE." + parse_attributes_str += PARSE_PYTHON_C_ARGS_TEMPLATE.format( name, pos, atype, name, parsing_function_name, name, forward_api_name, pos) + set_device_str = FUNCTION_SET_DEVICE_TEMPLATE.format(expected_place_str) + # Generate Dygraph Function Call Logic num_args = len(forward_inputs_position_map.keys()) + len( orig_forward_attrs_list) @@ -326,8 +351,8 @@ def GeneratePythonCFunction(self): "pythonc_record_event", forward_api_name, "pybind_imperative_func") self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( forward_api_name, pythonc_record_event_str, forward_api_name, - get_eager_tensor_str, parse_attributes_str, fwd_function_name, - dygraph_function_call_str, return_str) + get_eager_tensor_str, parse_attributes_str, set_device_str, + fwd_function_name, dygraph_function_call_str, return_str) # Set prefix of forward_api_name to avoid conflicts prefix = self.namespace.strip("::") @@ -361,8 +386,9 @@ def GeneratePythonCFunction(self): self.python_c_function_str += PYTHON_C_FUNCTION_TEMPLATE.format( inplaced_forward_api_name, pythonc_record_event_str, inplaced_forward_api_name, get_eager_tensor_str, - parse_attributes_str, inplaced_fwd_function_name, - dygraph_function_call_str, return_str) + parse_attributes_str, set_device_str, + inplaced_fwd_function_name, dygraph_function_call_str, + return_str) # Generate Python-C Function Registration self.python_c_function_reg_str += "\n," + PYTHON_C_FUNCTION_REG_TEMPLATE.format( diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 6db606edf6f4c..a1df822265309 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -766,7 +766,8 @@ std::vector RunBackward( << ", rank: " << edge_rank.second; node_input_buffers_dict[next_node]->add( - edge_rank.first, edge_rank.second, grad_output_tensor); + edge_rank.first, edge_rank.second, grad_output_tensor, + create_graph); // Update queue node_in_degree_map[next_node]--; diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 27a8c6002e29d..64fb8b53b473c 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ 
b/paddle/fluid/eager/grad_tensor_holder.cc @@ -72,7 +72,8 @@ void GradTensorHolder::CopyValueFromTensor( } void GradTensorHolder::add(size_t slot_id, size_t rank, - const paddle::experimental::Tensor& t) { + const paddle::experimental::Tensor& t, + bool create_graph) { // TODO(jiabin): We need to deal with empty input_buffer with slot size not // empty; PADDLE_ENFORCE(slot_id < buffer_.size(), @@ -113,8 +114,12 @@ void GradTensorHolder::add(size_t slot_id, size_t rank, if (t.is_dense_tensor()) { if (buffer_tensor.is_dense_tensor()) { - buffer_tensor = add_final_state_dygraph_function(t, buffer_tensor); - + if (create_graph) { + buffer_tensor = add_final_state_dygraph_function(t, buffer_tensor); + } else { + paddle::imperative::TensorAdd( + t, &buffer_tensor); + } } else { // TODO(jiabin): Support Other TensorBase later // TODO(zhanlve): Replace SelectedRowsAddTensor with diff --git a/paddle/fluid/eager/grad_tensor_holder.h b/paddle/fluid/eager/grad_tensor_holder.h index a4f2507728c64..80b7c59df8fa0 100644 --- a/paddle/fluid/eager/grad_tensor_holder.h +++ b/paddle/fluid/eager/grad_tensor_holder.h @@ -45,7 +45,8 @@ class GradTensorHolder { GradTensorHolder& operator=(const GradTensorHolder& other) = default; // Create new tensor and copy tensor->impl - void add(size_t slot_id, size_t rank, const paddle::experimental::Tensor& t); + void add(size_t slot_id, size_t rank, const paddle::experimental::Tensor& t, + bool create_graph = false); void CopyValueFromTensor(size_t slot_id, size_t rank, const paddle::experimental::Tensor& t, bool fill_one = false); diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc index 9afe3962faa29..a9a50a3621767 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc @@ -16,7 +16,9 @@ #include "gtest/gtest.h" #include "paddle/fluid/eager/eager_tensor.h" +#include "paddle/fluid/imperative/var_helper.h" #include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); @@ -206,3 +208,28 @@ TEST(EagerVariable, Constructor) { VLOG(6) << "Finish"; } + +TEST(EagerVariable, DataLayout) { + paddle::experimental::Tensor tensor; + phi::DenseTensorMeta meta = + phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1, 1, 1}), + paddle::experimental::DataLayout::UNDEFINED); + std::shared_ptr dt = std::make_shared( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + meta); + auto* dt_ptr = dt->mutable_data(paddle::platform::CPUPlace()); + dt_ptr[0] = 5.0f; + dt_ptr[1] = 5.0f; + dt_ptr[2] = 5.0f; + dt_ptr[3] = 5.0f; + tensor.set_impl(dt); + auto eager_var = std::make_shared(tensor); + auto layout = paddle::imperative::GetDataLayout(eager_var); + CHECK_EQ(layout, paddle::experimental::DataLayout::UNDEFINED); + paddle::imperative::SetDataLayout(eager_var, + paddle::experimental::DataLayout::NCHW); + layout = paddle::imperative::GetDataLayout(eager_var); + CHECK_EQ(layout, paddle::experimental::DataLayout::NCHW); +} diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index 5e790389819f5..287d6e770dea2 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -42,7 +42,7 @@ 
using namespace egr_utils_api; // NOLINT PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index a3e393b039425..d9afd7cc96523 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -41,7 +41,7 @@ PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index d2bef100ca2b5..2d69380cf78d9 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -36,7 +36,7 @@ PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); #endif namespace egr { diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index fbcd920905c9d..3f28b2e8c7398 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -33,6 +33,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/api/all.h" @@ -160,7 +161,18 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, "Input tensor (%s) is not initialized.", in_name)); paddle::experimental::Tensor custom_in; custom_in.set_impl(std::make_shared(*x)); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (custom_in.is_gpu_pinned()) { + VLOG(3) << "Custom Operator: custom input is gpu pinned tensor"; + auto gpu_place = phi::GPUPlace(platform::GetCurrentDeviceId()); + auto custom_gpu_in = custom_in.copy_to(gpu_place, true); + kernel_ctx.EmplaceBackInput(std::move(custom_gpu_in)); + } else { + kernel_ctx.EmplaceBackInput(std::move(custom_in)); + } +#else kernel_ctx.EmplaceBackInput(std::move(custom_in)); +#endif } } diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc index 589d09bf81c1d..1a4f283f511da 100644 --- a/paddle/fluid/framework/data_device_transform.cc +++ b/paddle/fluid/framework/data_device_transform.cc @@ -34,14 +34,6 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place, return; } - // NOTE(hqp): Special case for CPU->MLU, avoid stream sync. 
- if (platform::is_cpu_place(in.place()) && platform::is_mlu_place(dst_place)) { - paddle::framework::TensorCopy( - in, dst_place, *platform::DeviceContextPool::Instance().Get(dst_place), - out); - return; - } - // NOTE(yy): TransDataDevice should wait for computation of input. if (!platform::is_cuda_pinned_place(in.place())) { platform::DeviceContextPool::Instance().Get(in.place())->Wait(); diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index e8f84caafcea2..dd3df2460f00e 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -134,15 +134,6 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, const Tensor& in, Tensor* out, platform::Place place, bool always_copy) { - PADDLE_ENFORCE_NE(in.format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Input tensor format is invalid. Input tensor should " - "have specified memory format.")); - PADDLE_ENFORCE_NE(in.format(), MKLDNNMemoryFormat::any, - platform::errors::InvalidArgument( - "Input tensor format is invalid. Input tensor should " - "have specified memory format.")); - // Set default as NCHW in case not specified out_layout = out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout; @@ -162,22 +153,24 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, "Input tensor type (%s) is not supported.", DataTypeToString(framework::TransToProtoVarType(in.dtype())))); - auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format()); auto out_format = platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); + dnnl::memory::desc out_mem_desc(out_tz, in_type, out_format); // output tensor has the same dims as input. 
Reorder don't change dims + out->set_mem_desc(out_mem_desc); out->Resize(in.dims()); - if ((in_format != out_format) || always_copy) { + if ((in.mem_desc() != out->mem_desc()) || always_copy) { void* in_data = GetDataFromTensor(in, in_type); platform::ReorderMKLDNNHandler handler( in_tz, framework::TransToProtoVarType(in.dtype()), in_type, cpu_engine); - auto reorder_src_memory_p = handler.AcquireSrcMemory(in_format, in_data); + auto reorder_src_memory_p = + handler.AcquireSrcMemory(in.mem_desc(), in_data); auto reorder_dst_memory_p = - handler.AcquireDstMemory(out, out_format, place); + handler.AcquireDstMemory(out, out->mem_desc(), place); auto reorder_p = handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 2c795c946235f..63e289af45209 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -70,8 +70,10 @@ void TransformData(const OpKernelType &expected_kernel_type, paddle::platform::MKLDNNDeviceContext::tls() .set_cur_paddle_data_layout(lin); } - out.set_layout(DataLayout::kMKLDNN); - out.set_format(out_format); + dnnl::memory::desc out_mem_desc( + vectorize(out.dims()), + ToMKLDNNDataType(TransToProtoVarType(in.type())), out_format); + out.set_mem_desc(out_mem_desc); } else { // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel // Do transform via MKLDNN lib @@ -121,8 +123,9 @@ void SetTensorToVariable(const Variable &in_var, const Tensor &tensor, tran_lod_tensor->set_lod(in_lod_tensor.lod()); tran_lod_tensor->set_layout(in_lod_tensor.layout()); #ifdef PADDLE_WITH_MKLDNN - tran_lod_tensor->set_format(in_lod_tensor.format()); + tran_lod_tensor->set_mem_desc(in_lod_tensor.mem_desc()); #endif + tran_lod_tensor->set_layout(in_lod_tensor.layout()); tran_lod_tensor->ShareDataWith(tensor); } else if (in_var.IsType()) { auto &in_selected_rows = in_var.Get(); diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 9b0a033856d73..fff78dd872c99 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -258,6 +258,7 @@ message CtrAccessorParameter { [ default = 0.8 ]; // threshold to shrink a feasign optional float delete_after_unseen_days = 8 [ default = 30 ]; optional int32 ssd_unseenday_threshold = 9 [ default = 1 ]; + optional bool show_scale = 10 [ default = true ]; } message TableAccessorSaveParameter { diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index c3304e3f9021d..2e9104f40cc60 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -12,15 +12,19 @@ else() endif(WITH_PSLIB) if(WITH_HETERPS) - if(WITH_NCCL) + if(WITH_NCCL AND WITH_GPU) nv_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) add_subdirectory(heter_ps) + elseif(WITH_XPU_KP) + xpu_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.kps ps_gpu_wrapper.cc + DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) + add_subdirectory(heter_ps) elseif(WITH_RCCL) hip_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) add_subdirectory(heter_ps) - endif(WITH_NCCL) + endif() else() cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc DEPS gloo_wrapper) endif(WITH_HETERPS) diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt 
b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index e90d864fa1ab7..70b067b0494f1 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -13,16 +13,28 @@ IF(WITH_GPU) nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) if(WITH_PSCORE) - nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table) + nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table hashtable_kernel) nv_library(graph_sampler SRCS graph_sampler_inl.h DEPS graph_gpu_ps) - #nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps) - #nv_test(test_cpu_graph_sample SRCS test_cpu_graph_sample.cu DEPS graph_gpu_ps) - #nv_test(test_sample_rate SRCS test_sample_rate.cu DEPS graph_gpu_ps) - # ADD_EXECUTABLE(test_sample_rate test_sample_rate.cu) - # target_link_libraries(test_sample_rate graph_gpu_ps graph_sampler) - # nv_test(test_graph_xx SRCS test_xx.cu DEPS graph_gpu_ps graph_sampler) - endif() + nv_test(test_cpu_query SRCS test_cpu_query.cu DEPS heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS}) + #ADD_EXECUTABLE(test_sample_rate test_sample_rate.cu) + #target_link_libraries(test_sample_rate heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS}) + #nv_test(test_sample_rate SRCS test_sample_rate.cu DEPS heter_comm table heter_comm_kernel hashtable_kernel heter_ps ${HETERPS_DEPS}) + #ADD_EXECUTABLE(test_cpu_query test_cpu_query.cu) + #target_link_libraries(test_cpu_query graph_gpu_ps) + endif() +ENDIF() +IF(WITH_XPU_KP) + SET(HETERPS_DEPS device_context) + xpu_library(heter_comm_kernel SRCS heter_comm_kernel.h heter_comm_kernel.kps feature_value.h) + xpu_library(hashtable_kernel SRCS hashtable.h hashtable_kernel.kps) + cc_library(heter_comm SRCS heter_comm.h heter_resource.cc DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel) + # Change heter_ps.cu file suffix + # NOTE(zhangminxu): If we compile with XPU_KP, we directly copy heter_ps.cu to heter_ps.cc + file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/heter_ps.cu DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/) + file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/heter_ps.cu ${CMAKE_CURRENT_BINARY_DIR}/heter_ps.cc) + cc_library(heter_ps SRCS heter_ps.cc DEPS heter_comm) + # xpu_library(heter_comm SRCS heter_comm.h heter_comm_kernel.kps feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS}) ENDIF() IF(WITH_ROCM) hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) diff --git a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h index 266508eb4de6c..4ad32d1714f7d 100644 --- a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h @@ -551,6 +551,7 @@ class concurrent_unordered_map : public managed { update_existing_value(existing_value, x, op); insert_success = true; + break; } current_index = (current_index + 1) % hashtbl_size; diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h index 27f14e8726d9c..5b8a20f7b9970 100644 --- a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -16,6 +16,7 @@ #ifdef PADDLE_WITH_HETERPS #include 
#include +#include #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cuda_device_guard.h" @@ -41,6 +42,24 @@ struct GpuPsCommGraph { node_list(node_list_), neighbor_size(neighbor_size_), node_size(node_size_) {} + void display_on_cpu() { + VLOG(0) << "neighbor_size = " << neighbor_size; + VLOG(0) << "node_size = " << node_size; + for (int i = 0; i < neighbor_size; i++) { + VLOG(0) << "neighbor " << i << " " << neighbor_list[i]; + } + for (int i = 0; i < node_size; i++) { + VLOG(0) << "node i " << node_list[i].node_id + << " neighbor_size = " << node_list[i].neighbor_size; + std::string str; + int offset = node_list[i].neighbor_offset; + for (int j = 0; j < node_list[i].neighbor_size; j++) { + if (j > 0) str += ","; + str += std::to_string(neighbor_list[j + offset]); + } + VLOG(0) << str; + } + } }; /* diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index cd55d09608f54..4eb42d80a00b5 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -18,6 +18,7 @@ #include "heter_comm.h" #include "paddle/fluid/distributed/ps/table/common_graph_table.h" #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h" #include "paddle/fluid/platform/enforce.h" #ifdef PADDLE_WITH_HETERPS namespace paddle { @@ -28,10 +29,10 @@ class GpuPsGraphTable : public HeterComm { : HeterComm(1, resource) { load_factor_ = 0.25; rw_lock.reset(new pthread_rwlock_t()); - gpu_num = resource_->total_gpu(); + gpu_num = resource_->total_device(); cpu_table_status = -1; if (topo_aware) { - int total_gpu = resource_->total_gpu(); + int total_gpu = resource_->total_device(); std::map device_map; for (int i = 0; i < total_gpu; i++) { device_map[resource_->dev_id(i)] = i; @@ -62,7 +63,7 @@ class GpuPsGraphTable : public HeterComm { node.key_storage = NULL; node.val_storage = NULL; node.sync = 0; - node.gpu_num = transfer_id; + node.dev_num = transfer_id; } nodes.push_back(Node()); Node &node = nodes.back(); @@ -71,7 +72,7 @@ class GpuPsGraphTable : public HeterComm { node.key_storage = NULL; node.val_storage = NULL; node.sync = 0; - node.gpu_num = j; + node.dev_num = j; } } } diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h index c235378def51f..37067dc36543c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h @@ -28,14 +28,16 @@ sample_result is to save the neighbor sampling result, its size is len * sample_size; */ - __global__ void neighbor_sample_example(GpuPsCommGraph graph, int* node_index, int* actual_size, int64_t* res, int sample_len, int* sample_status, int n, int from) { - // printf("%d %d %d\n",blockIdx.x,threadIdx.x,threadIdx.y); int id = blockIdx.x * blockDim.y + threadIdx.y; if (id < n) { + if (node_index[id] == -1) { + actual_size[id] = 0; + return; + } curandState rng; curand_init(blockIdx.x, threadIdx.x, threadIdx.y, &rng); int index = threadIdx.x; @@ -305,7 +307,6 @@ __global__ void fill_dvalues(int64_t* d_shard_vals, int64_t* d_vals, const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { d_actual_sample_size[idx[i]] = d_shard_actual_sample_size[i]; - // d_vals[idx[i]] = d_shard_vals[i]; for (int j = 0; 
j < sample_size; j++) { d_vals[idx[i] * sample_size + j] = d_shard_vals[i * sample_size + j]; } @@ -351,7 +352,7 @@ void GpuPsGraphTable::build_graph_from_cpu( VLOG(0) << "in build_graph_from_cpu cpu_graph_list size = " << cpu_graph_list.size(); PADDLE_ENFORCE_EQ( - cpu_graph_list.size(), resource_->total_gpu(), + cpu_graph_list.size(), resource_->total_device(), platform::errors::InvalidArgument("the cpu node list size doesn't match " "the number of gpu on your machine.")); clear_graph_info(); @@ -378,6 +379,7 @@ void GpuPsGraphTable::build_graph_from_cpu( build_ps(i, keys.data(), offset.data(), keys.size(), 1024, 8); gpu_graph_list[i].node_size = cpu_graph_list[i].node_size; } else { + build_ps(i, NULL, NULL, 0, 1024, 8); gpu_graph_list[i].node_list = NULL; gpu_graph_list[i].node_size = 0; } @@ -442,7 +444,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, // cudaMalloc((void**)&result->actual_sample_size, len * sizeof(int)); int* actual_sample_size = result->actual_sample_size; int64_t* val = result->val; - int total_gpu = resource_->total_gpu(); + int total_gpu = resource_->total_device(); // int dev_id = resource_->dev_id(gpu_id); auto stream = resource_->local_stream(gpu_id, 0); @@ -472,9 +474,11 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, split_input_to_shard(key, d_idx_ptr, len, d_left_ptr, d_right_ptr, gpu_id); - fill_shard_key<<>>(d_shard_keys_ptr, key, - d_idx_ptr, len); - + // fill_shard_key<<>>(d_shard_keys_ptr, + // key, + // d_idx_ptr, len); + heter_comm_kernel_->fill_shard_key(d_shard_keys_ptr, key, d_idx_ptr, len, + stream); cudaStreamSynchronize(stream); cudaMemcpy(h_left, d_left_ptr, total_gpu * sizeof(int), @@ -510,6 +514,9 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, */ create_storage(gpu_id, i, shard_len * sizeof(int64_t), shard_len * (1 + sample_size) * sizeof(int64_t)); + auto& node = path_[gpu_id][i].nodes_[0]; + cudaMemsetAsync(node.val_storage, -1, shard_len * sizeof(int), + node.in_stream); } // auto end1 = std::chrono::steady_clock::now(); // auto tt = std::chrono::duration_cast(end1 - @@ -532,7 +539,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, h_right[i] - h_left[i] + 1, resource_->remote_stream(i, gpu_id)); // node.in_stream); - auto shard_len = h_right[i] - h_left[i] + 1; + int shard_len = h_right[i] - h_left[i] + 1; auto graph = gpu_graph_list[i]; int* id_array = reinterpret_cast(node.val_storage); int* actual_size_array = id_array + shard_len; @@ -595,20 +602,13 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, // auto& node = path_[gpu_id][i].nodes_.back(); // cudaStreamSynchronize(node.in_stream); cudaStreamSynchronize(resource_->remote_stream(i, gpu_id)); - // tables_[i]->rwlock_->UNLock(); } - // walk_to_src(num, total_gpu, h_left, h_right, d_shard_vals_ptr); move_neighbor_sample_result_to_source_gpu(gpu_id, total_gpu, sample_size, h_left, h_right, d_shard_vals_ptr, d_shard_actual_sample_size_ptr); - fill_dvalues<<>>( d_shard_vals_ptr, val, d_shard_actual_sample_size_ptr, actual_sample_size, d_idx_ptr, sample_size, len); - // cudaStreamSynchronize(stream); - // auto end2 = std::chrono::steady_clock::now(); - // tt = std::chrono::duration_cast(end2 - end1); - // VLOG(0)<< "sample graph time " << tt.count() << " us"; for (int i = 0; i < total_gpu; ++i) { int shard_len = h_left[i] == -1 ? 
0 : h_right[i] - h_left[i] + 1; if (shard_len == 0) { diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index cac1b9c17e077..fc54be447fe17 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -297,12 +297,17 @@ void HashTable::update(const KeyType* d_keys, } template class HashTable; +template class HashTable; template void HashTable::get< cudaStream_t>(const unsigned long* d_keys, paddle::framework::FeatureValue* d_vals, size_t len, cudaStream_t stream); +template void HashTable::get(const long* d_keys, + int* d_vals, size_t len, + cudaStream_t stream); + // template void // HashTable::get( // const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t @@ -313,6 +318,11 @@ template void HashTable::insert< const paddle::framework::FeatureValue* d_vals, size_t len, cudaStream_t stream); +template void HashTable::insert(const long* d_keys, + const int* d_vals, + size_t len, + cudaStream_t stream); + // template void HashTable::insert< // cudaStream_t>(const unsigned long* d_keys, size_t len, char* pool, diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps index 55edf883271b9..e879d817b14dd 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps @@ -48,7 +48,7 @@ __device__ void update_lr(float& w, float& g2sum, float g, // NOLINT GM2LM(optimizer_config::learning_rate, &local_learning_rate, sizeof(float)); GM2LM(optimizer_config::initial_g2sum, &local_initial_g2sum, sizeof(float)); GM2LM(optimizer_config::min_bound, &local_min_bound, sizeof(float)); - GM2LM(optimizr_config::max_bound, &local_max_bound, sizeof(float)); + GM2LM(optimizer_config::max_bound, &local_max_bound, sizeof(float)); double add_g2sum = 0; double ratio = local_learning_rate * @@ -136,7 +136,7 @@ __device__ void update_value(ValType& val, const GradType& grad) { // NOLINT template __global__ void insert_kernel(Table* table, const KeyType* const keys, - const ValType* const vals, size_t len) { + const ValType* const vals, long long len) { int cid = core_id(); int ncores = core_num(); if (cid >= ncores) { @@ -164,7 +164,7 @@ __global__ void insert_kernel(Table* table, const KeyType* const keys, template __global__ void search_kernel(Table* table, const KeyType* const keys, - ValType* const vals, size_t len) { + ValType* const vals, long long len) { int cid = core_id(); int ncores = core_num(); if (cid >= ncores) { @@ -194,7 +194,7 @@ __global__ void search_kernel(Table* table, const KeyType* const keys, template __global__ void update_kernel(Table* table, const KeyType* const keys, - const GradType* const grads, size_t len) { + const GradType* const grads, long long len) { int cid = core_id(); int ncores = core_num(); if (cid >= ncores) { @@ -251,7 +251,10 @@ void HashTable::get(const KeyType* d_keys, ValType* d_vals, if (len == 0) { return; } - search_kernel<<<4, 64, stream>>>(container_, d_keys, d_vals, len); + long long c_len = (long long)len; + search_kernel><<<4, 64, stream>>>( + container_, d_keys, d_vals, c_len); } template @@ -272,7 +275,10 @@ void HashTable::insert(const KeyType* d_keys, if (len == 0) { return; } - insert_kernel<<<4, 64, stream>>>(container_, d_keys, d_vals, len); + long long c_len = (long long)len; + insert_kernel><<<4, 64, stream>>>( + container_, d_keys, d_vals, 
c_len); } template @@ -289,7 +295,10 @@ void HashTable::update(const KeyType* d_keys, if (len == 0) { return; } - update_kernel<<<4, 64, stream>>>(container_, d_keys, d_grads, len); + long long c_len = (long long)len; + update_kernel, + GradType><<<4, 64, stream>>>(container_, d_keys, d_grads, + c_len); } template diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 5e4be02962ea9..338009250bc4f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -153,11 +153,13 @@ class HeterComm { #if defined(PADDLE_WITH_CUDA) platform::CUDAPlace place_; + #elif defined(PADDLE_WITH_XPU_KP) platform::XPUPlace place_; #endif std::shared_ptr all_keys_mem; std::shared_ptr all_grads_mem; + KeyType* all_keys; GradType* all_grads; @@ -210,10 +212,10 @@ class HeterComm { std::vector> path_; float load_factor_{0.75}; int block_size_{256}; - int topo_aware_{0}; + std::unique_ptr heter_comm_kernel_; private: - std::unique_ptr heter_comm_kernel_; + int topo_aware_{0}; std::vector storage_; int feanum_{1800 * 2048}; int multi_node_{0}; @@ -228,5 +230,7 @@ class HeterComm { } // end namespace framework } // end namespace paddle + #include "paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h" + #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 1e66b3cb25031..551b5c38895a9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -411,7 +411,6 @@ void HeterComm::merge_grad( auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); - auto d_merge_grads = memory::Alloc(place, len * sizeof(GradType)); GradType* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); @@ -1035,7 +1034,6 @@ int HeterComm::gather_multi_node_grad( merge_grad(gpu_num, storage.local_keys, storage.local_grads, merge_num, ret); return ret; } - #endif template @@ -1065,7 +1063,6 @@ void HeterComm::end_pass() { // platform::CUDADeviceGuard guard(dev_id); // tables_[index]->dump_to_cpu(dev_id, stream); //} - } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index 694bdb8d563f5..bdeb696a92bce 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -218,6 +218,14 @@ template void HeterCommKernel::calc_shard_index< int* shard_index, int total_devs, const cudaStream_t& stream); +template void HeterCommKernel::calc_shard_index( + long* d_keys, long long len, int* shard_index, int total_devs, + const cudaStream_t& stream); + +template void HeterCommKernel::fill_shard_key( + long* d_shard_keys, long* d_keys, int* idx, long long len, + const cudaStream_t& stream); + template void HeterCommKernel::fill_shard_key( unsigned long* d_shard_keys, unsigned long* d_keys, int* idx, long long len, const cudaStream_t& stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h index 1be3687a7dbee..9d2ee5d272c72 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h @@ -41,6 +41,7 @@ class HeterCommKernel { 
template void calc_shard_index(KeyType* d_keys, long long len, T* shard_index, + int total_devs, const StreamType& stream); template @@ -62,6 +63,7 @@ class HeterCommKernel { const KeyT* d_keys_in, KeyT* d_keys_out, const ValueT* d_values_in, ValueT* d_values_out, int num_items, int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8, StreamType stream = NULL, bool debug_synchronous = false); @@ -75,6 +77,7 @@ class HeterCommKernel { ValuesInputIteratorT d_values_in, AggregatesOutputIteratorT d_aggregates_out, NumRunsOutputIteratorT d_num_runs_out, int num_items, + StreamType stream = NULL, bool debug_synchronous = false); private: diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps index a1923a7f6019b..f73757902fef6 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps @@ -233,8 +233,6 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, } } -// xpu implementation of heter_comm_kernel.h - template void HeterCommKernel::fill_idx(T* idx, long long len, const StreamType& stream) { @@ -291,17 +289,21 @@ void HeterCommKernel::sort_pairs(void* d_temp_storage, bool debug_synchronous) {} template +void HeterCommKernel::reduce_by_key(void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + KeysInputIteratorT d_keys_in, + UniqueOutputIteratorT d_unique_out, + ValuesInputIteratorT d_values_in, + AggregatesOutputIteratorT d_aggregates_out, + NumRunsOutputIteratorT d_num_runs_out, + int num_items, StreamType stream, + bool debug_synchronous) {} template void HeterCommKernel::fill_idx( int* idx, long long len, const XPUStream& stream); + template void HeterCommKernel::calc_shard_offset( int* idx, int* left, int* right, long long len, int total_devs, const XPUStream& stream); @@ -312,12 +314,14 @@ template void HeterCommKernel::calc_shard_index( template void HeterCommKernel::fill_shard_key( unsigned long* d_shard_keys, unsigned long* d_keys, int* idx, long long len, const XPUStream& stream); + template void HeterCommKernel::fill_shard_grads< unsigned long, paddle::framework::FeaturePushValue, int, XPUStream>( unsigned long* d_shard_keys, unsigned long* d_keys, paddle::framework::FeaturePushValue* d_shard_grads, paddle::framework::FeaturePushValue* d_grads, int* idx, long long len, const XPUStream& stream); + template void HeterCommKernel::fill_dvals( paddle::framework::FeatureValue* d_shard_vals, @@ -348,9 +352,8 @@ template void HeterCommKernel::reduce_by_key< size_t& temp_storage_bytes, // NOLINT unsigned long* d_keys_in, unsigned long* d_unique_out, paddle::framework::FeaturePushValue* d_values_in, - paddle::framework::FeaturePushValue* d_aggregates_out, - int* d_num_runs_out int num_items, XPUStream stream, - bool debug_synchronous); + paddle::framework::FeaturePushValue* d_aggregates_out, int* d_num_runs_out, + int num_items, XPUStream stream, bool debug_synchronous); #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc index 7074cfb521bdf..b330c9bb9f5ef 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc @@ -69,6 +69,7 @@ XPUResource::XPUResource(std::vector& dev_ids, int index) { platform::XPUDeviceGuard guard(dev_id_); local_streams_.resize(dev_ids_.size()); + comm_streams_.resize(dev_ids_.size(), NULL); remote_streams_.resize(dev_ids_.size()); 
@@ -84,6 +85,7 @@ XPUResource::~XPUResource() { for (size_t i = 0; i < local_streams_.size(); ++i) { PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(local_streams_[i])); } + // for (size_t i = 0; i < comm_streams_.size(); ++i) { // PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(comm_streams_[i])); // } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h index 164fca2276800..17bc12a5af1a7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h @@ -36,6 +36,7 @@ namespace framework { #if defined(PADDLE_WITH_CUDA) using ppStream = cudaStream_t; + #elif defined(PADDLE_WITH_XPU_KP) using ppStream = XPUStream; #endif @@ -61,6 +62,7 @@ class GPUResource { std::vector local_streams_; std::vector comm_streams_; }; + #elif defined(PADDLE_WITH_XPU_KP) class XPUResource { public: @@ -105,6 +107,7 @@ class HeterPsResource { int get_index_by_devid(int devid); int dev_id(int num); void set_multi_mf(int multi_mf_dim, int max_mf_dim); + ppStream local_stream(int dev_num, int stream_num); ppStream remote_stream(int dev_num, int stream_num); ppStream comm_stream(int dev_num, int stream_num); diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu new file mode 100644 index 0000000000000..d812542f17ba0 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_query.cu @@ -0,0 +1,82 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" +#include "paddle/fluid/platform/cuda_device_guard.h" + +using namespace paddle::framework; +namespace platform = paddle::platform; +// paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph +// paddle::framework::GpuPsCommGraph GraphTable::make_gpu_ps_graph( +// std::vector ids) +TEST(TEST_FLEET, test_cpu_cache) { + int gpu_num = 0; + int st = 0, u = 0; + std::vector device_id_mapping; + for (int i = 0; i < 2; i++) device_id_mapping.push_back(i); + gpu_num = device_id_mapping.size(); + ::paddle::distributed::GraphParameter table_proto; + table_proto.set_shard_num(24); + std::shared_ptr resource = + std::make_shared(device_id_mapping); + resource->enable_p2p(); + int use_nv = 1; + GpuPsGraphTable g(resource, use_nv); + g.init_cpu_table(table_proto); + std::vector vec; + int n = 10; + std::vector ids0, ids1; + for (int i = 0; i < n; i++) { + g.cpu_graph_table->add_comm_edge(i, (i + 1) % n); + g.cpu_graph_table->add_comm_edge(i, (i - 1 + n) % n); + if (i % 2 == 0) ids0.push_back(i); + } + ids1.push_back(5); + vec.push_back(g.cpu_graph_table->make_gpu_ps_graph(ids0)); + vec.push_back(g.cpu_graph_table->make_gpu_ps_graph(ids1)); + vec[0].display_on_cpu(); + vec[1].display_on_cpu(); + g.build_graph_from_cpu(vec); + int64_t cpu_key[3] = {0, 1, 2}; + void *key; + platform::CUDADeviceGuard guard(0); + cudaMalloc((void **)&key, 3 * sizeof(int64_t)); + cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); + auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 2, 3); + int64_t *res = new int64_t[7]; + cudaMemcpy(res, neighbor_sample_res->val, 3 * 2 * sizeof(int64_t), + cudaMemcpyDeviceToHost); + int *actual_sample_size = new int[3]; + cudaMemcpy(actual_sample_size, neighbor_sample_res->actual_sample_size, + 3 * sizeof(int), + cudaMemcpyDeviceToHost); // 3, 1, 3 + + //{0,9} or {9,0} is expected for key 0 + //{0,2} or {2,0} is expected for key 1 + //{1,3} or {3,1} is expected for key 2 + for (int i = 0; i < 3; i++) { + VLOG(0) << "actual sample size for " << i << " is " + << actual_sample_size[i]; + for (int j = 0; j < actual_sample_size[i]; j++) { + VLOG(0) << "sampled an neighbor for node" << i << " : " << res[i * 2 + j]; + } + } +} diff --git a/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu b/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu index 887bda4be4a89..07e561fb3b050 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu @@ -86,6 +86,7 @@ void testSampleRate() { int start = 0; pthread_rwlock_t rwlock; pthread_rwlock_init(&rwlock, NULL); + { ::paddle::distributed::GraphParameter table_proto; // table_proto.set_gpups_mode(false); @@ -93,9 +94,9 @@ void testSampleRate() { table_proto.set_task_pool_size(24); std::cerr << "initializing begin"; distributed::GraphTable graph_table; - graph_table.initialize(table_proto); + graph_table.Initialize(table_proto); std::cerr << "initializing done"; - graph_table.load(input_file, std::string("e>")); + graph_table.Load(input_file, std::string("e>")); int sample_actual_size = -1; int step = fixed_key_size, cur = 0; while (sample_actual_size != 0) { diff --git 
a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 5e1a08f33e3ef..52bfe42cc5028 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -31,6 +31,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/platform/timer.h" @@ -690,7 +691,6 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } #endif VLOG(3) << "GpuPs build hbmps done"; - }; if (multi_mf_dim_) { @@ -753,7 +753,9 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { } std::vector threads(device_num); HeterPs_ = HeterPsBase::get_instance(size_max, resource_); +#ifdef PADDLE_WITH_CUDA HeterPs_->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_); +#endif auto build_func = [this, &gpu_task, &feature_keys_count](int i) { VLOG(3) << "building table: " << i; this->HeterPs_->build_ps(i, gpu_task->device_keys_[i].data(), @@ -891,18 +893,27 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, const std::vector& values, const std::vector& slot_lengths, const int hidden_size) { - VLOG(3) << "Begine Gpu Ps PullSparse"; platform::Timer all_timer; platform::Timer pull_gpups_timer; all_timer.Start(); int64_t total_length = std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); +#ifdef PADDLE_WITH_CUDA + VLOG(3) << "Begine Gpu Ps PullSparse"; auto buf = memory::Alloc(place, total_length * sizeof(FeatureValue)); FeatureValue* total_values_gpu = reinterpret_cast(buf->ptr()); +#endif +#ifdef PADDLE_WITH_XPU_KP + VLOG(3) << "Begine Xpu Ps PullSparse"; + FeatureValue* total_values_gpu = nullptr; + xpu_malloc(reinterpret_cast(&total_values_gpu), + total_length * sizeof(FeatureValue)); +#endif if (platform::is_cpu_place(place)) { PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in GpuPs now.")); } else if (platform::is_gpu_place(place)) { +#ifdef PADDLE_WITH_CUDA VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; int device_id = place.GetDeviceId(); int devid_2_index = HeterPs_->get_index_by_devid(device_id); @@ -942,9 +953,63 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len, static_cast(slot_lengths.size()), hidden_size, total_length); +#endif + } else if (platform::is_xpu_place(place)) { +#ifdef PADDLE_WITH_XPU_KP + VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; + int device_id = place.GetDeviceId(); + int devid_2_index = HeterPs_->get_index_by_devid(device_id); + LoDTensor& total_keys_tensor = keys_tensor[devid_2_index]; + uint64_t* total_keys = reinterpret_cast( + total_keys_tensor.mutable_data({total_length, 1}, place)); + + // construct slot_level lod info + auto slot_lengths_lod = slot_lengths; + for (size_t i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + + uint64_t* buf_key = nullptr; + int64_t* buf_length = nullptr; + PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&buf_key), + keys.size() * sizeof(uint64_t*)), + XPU_SUCCESS, platform::errors::ResourceExhausted( + "XPU has no enough memory")); + PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&buf_length), + slot_lengths.size() * sizeof(int64_t)), + XPU_SUCCESS, platform::errors::ResourceExhausted( + "XPU has no enough memory")); + + uint64_t** xpu_keys = reinterpret_cast(&buf_key); + int64_t* xpu_len = 
reinterpret_cast(buf_length); + PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(xpu_keys, keys.data(), + keys.size() * sizeof(uint64_t*), + XPU_HOST_TO_DEVICE)); + PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(xpu_len, slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), + XPU_HOST_TO_DEVICE)); + + this->CopyKeys(place, xpu_keys, total_keys, xpu_len, + static_cast(slot_lengths.size()), + static_cast(total_length)); + VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index + << " len: " << total_length; + pull_gpups_timer.Start(); + HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, + static_cast(total_length)); + // PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( + // "PullSparseGPU failed in GPUPS.")); + pull_gpups_timer.Pause(); + + VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length + << "]"; + this->CopyForPull(place, xpu_keys, values, total_values_gpu, xpu_len, + static_cast(slot_lengths.size()), hidden_size, + total_length); +#endif } else { PADDLE_THROW(platform::errors::PreconditionNotMet( - "GpuPs: PullSparse Only Support CUDAPlace Now.")); + "GpuPs/XpuPs: PullSparse Only Support CUDAPlace or XPUPlace Now.")); } all_timer.Pause(); VLOG(3) << "GpuPs PullSparse total costs: " << all_timer.ElapsedSec() @@ -959,15 +1024,23 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, const std::vector& grad_values, const std::vector& slot_lengths, const int hidden_size, const int batch_size) { - VLOG(3) << "Begin GPUPS PushSparseGrad"; platform::Timer all_timer; platform::Timer push_gpups_timer; all_timer.Start(); int64_t total_length = std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); +#ifdef PADDLE_WITH_CUDA + VLOG(3) << "Begin GPUPS PushSparseGrad"; auto buf = memory::Alloc(place, total_length * sizeof(FeaturePushValue)); FeaturePushValue* total_grad_values_gpu = reinterpret_cast(buf->ptr()); +#endif +#ifdef PADDLE_WITH_XPU_KP + VLOG(3) << "Begine Xpu Ps PushSparseGrad"; + FeaturePushValue* total_grad_values_gpu = nullptr; + xpu_malloc(reinterpret_cast(&total_grad_values_gpu), + total_length * sizeof(FeaturePushValue)); +#endif if (platform::is_cpu_place(place)) { PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in GPUPS now.")); @@ -987,6 +1060,22 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu, static_cast(total_length)); push_gpups_timer.Pause(); + } else if (platform::is_xpu_place(place)) { + int device_id = place.GetDeviceId(); + int devid_2_index = HeterPs_->get_index_by_devid(device_id); + LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index]; + uint64_t* total_keys = + reinterpret_cast(cached_total_keys_tensor.data()); + VLOG(3) << "Begin copy grad tensor to xpups struct"; + this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, + hidden_size, total_length, batch_size); + + VLOG(3) << "Begin call PushSparseXPU in XPUPS, dev: " << devid_2_index + << " len: " << total_length; + push_gpups_timer.Start(); + HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu, + static_cast(total_length)); + push_gpups_timer.Pause(); } else { PADDLE_THROW(platform::errors::PreconditionNotMet( "GPUPS: PushSparseGrad Only Support CUDAPlace Now.")); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 6a78a617b1fef..cf7d98db27e84 100644 --- 
a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -105,6 +105,8 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, int64_t* len, } } +PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } + void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, const std::vector& values, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h old mode 100755 new mode 100644 index c5f674d8b47eb..c38b819822c28 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -30,16 +30,22 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif #include "paddle/fluid/distributed/ps/thirdparty/round_robin.h" -#include "paddle/fluid/framework/data_set.h" +#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/fleet/heter_context.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/heter_util.h" +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/framework/fleet/heter_ps/mem_pool.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/dynload/nccl.h" +#endif +#ifdef PADDLE_WITH_XPU_KP +#include "paddle/fluid/platform/device/xpu/enforce_xpu.h" +#endif #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_PSCORE @@ -55,6 +61,8 @@ namespace framework { #define TYPEALIGN(ALIGNVAL, LEN) \ (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) +class Dataset; + #ifdef PADDLE_WITH_PSLIB class AfsWrapper { public: @@ -82,7 +90,7 @@ class AfsWrapper { class PSGPUWrapper { public: - virtual ~PSGPUWrapper() { delete HeterPs_; } + virtual ~PSGPUWrapper(); PSGPUWrapper() { HeterPs_ = NULL; @@ -160,6 +168,7 @@ class PSGPUWrapper { PADDLE_THROW( platform::errors::Unavailable("heter ps need compile with GLOO")); #endif +#ifdef PADDLE_WITH_CUDA if (multi_node_) { int dev_size = dev_ids.size(); // init inner comm @@ -195,6 +204,7 @@ class PSGPUWrapper { platform::errors::Unavailable("heter ps need compile with GLOO")); #endif } +#endif heter_devices_ = dev_ids; data_ready_channel_->Open(); data_ready_channel_->SetCapacity(3); @@ -262,7 +272,11 @@ class PSGPUWrapper { ? 
1.0 : config["mf_max_bound"]; for (size_t i = 0; i < heter_devices_.size(); i++) { +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(heter_devices_[i])); +#elif defined(PADDLE_WITH_XPU_KP) + PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(heter_devices_[i])); +#endif this->SetSparseSGD(nonclk_coeff, clk_coeff, min_bound, max_bound, learning_rate, initial_g2sum, initial_range); this->SetEmbedxSGD(mf_create_thresholds, mf_learning_rate, @@ -270,6 +284,7 @@ class PSGPUWrapper { mf_max_bound); } } + void SetDate(int year, int month, int day) { year_ = year; month_ = month; @@ -297,6 +312,7 @@ class PSGPUWrapper { slot_offset_vector_ = slot_offset_vector; } +#ifdef PADDLE_WITH_CUDA void SetSlotDimVector(const std::vector& slot_mf_dim_vector) { slot_mf_dim_vector_ = slot_mf_dim_vector; assert(slot_mf_dim_vector_.size() == slot_vector_.size()); @@ -330,6 +346,7 @@ class PSGPUWrapper { grad_type_size_ = TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); } +#endif void ShowOneTable(int index) { HeterPs_->show_one_table(index); } @@ -371,9 +388,11 @@ class PSGPUWrapper { int multi_node_{0}; int node_size_; uint64_t table_id_; +#ifdef PADDLE_WITH_CUDA std::vector inner_comms_; std::vector inter_comms_; std::vector inter_ncclids_; +#endif std::vector heter_devices_; std::unordered_set gpu_ps_config_keys_; HeterObjectPool gpu_task_pool_; @@ -388,9 +407,11 @@ class PSGPUWrapper { int day_; int use_afs_api_ = 0; +#ifdef PADDLE_WITH_CUDA std::vector mem_pools_; std::vector hbm_pools_; // in multi mfdim, one table need hbm // pools of totol dims number +#endif std::shared_ptr< paddle::framework::ChannelObject>> diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps new file mode 100644 index 0000000000000..6d69ae0136d68 --- /dev/null +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps @@ -0,0 +1,339 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_HETERPS +#include // NOLINT +#include +#include +#include +#include +#include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" +#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "xpu/kernel/cluster_header.h" // NOLINT +#include "xpu/kernel/debug.h" // NOLINT +#include "xpu/kernel/math.h" // NOLINT +#include "xpu/kernel/simd.h" + +namespace paddle { +namespace framework { + +__global__ void PullCopy(float** dest, const FeatureValue* src, + const long long* len, int hidden, int slot_num, + int total_len, unsigned long long** keys) { + int cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + __local__ int64_t local_len[slot_num]; + GM2LM(len, local_len, slot_num * sizeof(int64_t)); + + for (int i = thread_id; i < slot_num; i += nthreads) { + // max core local memory = 8KB + // slot's max memory size = slot_len * sizeof(FeatureValue) + int slot_len = i ? local_len[i] - local_len[i - 1] : local_len[0]; + int read_len = min(roundup_div(1024 * 8, sizeof(FeatureValue)), slot_len); + int dest_len = i ? local_len[i - 1] : 0; + __local__ FeatureValue local_slot_vals[read_len]; + __local__ float local_dest_vals[read_len * hidden]; + __local__ uint64_t local_slot_keys[read_len]; + + // copy read_len (length) of slots' val to LM + for (int k = 0; k < slot_len; k += read_len) { + int real_read_len = min(read_len, slot_len - k); + GM2LM(src + dest_len + k, local_slot_vals, + real_read_len * sizeof(FeatureValue)); + GM2LM(keys[i] + k, local_slot_keys, real_read_len * sizeof(uint64_t)); + for (int j = 0; j < real_read_len; j++) { + if (local_slot_keys[j] == 0) { + local_dest_vals[j * hidden] = 0; + local_dest_vals[j * hidden + 1] = 0; + local_dest_vals[j * hidden + 2] = 0; + } else { + local_dest_vals[j * hidden] = local_slot_vals[j].show; + local_dest_vals[j * hidden + 1] = local_slot_vals[j].clk; + local_dest_vals[j * hidden + 2] = local_slot_vals[j].lr; + } + + if (local_slot_vals[j].mf_size == 0 || local_slot_keys[j] == 0) { + for (int m = 0; m < hidden - 3; m++) { + local_dest_vals[j * hidden + 3 + m] = 0; + } + } else { + for (int m = 0; m < hidden - 3; m++) { + local_dest_vals[j * hidden + 3 + m] = local_slot_vals[j].mf[1 + m]; + } + } + } + LM2GM(local_dest_vals, dest[i] + k * hidden, + real_read_len * hidden * sizeof(float)); + } + } +} + +__global__ void CopyKeysKernel(unsigned long long** src_keys, + unsigned long long* dest_total_keys, + const long long* len, int slot_num, + int total_len) { + int cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + __local__ int64_t local_len[slot_num]; + GM2LM(len, local_len, slot_num * sizeof(int64_t)); + + for (int i = thread_id; i < slot_num; i += nthreads) { + // max core local memory = 8KB + int slot_len = i ? local_len[i] - local_len[i - 1] : local_len[0]; + int read_len = min(slot_len, 1024); + int dest_len = i ? 
local_len[i - 1] : 0; + __local__ uint64_t local_slot_keys[read_len]; + + for (int k = 0; k < slot_len; k += read_len) { + int real_read_len = min(read_len, slot_len - k); + GM2LM(src_keys[i] + k, local_slot_keys, real_read_len * sizeof(uint64_t)); + LM2GM(local_slot_keys, dest_total_keys + dest_len + k, + real_read_len * sizeof(uint64_t)); + } + } +} + +__global__ void PushCopy(FeaturePushValue* dest, float** src, long long* len, + int hidden, int slot_num, int total_len, int bs, + int* slot_vector) { + int cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + __local__ int64_t local_len[slot_num]; + __local__ int local_slot[slot_num]; + GM2LM(len, local_len, slot_num * sizeof(int64_t)); + GM2LM(slot_vector, local_slot, slot_num * sizeof(int)); + + for (int i = thread_id; i < slot_num; i += nthreads) { + int slot_len = i ? local_len[i] - local_len[i - 1] : local_len[0]; + + // max core local memory = 8KB + // slot's max memory size = slot_len * hidden * 8 + int read_len = min(roundup_div(1024, hidden), slot_len); + int dest_len = i ? local_len[i - 1] : 0; + __local__ float local_slot_grads[read_len * hidden]; + __local__ FeaturePushValue local_dest_grads[read_len]; + + // copy read_len(length) of slots' grad to LM + for (int k = 0; k < slot_len; k += read_len) { + int real_read_len = min(read_len, slot_len - k); + GM2LM(src[i] + k * hidden, local_slot_grads, + real_read_len * hidden * sizeof(float)); + // copy from slots' grad to total grad + for (int j = 0; j < real_read_len; j++) { + local_dest_grads[j].slot = local_slot[i]; + local_dest_grads[j].show = local_slot_grads[j * hidden]; + local_dest_grads[j].clk = local_slot_grads[j * hidden + 1]; + local_dest_grads[j].lr_g = local_slot_grads[j * hidden + 2] * -1. * bs; + for (int m = 0; m < hidden - 3; m++) { + local_dest_grads[j].mf_g[m] = + local_slot_grads[j * hidden + 3 + m] * -1. 
* bs; + } + } + LM2GM(local_dest_grads, dest + dest_len + k, + real_read_len * sizeof(FeaturePushValue)); + } + } +} + +PSGPUWrapper::~PSGPUWrapper() { + delete HeterPs_; + xpu_free((void*)optimizer_config::nonclk_coeff); + xpu_free((void*)optimizer_config::clk_coeff); + xpu_free((void*)optimizer_config::min_bound); + xpu_free((void*)optimizer_config::max_bound); + xpu_free((void*)optimizer_config::learning_rate); + xpu_free((void*)optimizer_config::initial_g2sum); + xpu_free((void*)optimizer_config::initial_range); + + xpu_free((void*)optimizer_config::mf_create_thresholds); + xpu_free((void*)optimizer_config::mf_learning_rate); + xpu_free((void*)optimizer_config::mf_initial_g2sum); + xpu_free((void*)optimizer_config::mf_initial_range); + xpu_free((void*)optimizer_config::mf_min_bound); + xpu_free((void*)optimizer_config::mf_max_bound); +} + +void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const FeatureValue* total_values_gpu, + const int64_t* gpu_len, const int slot_num, + const int hidden_size, + const int64_t total_length) { + XPUStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx) + ->x_context() + ->xpu_stream; + float* buf_value = nullptr; + xpu_malloc(reinterpret_cast(&buf_value), + values.size() * sizeof(float*)); + float** gpu_values = reinterpret_cast(&buf_value); + xpu_memcpy(gpu_values, values.data(), values.size() * sizeof(float*), + XPU_HOST_TO_DEVICE); + + unsigned long long** c_keys = (unsigned long long**)gpu_keys; + const long long* c_len = (const long long*)gpu_len; + PullCopy<<<2, 64, stream>>>(gpu_values, total_values_gpu, c_len, hidden_size, + slot_num, total_length, c_keys); + + xpu_wait(stream); +} + +void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, + uint64_t** origin_keys, uint64_t* total_keys, + const int64_t* gpu_len, int slot_num, + int total_len) { + XPUStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx) + ->x_context() + ->xpu_stream; + unsigned long long** o_keys = (unsigned long long**)origin_keys; + unsigned long long* t_keys = (unsigned long long*)total_keys; + const long long* c_len = (const long long*)gpu_len; + CopyKeysKernel<<<2, 64, stream>>>(o_keys, t_keys, c_len, slot_num, total_len); + xpu_wait(stream); +} + +void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + FeaturePushValue* total_grad_values_gpu, + const std::vector& slot_lengths, + const int hidden_size, + const int64_t total_length, + const int batch_size) { + XPUStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx) + ->x_context() + ->xpu_stream; + auto slot_lengths_lod = slot_lengths; + for (size_t i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + + float* buf_grad_value = nullptr; + int64_t* buf_length = nullptr; + int* buf_slot_vector = nullptr; + + xpu_malloc(reinterpret_cast(&buf_grad_value), + grad_values.size() * sizeof(float*)); + xpu_malloc(reinterpret_cast(&buf_length), + slot_lengths.size() * sizeof(int64_t)); + xpu_malloc(reinterpret_cast(&buf_slot_vector), + slot_lengths_lod.size() * sizeof(int)); + + float** gpu_values = reinterpret_cast(&buf_grad_value); + int64_t* gpu_len = reinterpret_cast(buf_length); + int* d_slot_vector = reinterpret_cast(buf_slot_vector); + 
xpu_memcpy(gpu_values, grad_values.data(), + grad_values.size() * sizeof(float*), XPU_HOST_TO_DEVICE); + xpu_memcpy(gpu_len, slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), XPU_HOST_TO_DEVICE); + xpu_memcpy(d_slot_vector, slot_vector_.data(), + slot_lengths_lod.size() * sizeof(int), XPU_HOST_TO_DEVICE); + + long long* c_len = (long long*)gpu_len; + PushCopy<<<2, 64, stream>>>(total_grad_values_gpu, gpu_values, c_len, + hidden_size, slot_lengths.size(), total_length, + batch_size, d_slot_vector); + xpu_wait(stream); +} + +void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, + float min_bound, float max_bound, + float learning_rate, float initial_g2sum, + float initial_range) { + xpu_malloc(reinterpret_cast(&optimizer_config::nonclk_coeff), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::clk_coeff), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::min_bound), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::max_bound), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::learning_rate), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::initial_g2sum), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::initial_range), + sizeof(float)); + + xpu_memcpy((void*)optimizer_config::nonclk_coeff, &nonclk_coeff, + sizeof(float), XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::clk_coeff, &clk_coeff, sizeof(float), + XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::min_bound, &min_bound, sizeof(float), + XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::max_bound, &max_bound, sizeof(float), + XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::learning_rate, &learning_rate, + sizeof(float), XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::initial_g2sum, &initial_g2sum, + sizeof(float), XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::initial_range, &initial_range, + sizeof(float), XPU_HOST_TO_DEVICE); +} + +void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, + float mf_learning_rate, float mf_initial_g2sum, + float mf_initial_range, float mf_min_bound, + float mf_max_bound) { + xpu_malloc(reinterpret_cast(&optimizer_config::mf_create_thresholds), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::mf_learning_rate), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::mf_initial_g2sum), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::mf_initial_range), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::mf_min_bound), + sizeof(float)); + xpu_malloc(reinterpret_cast(&optimizer_config::mf_max_bound), + sizeof(float)); + + xpu_memcpy((void*)optimizer_config::mf_create_thresholds, + &mf_create_thresholds, sizeof(float), XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::mf_initial_g2sum, &mf_initial_g2sum, + sizeof(float), XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::mf_initial_range, &mf_initial_range, + sizeof(float), XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::mf_min_bound, &mf_min_bound, + sizeof(float), XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::mf_max_bound, &mf_max_bound, + sizeof(float), XPU_HOST_TO_DEVICE); + xpu_memcpy((void*)optimizer_config::mf_learning_rate, &mf_learning_rate, + sizeof(float), XPU_HOST_TO_DEVICE); +} + +} // end namespace framework +} // end namespace paddle +#endif diff --git a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc 
b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc index 7846016d7e7b2..2625cb48174b8 100644 --- a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc +++ b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc @@ -67,6 +67,7 @@ AdaptivePool2dConvertGlobalPass::AdaptivePool2dConvertGlobalPass() { void AdaptivePool2dConvertGlobalPass::ApplyImpl(ir::Graph* graph) const { std::string name_scope = "adaptive_pool2d_convert_global_pass"; + FusePassBase::Init(name_scope, graph); int num = 0; for (const Node* n : graph->Nodes()) { @@ -77,13 +78,13 @@ void AdaptivePool2dConvertGlobalPass::ApplyImpl(ir::Graph* graph) const { if (op->HasAttr("global_pooling")) { bool global_pooling = BOOST_GET_CONST(bool, op->GetAttr("global_pooling")); - if (global_pooling) return; + if (global_pooling) continue; } - if (!op->HasAttr("pooling_type")) return; + if (!op->HasAttr("pooling_type")) continue; std::string type = BOOST_GET_CONST(std::string, op->GetAttr("pooling_type")); // adaptive has no effect on max pooling - if (type == "max") return; + if (type == "max") continue; bool adaptive = BOOST_GET_CONST(bool, op->GetAttr("adaptive")); std::vector ksize = BOOST_GET_CONST(std::vector, op->GetAttr("ksize")); diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 20a6e53479323..a4fcf0773f623 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -121,6 +121,9 @@ paddle::framework::FetchList InterpreterCore::Run( Prepare(feed_names, feed_tensors, is_build); if (is_build) { + // add listener before run and is_build=true + global_scope_->ResetListener(); + ExecuteInstructionList(vec_instruction_); } @@ -128,6 +131,9 @@ paddle::framework::FetchList InterpreterCore::Run( ClearLoDTensorArrayInLocalScope(); } + // clear the listener after run + global_scope_->ClearListener(); + // return Fetch Tensors auto* fetch_var = global_scope_->Var(interpreter::kFetchVarName); return std::move(*fetch_var->GetMutable()); @@ -162,6 +168,9 @@ paddle::framework::FetchList InterpreterCore::Run( Convert(&op_func_nodes); } else { + // add listener before run and is_build=true + global_scope_->ResetListener(); + ExecuteInstructionList(vec_instruction_); } @@ -169,6 +178,9 @@ paddle::framework::FetchList InterpreterCore::Run( ClearLoDTensorArrayInLocalScope(); } + // clear the listener after run + global_scope_->ClearListener(); + // return Fetch Tensors auto* fetch_var = global_scope_->Var(interpreter::kFetchVarName); return std::move(*fetch_var->GetMutable()); @@ -192,7 +204,8 @@ void InterpreterCore::BuildOperatorDependences() { // Schedule auto op_nums = vec_instruction_.size(); dependecy_count_.resize(op_nums); - auto op2downstream = interpreter::build_op_downstream_map(vec_instruction_); + auto op2downstream = interpreter::build_op_downstream_map( + vec_instruction_, &op_happens_before_); for (size_t op = 0; op < vec_instruction_.size(); ++op) { auto op_list = op2downstream[op]; std::vector downsteam_vector(op_list.begin(), op_list.end()); @@ -213,18 +226,21 @@ void InterpreterCore::Convert( auto op_nums = nodes.size(); vec_instruction_.reserve(op_nums); - for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { auto& op_func_node = nodes[op_idx]; auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node); - vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); - auto& instr = vec_instruction_.back(); + } + + 
BuildOperatorDependences(); + // calculate last_live_ops_ + for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { + auto& instr = vec_instruction_[op_idx]; OpInOutInfo info; - std::vector gc_check_input_list; + std::set gc_check_inputs; - for (auto& item : op_func_node.input_index) { + for (auto& item : instr.Inputs()) { for (auto id : item.second) { if (id == kEmptyVarIndex) { continue; @@ -232,38 +248,24 @@ void InterpreterCore::Convert( input_var2op_info_.at(id).push_back(op_idx); // var can be gc-ed if (!info.IsBuilt()) { - info.Build(op_func_node.operator_base_.get()); + info.Build(instr.OpBase()); } auto* var_desc = global_scope_->VarDesc(id); if (var_desc) { if (info.IsInArgBufferNeeded(var_desc->Name())) { - gc_check_input_list.push_back(id); + gc_check_inputs.insert(id); } } else { - gc_check_input_list.push_back(id); + gc_check_inputs.insert(id); } } } - std::sort(gc_check_input_list.begin(), gc_check_input_list.end()); - auto last = - std::unique(gc_check_input_list.begin(), gc_check_input_list.end()); - gc_check_input_list.erase(last, gc_check_input_list.end()); - for (auto var_id : gc_check_input_list) { + for (auto var_id : gc_check_inputs) { paddle::framework::Variable* var = global_scope_->Var(var_id); if (var->IsType() || var->IsType() || var->IsType()) { - vec_meta_info[var_id].var_ref_count_++; - // TODO(zhiqiu): not all var needs to be checked, var need to be checked - // only - // after the last_live_op. For example, - // b = op1(a) - // c = op2(a, b) - // in this case, a is the input of op1 and op2, we only need to check - // a after op2, because op2 always uses a after op1. - instr.AddGCCheckVar(var_id); - VLOG(4) << "clear " << global_scope_->GetNameById(var_id) << " after " - << instr.OpBase()->Type(); + last_live_ops_[var_id].insert(op_idx); } else { VLOG(4) << "not clear " << global_scope_->GetNameById(var_id) << " after " << instr.OpBase()->Type() @@ -276,19 +278,45 @@ void InterpreterCore::Convert( for (size_t i = 0; i < vec_instruction_.size(); ++i) { // checkout ouput for (auto& item : vec_instruction_[i].Outputs()) { - for (auto id : item.second) { - if (input_var2op_info_.at(id).size() == 0) { - // output var not be used by any kernel - vec_instruction_[i].AddGCCheckVar(id); - VLOG(4) << "clear " << global_scope_->GetNameById(id) << " after " - << vec_instruction_[i].OpBase()->Type(); - vec_meta_info[id].var_ref_count_++; + for (auto var_id : item.second) { + if (input_var2op_info_.at(var_id).size() == 0) { + last_live_ops_[var_id].insert(i); } } } } - BuildOperatorDependences(); + // shrink, find the downstream op that has no other op in the + // downstream list happens before it + // For example, + // b = op1(a) + // c = op2(a, b) + // in this case, a is the input of op1 and op2, we only need to check + // a after op2, because op2 always uses a after op1. 
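The shrink loop that follows is easier to see in isolation: for each variable, keep only the accessing ops that no other accessor is ordered after; those are the ops after which the variable can be garbage-collected. A minimal standalone sketch of that pruning, using hypothetical names rather than the interpreter's actual members:

```cpp
#include <cstddef>
#include <set>
#include <vector>

// Illustrative sketch: keep only the accessors of a variable that are not
// followed (in the happens-before order) by another accessor of the same
// variable. Those are the ops after which the variable may be GC-checked.
std::set<size_t> ShrinkLastLiveOps(
    const std::set<size_t>& accessors,
    const std::vector<std::vector<bool>>& happens_before) {
  std::set<size_t> minimum;
  for (size_t op : accessors) {
    bool is_last = true;
    for (size_t other : accessors) {
      // If `op` happens before another accessor, that accessor will touch the
      // variable later, so `op` cannot be the last user.
      if (happens_before[op][other]) {
        is_last = false;
        break;
      }
    }
    if (is_last) minimum.insert(op);
  }
  return minimum;
}

// Example from the comment above: b = op1(a); c = op2(a, b).
// accessors(a) = {0, 1}, happens_before[0][1] == true  =>  result = {1} (op2).
```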
+ for (size_t i = 0; i < last_live_ops_.size(); ++i) { + std::set minumum_last_live_ops; + for (size_t item : last_live_ops_[i]) { + bool not_before_any = true; + // find the op that is not executed before any + for (size_t other_item : last_live_ops_[i]) { + if (op_happens_before_[item][other_item]) { + VLOG(8) << "happens_before: " << item << "->" << other_item + << ", so skip " << item; + not_before_any = false; + break; + } + } + if (not_before_any) { + VLOG(8) << "last live op of var " << i << " " + << global_scope_->GetNameById(i) << " : " << item << " " + << vec_instruction_[item].OpBase()->Type(); + minumum_last_live_ops.insert(item); + vec_instruction_[item].AddGCCheckVar(i); + } + } + last_live_ops_[i] = minumum_last_live_ops; + vec_meta_info[i].var_ref_count_ = last_live_ops_[i].size(); + } for (size_t i = 0; i < vec_instruction_.size(); ++i) { BuildAndCacheInstructionCtx(&vec_instruction_[i]); @@ -400,8 +428,17 @@ void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* instr_node) { } outs_map.emplace(var_name_item.first, std::move(out_vars)); } + // set runtime_ctx and infershape_ctx_ - instr_node->ResetContext(ins_map, outs_map); + if (instr_node->OpBase()->Type() == "cinn_launch") { // OP use scope in + // kernel + Scope* local_scope = create_local_scope_ + ? global_scope_->GetMutableLocalScope() + : global_scope_->GetMutableScope(); + instr_node->ResetContextWithScope(ins_map, outs_map, *local_scope); + } else { + instr_node->ResetContext(ins_map, outs_map); + } } void InterpreterCore::BuildSkipShareLoDInfo() { diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index c1ade85e1384c..3af0ddb675a45 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -109,6 +109,11 @@ class InterpreterCore { std::vector vec_instruction_; // deconstruct before OpFuncNode + // op_happens_before_[i][j] == true means op[i] happens before op[j] + std::vector> op_happens_before_; + // last_live_ops_[i] contains the id of operatos that last access var[i] + std::map> last_live_ops_; + std::vector dependecy_count_; std::atomic unfinished_op_numer_{0}; std::vector> input_var2op_info_; diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 63fcf0cffaa84..afddcb580b9d8 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -172,6 +172,8 @@ void build_variable_scope(const framework::BlockDesc& block, auto* ptr = inner_scope->Var(var_name); VLOG(3) << "Initialize Variable " << var_name; + // NOTE(zhiqiu): if var exists in scope and the type is right, + // InitializeVariable will not create a new variable. InitializeVariable(ptr, var_desc->GetType()); VLOG(3) << "Create Variable " << var_name << " global, which pointer is " << ptr << " type is " << static_cast(var_desc->GetType()); @@ -390,8 +392,19 @@ void build_op_func_list(const platform::Place& place, platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); Scope scope; + Scope* runtime_scope = &scope; + // NOTE(Ruibiao): We do not encourage directly using scope in OP kernel. + // But some OPs do have such behavior (e.g., cinn_launch OP). Here special + // treatment for them. 
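The special case that follows amounts to choosing which Scope backs the ExecutionContext: a throwaway temporary scope for ordinary kernels, and the real local scope for the few ops whose kernels read the scope directly (here only cinn_launch). A hedged sketch of that selection, with a hypothetical whitelist standing in for the single string comparison used in the patch:

```cpp
#include <string>
#include <unordered_set>

// Hypothetical whitelist of op types whose kernels read the Scope directly;
// the patch above only special-cases "cinn_launch".
static const std::unordered_set<std::string> kScopeUsingOps = {"cinn_launch"};

// Returns the scope that should back the ExecutionContext: the temporary
// `dummy_scope` for normal kernels, or `local_scope` when the kernel is known
// to depend on the real scope.
template <typename ScopeT>
ScopeT* SelectRuntimeScope(const std::string& op_type, ScopeT* dummy_scope,
                           ScopeT* local_scope) {
  return kScopeUsingOps.count(op_type) ? local_scope : dummy_scope;
}
```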
+ if (op_with_kernel->Type() == "cinn_launch") { + VLOG(6) << "OP(" << op_with_kernel->Type() << ") use scope in kernel, " + "so pass a real scope to " + "ExecutionContext"; + runtime_scope = local_scope; + } + auto expected_kernel_key = op_with_kernel->GetExpectedKernelType( - ExecutionContext(*op, scope, *dev_ctx, runtime_context)); + ExecutionContext(*op, *runtime_scope, *dev_ctx, runtime_context)); op_with_kernel->ResetKernelType(new OpKernelType(expected_kernel_key)); // change device by the device_guard() @@ -439,8 +452,8 @@ void build_op_func_list(const platform::Place& place, op_with_kernel->Info().infer_shape_(&infer_shape_ctx); } - auto exec_ctx = - ExecutionContext(*op_with_kernel, scope, *dev_ctx, runtime_context); + auto exec_ctx = ExecutionContext(*op_with_kernel, *runtime_scope, + *dev_ctx, runtime_context); auto run_phi_kernel = false; if (phi::KernelFactory::Instance().HasCompatiblePhiKernel( @@ -614,23 +627,125 @@ void update_var_min_rw_op(const std::map>& op2dependences, } std::map> get_downstream_map( - const std::map>& op2dependences) { - // op2dependences is op -> it's dependences. we want to get op -> [ops] map, + const std::map>& op2dependences, + std::vector>* op_happens_before) { + // step1: convert op2dependences to downstream_map directly + // op2dependences is op -> it's dependences. + // we want to get op -> [next ops] map, // where ops is the next instruction of op. - std::map> result; + std::map> downstream; for (auto& item : op2dependences) { int op = item.first; for (auto dep_op : item.second) { - if (result.find(dep_op) == result.end()) - result[dep_op] = std::list(); - result[dep_op].push_back(op); + if (downstream.find(dep_op) == downstream.end()) + downstream[dep_op] = std::list(); + downstream[dep_op].push_back(op); + } + } + + auto downstream_map_to_str = [&]() -> std::string { + std::ostringstream oss; + for (auto pair : downstream) { + oss << pair.first << " -> "; + std::copy(pair.second.begin(), pair.second.end(), + std::ostream_iterator(oss, " ")); + oss << std::endl; + } + return oss.str(); + }; + + auto downstream_map_count = [&]() -> size_t { + size_t count = 0; + for (auto pair : downstream) { + count += pair.second.size(); + } + return count; + }; + + VLOG(6) << "downstream count: " << downstream_map_count(); + VLOG(6) << "downstream_map: " << std::endl << downstream_map_to_str(); + + // step2: remove unneccessary downstream ops + // for example, a->b->c + // a: b, c + // b: c + // => + // a: b + // b: c + + // NOTE(zhiqiu): the size of downstream != size of op2dependences + // since there are some ops that have no downstream-op. 
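The happens-before matrix built next is plain transitive reachability over the downstream map, computed with one BFS per op plus a check that no edge closes a cycle. A self-contained version of the same idea (assuming ops are densely indexed 0..op_num-1 and the graph is expected to be a DAG):

```cpp
#include <cstddef>
#include <list>
#include <map>
#include <queue>
#include <stdexcept>
#include <vector>

// Illustrative sketch: happens_before[i][j] == true iff op j is reachable from
// op i through the downstream (op -> next ops) map, i.e. i must run before j.
std::vector<std::vector<bool>> BuildHappensBefore(
    const std::map<size_t, std::list<size_t>>& downstream, size_t op_num) {
  std::vector<std::vector<bool>> happens_before(
      op_num, std::vector<bool>(op_num, false));
  for (size_t start = 0; start < op_num; ++start) {
    std::vector<bool> visited(op_num, false);
    std::queue<size_t> q;
    q.push(start);
    visited[start] = true;
    while (!q.empty()) {
      size_t op = q.front();
      q.pop();
      auto it = downstream.find(op);
      if (it == downstream.end()) continue;
      for (size_t next : it->second) {
        if (visited[next]) continue;
        // If `next` is already ordered before `start`, this edge closes a cycle.
        if (happens_before[next][start]) {
          throw std::logic_error("dependency graph contains a cycle");
        }
        happens_before[start][next] = true;
        visited[next] = true;
        q.push(next);
      }
    }
  }
  return happens_before;
}
```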
+ auto op_num = op2dependences.size(); + // happens_before[i][j] means i should be executed before j + op_happens_before->resize(op_num); + for (size_t i = 0; i < op_num; ++i) { + (*op_happens_before)[i].resize(op_num); + std::fill((*op_happens_before)[i].begin(), (*op_happens_before)[i].end(), + false); + } + + // bfs to get all next ops + auto bfs = [&](size_t op_idx) { + std::queue q; + std::vector visited(op_num, false); + q.push(op_idx); + while (!q.empty()) { + size_t op = q.front(); + q.pop(); + visited[op] = true; + if (!downstream.count(op)) { + continue; + } + for (auto next : downstream[op]) { + if (!visited[next]) { + PADDLE_ENFORCE_EQ((*op_happens_before)[next][op_idx], false, + paddle::platform::errors::AlreadyExists( + "There exists circle in graph, expected " + "%d->%d, but already got %d->%d", + op_idx, next, next, op_idx)); + (*op_happens_before)[op_idx][next] = true; + VLOG(8) << "happens before: " << op_idx << " " << next; + q.push(next); + } + } + } + }; + + for (size_t i = 0; i < op_num; ++i) { + bfs(i); + } + + // shrink, find the downstream op that has no other op in the + // downstream list happens before it + for (size_t i = 0; i < op_num; ++i) { + std::list minumum_nexts; + for (size_t item : downstream[i]) { + bool not_after_any = true; + // find the op that is not executed after any + for (size_t other_item : downstream[i]) { + if ((*op_happens_before)[other_item][item]) { + VLOG(8) << "happens_before: " << other_item << "->" << item + << ", so skip " << item; + not_after_any = false; + break; + } + } + if (not_after_any) { + VLOG(8) << "downstream op of " << i << ": " << item; + minumum_nexts.push_back(item); + } } + downstream[i] = minumum_nexts; } - return std::move(result); + VLOG(6) << "downstream count: " << downstream_map_count(); + VLOG(6) << "downstream_map: " << std::endl << downstream_map_to_str(); + + return std::move(downstream); } std::map> build_op_downstream_map( - const std::vector& vec_instruction) { + const std::vector& vec_instruction, + std::vector>* op_happens_before) { auto var2min_rw_op = std::map< int, std::list>(); // # map from variable id to read / write op id. 
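This map, together with the var2recent_write_op map declared just below, drives a conventional read/write dependence analysis: a reader must wait for the most recent writer of each input (RAW), and a writer must wait for the previous writer and for every reader since that write (WAW/WAR). The sketch below is illustrative only and omits the additional special cases the real build_op_downstream_map handles:

```cpp
#include <cstddef>
#include <map>
#include <set>
#include <vector>

struct VarAccessState {
  int recent_writer = -1;          // last op that wrote the variable
  std::set<size_t> readers_since;  // ops that read it since that write
};

// Illustrative sketch: op2dependences[op] collects the ops that `op` must
// wait for, derived from which variable ids it reads and writes.
void RecordOp(size_t op_idx, const std::vector<int>& reads,
              const std::vector<int>& writes,
              std::map<int, VarAccessState>* vars,
              std::map<size_t, std::set<size_t>>* op2dependences) {
  auto& deps = (*op2dependences)[op_idx];
  for (int var : reads) {  // RAW: wait for the last writer
    auto& st = (*vars)[var];
    if (st.recent_writer >= 0) deps.insert(static_cast<size_t>(st.recent_writer));
    st.readers_since.insert(op_idx);
  }
  for (int var : writes) {  // WAW + WAR: wait for the last writer and its readers
    auto& st = (*vars)[var];
    if (st.recent_writer >= 0) deps.insert(static_cast<size_t>(st.recent_writer));
    deps.insert(st.readers_since.begin(), st.readers_since.end());
    st.recent_writer = static_cast<int>(op_idx);
    st.readers_since.clear();
  }
  deps.erase(op_idx);  // an op never depends on itself
}
```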
auto var2recent_write_op = @@ -873,13 +988,13 @@ std::map> build_op_downstream_map( } } for (auto pair : op2dependences) { - VLOG(10) << pair.first << " Depends on " << pair.second.size(); std::ostringstream oss; + oss << pair.first << " Depends on " << pair.second.size() << " ops: "; std::copy(pair.second.begin(), pair.second.end(), std::ostream_iterator(oss, " ")); VLOG(10) << oss.str(); } - return std::move(get_downstream_map(op2dependences)); + return std::move(get_downstream_map(op2dependences, op_happens_before)); } } // namespace interpreter diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h index 044a9ea368cbc..56683330ee6cb 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.h +++ b/paddle/fluid/framework/new_executor/interpretercore_util.h @@ -116,7 +116,8 @@ void build_op_func_list(const platform::Place& place, VariableScope* var_scope, bool use_local_scope = true); std::map> build_op_downstream_map( - const std::vector& vec_instruction); + const std::vector& vec_instruction, + std::vector>* op_happens_before); void add_fetch(const std::vector& fetch_names, framework::BlockDesc* block); diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index 8f6bac76e2a15..3c2395d4320a1 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -642,6 +642,28 @@ void VariableScope::CheckExist(const std::string& name) const { "%s not in VariableScope.", name)); } +void VariableScope::ClearListener() { + if (scope_ && listener_ && scope_->HasListener(listener_)) { + VLOG(4) << "Clear listener " << listener_ << " for " << scope_; + scope_->DelListener(listener_); + } + if (local_scope_ && listener_ && local_scope_->HasListener(listener_)) { + VLOG(4) << "Clear listener " << listener_ << " for " << local_scope_; + local_scope_->DelListener(listener_); + } +} + +void VariableScope::ResetListener() { + if (scope_ && listener_ && !scope_->HasListener(listener_)) { + VLOG(4) << "Add listener " << listener_ << " for " << scope_; + scope_->AddListener(listener_); + } + if (local_scope_ && listener_ && !local_scope_->HasListener(listener_)) { + VLOG(4) << "Add listener " << listener_ << " for " << local_scope_; + local_scope_->AddListener(listener_); + } +} + VariableScopeListener::VariableScopeListener(VariableScope* var_scope) { var_scope_ = var_scope; } @@ -733,6 +755,16 @@ void Instruction::ResetContext(const VariableValueMap& in_vars, new ExecutionContext(*OpBase(), scope_, dev_ctx_, *runtime_ctx_.get())); } +void Instruction::ResetContextWithScope(const VariableValueMap& in_vars, + const VariableValueMap& out_vars, + const framework::Scope& scope) { + runtime_ctx_.reset(new RuntimeContext(in_vars, out_vars)); + infershape_ctx_.reset( + new InterpretercoreInferShapeContext(*OpBase(), *runtime_ctx_.get())); + execution_ctx_.reset( + new ExecutionContext(*OpBase(), scope, dev_ctx_, *runtime_ctx_.get())); +} + std::shared_ptr Instruction::InnerRuntimeContext() const { return runtime_ctx_; } diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index e257b71742400..28b9f6f0130f5 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -238,6 +238,10 @@ class VariableScope : public ScopeBase { bool 
GetVarSikpInplace(int id) const; + void ClearListener(); + + void ResetListener(); + friend class VariableScopeListener; private: @@ -343,6 +347,10 @@ class Instruction { void ResetContext(const VariableValueMap& in_vars, const VariableValueMap& out_vars); + void ResetContextWithScope(const VariableValueMap& in_vars, + const VariableValueMap& out_vars, + const framework::Scope& scope); + std::shared_ptr InnerRuntimeContext() const; std::shared_ptr InnerInferShapeContext() diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 4d4f7c74cd37e..31315df5701e5 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -25,19 +25,21 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, startup_prog_(startup_prog), main_prog_(main_prog), global_scope_(VariableScope(scope)) { - // NOTE(zhiqiu): it is needed to sync thhe variables in scope to - // variable_scope, - // since the some variable only exists in startup program, e.g, - // lod_tensor_blocking_queue_0 used in dataloader. - // These variables may be created in scope during runing startup program with - // original executor. + // NOTE(zhiqiu): it is needed to sync the variables in scope to + // variable_scope, since the some variable only exists in scope. + // For example, 'lod_tensor_blocking_queue_0' used in dataloader. + // These variables may be created in scope, and it is not existed as + // variable in program. if (scope) { - auto name_list = scope->LocalVarNames(); - for (auto name : name_list) { - VLOG(4) << "Sync Variable from variable scope: " << name; - auto v = scope->Var(name); - if (!global_scope_.HasVar(name)) { - global_scope_.AddVar(name, *v); + const std::string blocking_queue_prefix = "lod_tensor_blocking_queue"; + auto vars = scope->LocalVarNames(); + for (const auto& name : vars) { + if (name.find(blocking_queue_prefix) != std::string::npos) { + if (!global_scope_.HasVar(name)) { + auto* v = scope->Var(name); + VLOG(4) << "Sync Variable from scope to variable scope: " << name; + global_scope_.AddVar(name, *v); + } } } } diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index fe4b47cba6242..e03277fb31799 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -69,12 +69,17 @@ PD_DECLARE_KERNEL(split, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(concat, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(concat_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_raw, KPS, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); +PD_DECLARE_KERNEL(multiply, KPS, ALL_LAYOUT); +PD_DECLARE_KERNEL(multiply_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(divide, KPS, ALL_LAYOUT); +PD_DECLARE_KERNEL(maximum, GPU, ALL_LAYOUT); #ifdef PADDLE_WITH_XPU_KP -PD_DECLARE_KERNEL(add_raw, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(max_raw, GPU, ALL_LAYOUT); #else -PD_DECLARE_KERNEL(add_raw, KPS, ALL_LAYOUT); +PD_DECLARE_KERNEL(max_raw, KPS, ALL_LAYOUT); #endif -PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(mean_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sigmoid, GPU, ALL_LAYOUT); @@ -85,7 +90,6 @@ PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(transpose_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sum, GPU, 
ALL_LAYOUT); PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(max_raw, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sgd, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(slice, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(slice_grad, GPU, ALL_LAYOUT); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 6cde65f6ab580..51dca93c7c7f0 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -21,21 +21,21 @@ #include #include +#include "cinn/auto_schedule/auto_tuner.h" +#include "cinn/auto_schedule/tuning.h" #include "cinn/common/target.h" #include "cinn/common/type.h" -#include "cinn/frontend/decomposer/use_decomposer.h" -#include "cinn/frontend/pass/use_program_pass.h" -#include "cinn/frontend/program_pass.h" +#include "cinn/frontend/optimize.h" #include "cinn/frontend/syntax.h" #include "cinn/hlir/framework/graph.h" #include "cinn/hlir/framework/graph_compiler.h" -#include "cinn/hlir/framework/pass.h" -#include "cinn/hlir/pass/use_pass.h" +#include "gflags/gflags.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" #include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor.h" @@ -45,6 +45,8 @@ #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/core/utils/rw_lock.h" +DECLARE_bool(enable_pe_launch_cinn); +DECLARE_bool(enable_cinn_auto_tune); namespace paddle { namespace framework { namespace paddle2cinn { @@ -52,12 +54,11 @@ namespace paddle2cinn { using ir::Graph; using ir::Node; using inference::analysis::Dot; +using ::cinn::auto_schedule::AutoTuner; using ::cinn::common::Target; -using ::cinn::common::Float; -using ::cinn::hlir::framework::GraphCompiler; +using ::cinn::frontend::Optimize; using ::cinn::hlir::framework::BuildScope; -using ::cinn::frontend::ProgramPass; -using ::cinn::hlir::framework::ApplyPass; +using ::cinn::hlir::framework::GraphCompiler; CinnCompiler* CinnCompiler::GetInstance() { static CinnCompiler instance; @@ -68,7 +69,7 @@ const CinnCompiledObject& CinnCompiler::Compile( const Graph& graph, const std::map& input_tensors, const Target& target, void* stream) { - VLOG(1) << "-- The graph to be compiled is:\n" << VizGraph(graph); + VLOG(4) << "-- The graph to be compiled is:\n" << VizGraph(graph); CinnCacheKeyByAddress cur_key_by_address(graph, input_tensors, target.arch_str()); CinnCacheKeyByStructure cur_key_by_struct; @@ -217,6 +218,33 @@ void CinnCompiler::Clear() { real_compiled_num_.store(0); } +void CinnCompiler::CheckCompiledValid( + const ir::Graph& graph, + const std::map& input_tensors, + const CinnCompiledObject& compiled_obj) const { + const auto& input_var_names = graph.Get>(kInputVars); + const auto& output_var_names = + graph.Get>(kOutputVars); + auto* launch_context = compiled_obj.launch_context.get(); + // 1. check all of the output variables will be assigned by compiled program + for (auto&& var_name : output_var_names) { + PADDLE_ENFORCE_EQ(launch_context->IsVariableUsed(var_name), true, + platform::errors::PreconditionNotMet( + "Variable(%s) not applied in CINN", var_name)); + } + // 2. check all of the used input variables were correctly deduced by CINN. 
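The checks that follow encode a simple contract: every output variable of the subgraph must be assigned by the compiled program, while input variables are verified only if CINN actually kept them. A generic sketch of that contract, using a hypothetical CompiledProgramView interface in place of CinnLaunchContext and an opaque tensor handle:

```cpp
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

// Hypothetical minimal view of a compiled program, standing in for
// CinnLaunchContext in the code below.
struct CompiledProgramView {
  virtual bool IsVariableUsed(const std::string& name) const = 0;
  virtual void CheckTensorEquivalent(const std::string& name,
                                     const void* paddle_tensor) const = 0;
  virtual ~CompiledProgramView() = default;
};

void ValidateCompiledObject(
    const std::vector<std::string>& input_vars,
    const std::vector<std::string>& output_vars,
    const std::map<std::string, const void*>& input_tensors,
    const CompiledProgramView& compiled) {
  // 1. Every declared output must be produced by the compiled program.
  for (const auto& name : output_vars) {
    if (!compiled.IsVariableUsed(name)) {
      throw std::runtime_error("output variable not produced by CINN: " + name);
    }
  }
  // 2. Inputs may legitimately be eliminated by CINN's passes; check only the
  // ones it kept, for consistency with the tensors Paddle will feed.
  for (const auto& name : input_vars) {
    if (!compiled.IsVariableUsed(name)) continue;
    compiled.CheckTensorEquivalent(name, input_tensors.at(name));
  }
}
```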
+ for (const auto& var_name : input_var_names) { + // some input variables were not used by CINN because they were eliminated + // by its optimized passes or some operators of it need less inputs + if (!launch_context->IsVariableUsed(var_name)) { + VLOG(4) << "Input variable" << var_name << " not used by cinn"; + continue; + } + launch_context->CheckTensorEquivalent(var_name, + *input_tensors.at(var_name)); + } +} + std::unique_ptr CinnCompiler::CompileGraph( const ir::Graph& graph, const std::map& input_tensors, @@ -224,36 +252,43 @@ std::unique_ptr CinnCompiler::CompileGraph( CinnGraphSymbolization symbol{compiled_num, graph, target, input_tensors}; auto frontend_program = symbol(); auto fetch_ids = symbol.GetFetchIds(); - ProgramPass::Apply(&frontend_program, fetch_ids, target, {"Decomposer"}); - ::cinn::frontend::ApplyPass(&frontend_program, fetch_ids, "RemoveIdentity"); - ::cinn::frontend::ApplyPass(&frontend_program, fetch_ids, "TransposeFolding"); - ProgramPass::Apply(&frontend_program, fetch_ids, target, {"GemmRewriter"}); + VLOG(4) << "All fetch var ids in CINN: " + << string::join_strings(fetch_ids, ','); - auto cinn_graph = std::make_shared<::cinn::hlir::framework::Graph>( - frontend_program, target); - VLOG(1) << "-- The " << compiled_num << "-th compilation (" + auto cinn_graph = Optimize(&frontend_program, fetch_ids, target); + VLOG(4) << "-- The " << compiled_num << "-th compilation (" << target.arch_str() << "), and its related graph:\n" << cinn_graph->Visualize(); - ApplyPass(cinn_graph.get(), "OpFusion"); - auto scope = BuildScope(target, cinn_graph); - - VLOG(4) << "All fetch var ids in CINN: " - << string::join_strings(fetch_ids, ','); + auto scope = BuildScope(target, cinn_graph); auto graph_compiler = std::make_unique(target, scope, cinn_graph); GraphCompiler::CompileOptions options; options.with_instantiate_variables = false; + if (!FLAGS_enable_pe_launch_cinn) { + options.with_buffer_handle_instruction_inserted = true; + } + std::unique_ptr auto_tuner; + if (FLAGS_enable_cinn_auto_tune) { + VLOG(4) << "Compile with auto-tune"; + auto_tuner = std::make_unique(target, cinn_graph.get()); + auto_tuner->Initialize(AutoTuner::Config(), graph_compiler.get()); + ::cinn::auto_schedule::TuningOptions tuning_options; + tuning_options.num_measure_trials = 0; + auto tuning_result = auto_tuner->Tune(tuning_options); + options.Apply(tuning_result); + } auto compiled_res = graph_compiler->Build(options, std::move(fetch_ids), stream); auto compiled_obj = std::make_unique(); - *compiled_obj = {std::move(graph_compiler), + *compiled_obj = {std::move(graph_compiler), std::move(auto_tuner), std::move(compiled_res.runtime_program), scope, symbol.var_model_to_program_map()}; compiled_obj->cached_index = compiled_num; compiled_obj->launch_context = std::make_unique(graph, *compiled_obj); + CheckCompiledValid(graph, input_tensors, *compiled_obj); return compiled_obj; } diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h index 5fa54b302a36d..7e5df6faf0819 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h @@ -37,6 +37,10 @@ class GraphCompiler; class Program; class Scope; } // namespace hlir::framework + +namespace auto_schedule { +class AutoTuner; +} // namespace auto_schedule } // namespace cinn namespace paddle { @@ -49,6 +53,7 @@ namespace paddle2cinn { struct CinnCompiledObject { std::unique_ptr<::cinn::hlir::framework::GraphCompiler> compiler; + 
std::unique_ptr<::cinn::auto_schedule::AutoTuner> auto_tuner; std::unique_ptr<::cinn::hlir::framework::Program> runtime_program; std::shared_ptr<::cinn::hlir::framework::Scope> scope; std::unordered_map paddle2cinn_varmap; @@ -103,6 +108,13 @@ class CinnCompiler { const ::cinn::common::Target& target, std::int64_t compiled_num, void* stream = nullptr) const; + // check whether a compiled result is valid by comparing + // the consistency of external variables of the subgraph + void CheckCompiledValid( + const ir::Graph& graph, + const std::map& input_tensors, + const CinnCompiledObject& compiled_obj) const; + std::unordered_map> graphs_; std::unordered_map cache_by_address_; diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index e0cf860e5bc7b..e4004c2fbf3b5 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -25,7 +25,9 @@ limitations under the License. */ #include "paddle/fluid/framework/trainer.h" #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" +#endif namespace paddle { namespace framework { @@ -56,7 +58,12 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, std::vector dev_ids; for (int i = 0; i < place_num; ++i) { int num = trainer_desc.worker_places(i); +#ifdef PADDLE_WITH_CUDA platform::CUDAPlace place = platform::CUDAPlace(num); +#endif +#ifdef PADDLE_WITH_XPU_KP + platform::XPUPlace place = platform::XPUPlace(num); +#endif places_.push_back(place); dev_ids.push_back(num); } diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index d98deb0f188dc..452c960166cb2 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -20,7 +20,9 @@ limitations under the License. 
*/ #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" +#endif #if defined _WIN32 || defined __APPLE__ #else diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 0463f5788f154..c95159d572733 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -289,6 +289,11 @@ void Scope::DelListener(const std::shared_ptr& listener) { listeners_.remove(listener); } +bool Scope::HasListener(const std::shared_ptr& listener) { + auto it = std::find(listeners_.begin(), listeners_.end(), listener); + return it != listeners_.end(); +} + void Scope::EraseVarsExcept(const std::unordered_set& vars) { SCOPE_VARS_WRITER_LOCK for (auto iter = vars_.begin(); iter != vars_.end();) { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 1669fba1327e5..9231ec90e8f88 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -154,6 +154,8 @@ class Scope : public ScopeBase { void DelListener(const std::shared_ptr& listener); + bool HasListener(const std::shared_ptr& listener); + protected: struct KeyHasher { std::size_t operator()(const std::string& key) const { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index e8cd84248ea85..1159280762f5a 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -51,7 +51,7 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, auto src_place = src.place(); auto src_ptr = src.data(); #ifdef PADDLE_WITH_MKLDNN - dst->set_format(src.format()); + dst->set_mem_desc(src.mem_desc()); // oneDNN tensors due to padding may be of bigger size // than numel()*size(type()) auto dst_ptr = @@ -61,6 +61,7 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, #else auto dst_ptr = dst->mutable_data(dst_place, src.dtype()); #endif + dst->set_layout(src.layout()); if (src_ptr == dst_ptr && src_place == dst_place) { VLOG(3) << "Skip copy the same data async from " << src_place << " to " << dst_place; diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 69cd45222cef4..107bbdf09a021 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -7,8 +7,13 @@ cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator ENDIF() cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry var_helper phi_api) add_subdirectory(jit) +if (WITH_GPU) +cc_library(layout_autotune SRCS layout_autotune.cc DEPS op_info phi_gpu_info) +else() +cc_library(layout_autotune SRCS layout_autotune.cc DEPS op_info) +endif() cc_library(amp SRCS amp_auto_cast.cc DEPS layer var_helper) -cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal garbage_collector var_helper) +cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal garbage_collector var_helper layout_autotune) cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator switch_autotune) cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator switch_autotune) cc_library(imperative_profiler SRCS profiler.cc DEPS flags) diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 756f26dcefff1..eea4c67582613 100644 --- 
a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -211,6 +211,14 @@ class VarBase { framework::proto::VarType::Type DataType() const { return var_->DataType(); } + void SetDataLayout(paddle::experimental::DataLayout data_layout) { + var_->SetDataLayout(data_layout); + } + + paddle::experimental::DataLayout DataLayout() const { + return var_->DataLayout(); + } + size_t ElementSize() const { return framework::SizeOfType(var_->DataType()); } void SetForwardDataType(framework::proto::VarType::Type data_type) { diff --git a/paddle/fluid/imperative/layout_autotune.cc b/paddle/fluid/imperative/layout_autotune.cc new file mode 100644 index 0000000000000..ed0526eaad316 --- /dev/null +++ b/paddle/fluid/imperative/layout_autotune.cc @@ -0,0 +1,178 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/layout_autotune.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/imperative/layout_transformer.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/errors.h" + +namespace paddle { +namespace imperative { + +bool LayoutAutoTune::UseLayoutAutoTune() const { +#if defined(PADDLE_WITH_CUDA) + if (!phi::backends::gpu::TensorCoreAvailable()) { + LOG(INFO) << "Layout AutoTuning is not available."; + return false; + } else { + return use_layout_autotune_; + } +#else + return false; +#endif +} + +LayoutAutoTune::LayoutAutoTune() { + const auto& op_info = paddle::framework::OpInfoMap::Instance().map(); + for (auto it = op_info.begin(); it != op_info.end(); it++) { + // only record forwrd operators + if (it->first.find("_grad") != std::string::npos) { + continue; + } + + // some normalization operators such as instance_norm and layer_norm + // do not have data_format attr, but are layout sensitive. + if (it->first.find("norm") != std::string::npos) { + layout_agnostic_ops_.emplace(it->first); + continue; + } + + auto* attr_checker = it->second.Checker(); + if (attr_checker) { + auto attrs = attr_checker->GetDefaultAttrMap(); + if (attrs.find("data_format") != attrs.end() || + attrs.find("data_layout") != attrs.end()) { + VLOG(4) << "Heavily layout sensitive OP: " << it->first; + heavily_layout_sensitive_ops_.emplace(it->first); + continue; + } + + // Attribute name is fuzzy matched, such as start and start_axis. 
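This classification is driven entirely by attribute names: an explicit data_format/data_layout attribute marks an op as heavily layout sensitive, attributes whose names mention axes, dims or ranges mark it as lightly sensitive, and anything else is treated as layout agnostic. A standalone sketch of that heuristic (simplified; the real constructor also special-cases *_grad and *norm* ops, as shown above and below):

```cpp
#include <string>
#include <vector>

enum class LayoutSensitivity { kAgnostic, kLightly, kHeavily };

// Illustrative heuristic mirroring the constructor above: classify an operator
// from its attribute names only.
LayoutSensitivity ClassifyByAttrNames(const std::vector<std::string>& attr_names) {
  auto contains = [](const std::string& s, const char* sub) {
    return s.find(sub) != std::string::npos;
  };
  for (const auto& name : attr_names) {
    if (name == "data_format" || name == "data_layout") {
      return LayoutSensitivity::kHeavily;  // explicit layout attribute
    }
  }
  for (const auto& name : attr_names) {
    // Fuzzy match: "axis", "axes", "dim", "start", "end" all indicate that the
    // op's semantics depend on dimension order.
    if (contains(name, "axis") || contains(name, "axes") ||
        contains(name, "dim") || contains(name, "start") ||
        contains(name, "end")) {
      return LayoutSensitivity::kLightly;
    }
  }
  return LayoutSensitivity::kAgnostic;
}

// e.g. ClassifyByAttrNames({"data_format", "strides"})   -> kHeavily
//      ClassifyByAttrNames({"start_axis", "stop_axis"})  -> kLightly
```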
+ bool layout_agnostic = true; + for (auto& attr : attrs) { + auto attr_name = attr.first; + VLOG(6) << "OP: " << it->first << " Attr Name: " << attr_name; + if (attr_name.find("axis") != std::string::npos || + attr_name.find("axes") != std::string::npos || + attr_name.find("dim") != std::string::npos || + attr_name.find("start") != std::string::npos || + attr_name.find("end") != std::string::npos) { + VLOG(4) << "Lightly layout sensitive OP: " << it->first; + layout_agnostic = false; + lightly_layout_sensitive_ops_.emplace(it->first); + break; + } + } + + if (layout_agnostic) { + VLOG(4) << "Layout agnostic_ops: " << it->first; + layout_agnostic_ops_.emplace(it->first); + } + } + } + + VLOG(3) << "The number of layout agnostic OPs: " + << layout_agnostic_ops_.size() << ", heavily layout sensitive OPs: " + << heavily_layout_sensitive_ops_.size() + << ", lightly layout sensitive OPs: " + << lightly_layout_sensitive_ops_.size(); +} + +template +paddle::imperative::NameVarMap AutoTuneLayout( + const std::string& op_type, + const paddle::imperative::NameVarMap& ins, + const paddle::imperative::NameVarMap& outs, + paddle::framework::AttributeMap* attrs, + const std::shared_ptr& tracer) { + if (!LayoutAutoTune::Instance().UseLayoutAutoTune()) { + return ins; + } + + // When layout autotuning is enabled, the tuner will check the desired layout. + // (1) If the desired layout is undefined, and there is no convolutional + // layers, layout optimization is unnecessary. Otherwise, the desired layout + // will be set to the best layout only when these is a convolutional layer + // with + // NCHW-Layout and the TensorCore is available. + // (2) If the desired layout is defined, run the transposer. + + if (LayoutAutoTune::Instance().GetDesiredLayout() == DataLayout::UNDEFINED) { + // Layout autotune only supports model with convolutional layers + if (op_type != "conv2d") { + return ins; + } else { + if (BOOST_GET_CONST(std::string, (*attrs)["data_format"]) == "NCHW") { + LayoutAutoTune::Instance().SetDesiredLayout(DataLayout::NHWC); + VLOG(3) << "Tune the layout from " + << BOOST_GET_CONST(std::string, (*attrs)["data_format"]) + << " to " << paddle::framework::DataLayoutToString( + LayoutAutoTune::Instance().GetDesiredLayout()); + } else { + LayoutAutoTune::Instance().DisableLayoutAutoTune(); + return ins; + } + } + } + + std::shared_ptr> transposer = nullptr; + if (op_type == "conv2d") { + transposer = + std::make_shared>(op_type); + transposer->SetArguments({"Input"}, {"Output"}, {"data_format"}); + } else if (op_type == "batch_norm") { + transposer = + std::make_shared>(op_type); + transposer->SetArguments({"X"}, {"Y"}, {"data_layout"}); + } else if (op_type == "pool2d") { + transposer = + std::make_shared>(op_type); + transposer->SetArguments({"X"}, {"Out"}, {"data_format"}); + } else if (op_type == "transpose2") { + transposer = std::make_shared>(op_type); + } else if (op_type == "flatten_contiguous_range") { + transposer = std::make_shared>(op_type); + } else if (op_type.find("elementwise_") != std::string::npos) { + transposer = std::make_shared>(op_type); + } else if (LayoutAutoTune::Instance().IsLayoutAgnostic(op_type)) { + transposer = std::make_shared>(op_type); + } else if (LayoutAutoTune::Instance().IsLightlyLayoutSensitive(op_type)) { + transposer = + std::make_shared>(op_type); + } else { + PADDLE_ENFORCE_NOT_NULL( + transposer, phi::errors::Unimplemented( + "%s 's LayoutTransformer is unimplemented.", op_type)); + } + + return transposer->Apply(ins, outs, attrs, tracer); +} +template 
paddle::imperative::NameVarMap AutoTuneLayout( + const std::string& op_type, + const paddle::imperative::NameVarMap& ins, + const paddle::imperative::NameVarMap& outs, + paddle::framework::AttributeMap* attrs, + const std::shared_ptr& tracer); +template paddle::imperative::NameVarMap +AutoTuneLayout( + const std::string& op_type, + const paddle::imperative::NameVarMap& ins, + const paddle::imperative::NameVarMap& outs, + paddle::framework::AttributeMap* attrs, + const std::shared_ptr& tracer); + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/layout_autotune.h b/paddle/fluid/imperative/layout_autotune.h new file mode 100644 index 0000000000000..679612fdf1ae3 --- /dev/null +++ b/paddle/fluid/imperative/layout_autotune.h @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/compat/type_defs.h" + +namespace paddle { +namespace imperative { + +class Tracer; + +using DataLayout = paddle::experimental::DataLayout; + +class LayoutAutoTune { + public: + static LayoutAutoTune& Instance() { + static LayoutAutoTune layout_autoTune; + return layout_autoTune; + } + + bool UseLayoutAutoTune() const; + + void EnableLayoutAutoTune() { use_layout_autotune_ = true; } + + void DisableLayoutAutoTune() { use_layout_autotune_ = false; } + + bool IsLightlyLayoutSensitive(const std::string& op_type) const { + return lightly_layout_sensitive_ops_.count(op_type) != 0; + } + + bool IsLayoutAgnostic(const std::string& op_type) const { + return layout_agnostic_ops_.count(op_type) != 0; + } + + DataLayout GetDesiredLayout() const { return layout_; } + + void SetDesiredLayout(const DataLayout& layout) { layout_ = layout; } + + private: + LayoutAutoTune(); + + bool use_layout_autotune_{false}; + + std::unordered_set layout_agnostic_ops_{}; + + std::unordered_set heavily_layout_sensitive_ops_{}; + + std::unordered_set lightly_layout_sensitive_ops_{}; + + DataLayout layout_{DataLayout::UNDEFINED}; +}; + +template +paddle::imperative::NameVarMap AutoTuneLayout( + const std::string& op_type, + const paddle::imperative::NameVarMap& ins, + const paddle::imperative::NameVarMap& outs, + paddle::framework::AttributeMap* attrs, + const std::shared_ptr& tracer); + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/layout_transformer.h b/paddle/fluid/imperative/layout_transformer.h new file mode 100644 index 0000000000000..73e27d4b79b7c --- /dev/null +++ b/paddle/fluid/imperative/layout_transformer.h @@ -0,0 +1,332 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/imperative/layout_autotune.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/imperative/var_helper.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/errors.h" + +namespace paddle { +namespace imperative { + +template +std::shared_ptr TraceTransposeOp( + const std::shared_ptr& var, const DataLayout layout, + const std::shared_ptr& tracer) { + std::vector axis; + if (layout == DataLayout::NHWC) { + axis = {0, 2, 3, 1}; + } else if (layout == DataLayout::NCHW) { + axis = {0, 3, 1, 2}; + } else { + axis = {0, 1, 2, 3}; + } + paddle::imperative::NameVarMap ins = {{"X", {var}}}; + auto out = + std::shared_ptr(new VarType(tracer->GenerateUniqueName())); + auto x_shape = + std::shared_ptr(new VarType(tracer->GenerateUniqueName())); + paddle::imperative::NameVarMap outs = {{"Out", {out}}, + {"XShape", {x_shape}}}; + paddle::framework::AttributeMap attrs = {{"axis", axis}}; + tracer->TraceOp("transpose2", ins, outs, std::move(attrs)); + paddle::imperative::SetDataLayout(out, layout); + VLOG(4) << "Transpose " << paddle::imperative::GetNameFromVar(var) << "[" + << paddle::framework::DataLayoutToString( + paddle::imperative::GetDataLayout(var)) + << "]" + << " to " << paddle::imperative::GetNameFromVar(out) << "[" + << paddle::framework::DataLayoutToString( + paddle::imperative::GetDataLayout(out)) + << "]"; + return out; +} + +template +class LayoutTransformer { + public: + explicit LayoutTransformer(const std::string& type) : type_(type) {} + + virtual ~LayoutTransformer() {} + + LayoutTransformer(const LayoutTransformer&) = delete; + LayoutTransformer& operator=(const LayoutTransformer&) = delete; + + virtual paddle::imperative::NameVarMap Apply( + const paddle::imperative::NameVarMap& ins, + const paddle::imperative::NameVarMap& outs, + paddle::framework::AttributeMap* attrs, + const std::shared_ptr& tracer) { + VLOG(3) << "Optimze Layout agnostic op: " << type_; + auto in_layout = DataLayout::UNDEFINED; + for (auto& pair : ins) { + for (auto& var : pair.second) { + // Once the any input is desired layout, we set in_layout is desired + // layout. + if (paddle::imperative::GetDataLayout(var) == + LayoutAutoTune::Instance().GetDesiredLayout()) { + in_layout = LayoutAutoTune::Instance().GetDesiredLayout(); + break; + } + } + } + SetVarsLayout(outs, in_layout); + return ins; + } + + // Set inputs, outputs and attributes to be optimized for the transposer. + // Those may respectively be a subset of the corresponding original argument + // of the operator. + void SetArguments(const std::vector& ins, + const std::vector& outs, + const std::vector& attrs) { + ins_ = ins; + outs_ = outs; + attrs_ = attrs; + } + + // Set the variables's layout to the specified layout. + // If outs_ is not specified, it means all outputs of the operator + // will be considered. Otherwise, it only set layout for the specified output. 
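+  // Only the layout metadata of the chosen variables is updated here; the
+  // underlying tensor data is not transposed.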
+ void SetVarsLayout(const paddle::imperative::NameVarMap& outs, + DataLayout layout) const { + if (outs_.empty()) { + for (auto& pair : outs) { + for (auto& var : pair.second) { + paddle::imperative::SetDataLayout(var, layout); + } + } + } else { + for (auto& name : outs_) { + auto out_vars = outs.at(name); + for (auto& var : out_vars) { + paddle::imperative::SetDataLayout(var, layout); + } + } + } + } + + const std::vector& Inputs() const { return ins_; } + const std::vector& Outputs() const { return outs_; } + const std::vector& Attributes() const { return attrs_; } + + const std::string& Type() { return type_; } + + protected: + std::string type_{}; + std::vector ins_{}; + std::vector outs_{}; + std::vector attrs_{}; +}; + +template +class ElementwiseOpTransformer : public LayoutTransformer { + public: + explicit ElementwiseOpTransformer(const std::string& type) + : LayoutTransformer(type) {} + + paddle::imperative::NameVarMap Apply( + const paddle::imperative::NameVarMap& ins, + const paddle::imperative::NameVarMap& outs, + paddle::framework::AttributeMap* attrs, + const std::shared_ptr& tracer) { + // [Why we need the this?] + // The Elementwise Ops has a axis attr, it is to support broadcast. + // When bias_attr of Conv is not false, the elementwise_add will be + // appended, and the axis will be set to the channel dimension. + + // If the axis is set to the channel dimension, the attr transformation + // is necessary. Otherwise, it will fall back to the + // LayoutTransformer::Apply. + auto desired_layout = LayoutAutoTune::Instance().GetDesiredLayout(); + if (attrs->find("axis") != attrs->end() && + BOOST_GET_CONST(int, (*attrs)["axis"]) != -1) { + VLOG(3) << "Optimze layout agnostic op " << this->Type(); + if (desired_layout == DataLayout::NHWC) { + (*attrs)["axis"] = 3; + } else if (desired_layout == DataLayout::NCHW) { + (*attrs)["axis"] = 1; + } else { + PADDLE_ENFORCE_EQ( + desired_layout, DataLayout::UNDEFINED, + phi::errors::PreconditionNotMet("DataLayout is unsupport.")); + } + this->SetVarsLayout(outs, desired_layout); + return ins; + } else { + return LayoutTransformer::Apply(ins, outs, attrs, tracer); + } + } +}; + +/* + * Both functionality and performance are affected by data layout. + * Such as operators with data_format attribute. 
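+ * Examples handled in this pass: conv2d (data_format), batch_norm
+ * (data_layout) and pool2d (data_format).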
+ */ +template +class HeavilyLayoutSensitiveOpTransformer : public LayoutTransformer { + public: + explicit HeavilyLayoutSensitiveOpTransformer(const std::string& type) + : LayoutTransformer(type) {} + + paddle::imperative::NameVarMap Apply( + const paddle::imperative::NameVarMap& ins, + const paddle::imperative::NameVarMap& outs, + paddle::framework::AttributeMap* attrs, + const std::shared_ptr& tracer) { + VLOG(3) << "Optimze heavily layout sensitive op " << this->Type(); + paddle::imperative::NameVarMap new_ins(ins); + + // Step 1: Adjust the data_layout attr to the desired layout + auto desired_layout = LayoutAutoTune::Instance().GetDesiredLayout(); + std::string desired_layout_str = paddle::framework::DataLayoutToString( + LayoutAutoTune::Instance().GetDesiredLayout()); + if (attrs->find("data_format") != attrs->end() && + BOOST_GET_CONST(std::string, (*attrs)["data_format"]) != + desired_layout_str) { + VLOG(4) << "Origin layout attr: " + << BOOST_GET_CONST(std::string, (*attrs)["data_format"]) + << ", Desired layout attr: " << desired_layout_str; + (*attrs)["data_format"] = desired_layout_str; + } else if (attrs->find("data_layout") != attrs->end() && + BOOST_GET_CONST(std::string, (*attrs)["data_layout"]) != + desired_layout_str) { + VLOG(4) << "Origin layout attr: " + << BOOST_GET_CONST(std::string, (*attrs)["data_layout"]) + << ", Desired layout attr: " << desired_layout_str; + (*attrs)["data_layout"] = desired_layout_str; + } + + // Step 2: Transpose the specified input for Op and set the transposed var's + // layout. + for (auto& name : this->Inputs()) { + auto& in_vars = new_ins[name]; + for (auto& var : in_vars) { + auto var_layout = paddle::imperative::GetDataLayout(var); + if (var_layout != desired_layout) { + var = TraceTransposeOp(var, DataLayout::NHWC, tracer); + } + } + } + + // Step 3: Set the Op's layout sensitive outs var. + this->SetVarsLayout(outs, desired_layout); + + return new_ins; + } +}; + +/* + * The functionality may be affected layout transformation before them. + * Such as operators with axis attribute. + */ +template +class LightlyLayoutSensitiveOpTransformer : public LayoutTransformer { + public: + explicit LightlyLayoutSensitiveOpTransformer(const std::string& type) + : LayoutTransformer(type) {} + + paddle::imperative::NameVarMap Apply( + const paddle::imperative::NameVarMap& ins, + const paddle::imperative::NameVarMap& outs, + paddle::framework::AttributeMap* attrs, + const std::shared_ptr& tracer) { + VLOG(3) << "Optimze lightly layout sensitive op " << this->Type(); + paddle::imperative::NameVarMap new_ins(ins); + // If input's layout is not tuned, transformation is unnecessary. + // If input's layout is already tuned, it will be transformed back to NCHW. + // TODO(zhangting): The op of this type should be adapted to the previous + // operator output data layout. Currently only a few operators are + // supported, and transposers need to be carefully designed to ensure that + // they do not cause exceptions. + for (auto& pair : new_ins) { + for (auto& var : pair.second) { + auto var_layout = paddle::imperative::GetDataLayout(var); + if (var_layout == LayoutAutoTune::Instance().GetDesiredLayout()) { + // Set layout to UNDEFINED so that TransposeOpTransformer do + // NHWC->NCHW transformation. 
+ var = TraceTransposeOp(var, DataLayout::UNDEFINED, tracer); + } + } + } + return new_ins; + } +}; + +template +class TransposeOpTransformer + : public LightlyLayoutSensitiveOpTransformer { + public: + explicit TransposeOpTransformer(const std::string& type) + : LightlyLayoutSensitiveOpTransformer(type) {} + + paddle::imperative::NameVarMap Apply( + const paddle::imperative::NameVarMap& ins, + const paddle::imperative::NameVarMap& outs, + paddle::framework::AttributeMap* attrs, + const std::shared_ptr& tracer) { + VLOG(3) << "Optimze lightly layout sensitive op " << this->Type(); + // When the input layout is the desired format, it means that there + // is a transpose layer in the network, it is better to transpose + // the result to the original format. + // Instead of actually inserting a transpose Op, we fuse the inserted + // transpose Op with the current transpose Op by transforming 'axis' attr. + auto& in_var = ins.at("X")[0]; + auto var_layout = paddle::imperative::GetDataLayout(in_var); + if (var_layout == LayoutAutoTune::Instance().GetDesiredLayout()) { + auto axis = BOOST_GET_CONST(std::vector, (*attrs)["axis"]); + // NHWC->NCHW, permutaion will be set as follows. + std::vector perm = {0, 3, 1, 2}; + // fuse the transpose Ops by transforming axis. + std::vector fusion_axis = {perm[axis[0]], perm[axis[1]], + perm[axis[2]], perm[axis[3]]}; + (*attrs)["axis"] = fusion_axis; + } + return ins; + } +}; + +template +class FlattenOpTransformer + : public LightlyLayoutSensitiveOpTransformer { + public: + explicit FlattenOpTransformer(const std::string& type) + : LightlyLayoutSensitiveOpTransformer(type) {} + + paddle::imperative::NameVarMap Apply( + const paddle::imperative::NameVarMap& ins, + const paddle::imperative::NameVarMap& outs, + paddle::framework::AttributeMap* attrs, + const std::shared_ptr& tracer) { + VLOG(3) << "Optimze lightly layout sensitive op " << this->Type(); + // Flatten the C, H, W dimensions will not affect functionality. + // So transformation is unnecessary. But in other cases, it needs to + // fall back to the LightlyLayoutSensitiveOpTransformer. 
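+    // e.g. with start_axis = 1 and stop_axis = 3 all of the C, H and W
+    // dimensions are merged into a single one, which is the fast path
+    // checked below; any other axis range falls back to the base
+    // LightlyLayoutSensitiveOpTransformer.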
+ auto start_axis = BOOST_GET_CONST(int, (*attrs)["start_axis"]); + auto stop_axis = BOOST_GET_CONST(int, (*attrs)["stop_axis"]); + if (paddle::imperative::GetDataLayout(ins.at("X")[0]) == + LayoutAutoTune::Instance().GetDesiredLayout() && + start_axis == 1 && stop_axis == 3) { + return ins; + } else { + return LightlyLayoutSensitiveOpTransformer::Apply(ins, outs, + attrs, tracer); + } + } +}; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index 1c3a04b51abd0..7bfb3094ba286 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -38,6 +38,7 @@ PD_DECLARE_KERNEL(matmul_with_flatten, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_with_flatten_grad, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_with_flatten, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_with_flatten_grad, GPU, ALL_LAYOUT); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 6b20b9b393869..3e2e082fbaa27 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/execution_context.h" +#include "paddle/fluid/imperative/layout_autotune.h" #include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/platform/denormal.h" #include "paddle/fluid/platform/device/device_wrapper.h" @@ -222,16 +223,22 @@ void Tracer::TraceOpImpl(const std::string& type, NameVarMap new_ins = ins; if (amp_level_ == AmpLevel::O1) { if (amp_dtype_ == phi::DataType::FLOAT16) { + const auto& tracer = imperative::GetCurrentTracer(); + new_ins = + imperative::AutoTuneLayout(type, ins, outs, &attrs, tracer); VLOG(5) << "Float16 Auto Mixed Precision O1 run operator: " << type; - new_ins = AutoCastInputs(type, ins); + new_ins = AutoCastInputs(type, new_ins); } else if (amp_dtype_ == phi::DataType::BFLOAT16) { VLOG(5) << "BFloat16 Auto Mixed Precision O1 run operator: " << type; new_ins = AutoCastBF16Inputs(type, ins); } } else if (amp_level_ == AmpLevel::O2) { if (amp_dtype_ == phi::DataType::FLOAT16) { + const auto& tracer = imperative::GetCurrentTracer(); + new_ins = + imperative::AutoTuneLayout(type, ins, outs, &attrs, tracer); VLOG(5) << "Float16 Auto Mixed Precision O2 run operator: " << type; - new_ins = CastPureFp16Inputs(type, ins); + new_ins = CastPureFp16Inputs(type, new_ins); } else if (amp_dtype_ == phi::DataType::BFLOAT16) { VLOG(5) << "BFloat16 Auto Mixed Precision O2 run operator: " << type; new_ins = CastPureBf16Inputs(type, ins); diff --git a/paddle/fluid/imperative/var_helper.cc b/paddle/fluid/imperative/var_helper.cc index 5f426c72b576d..f84606ba9a4bf 100644 --- a/paddle/fluid/imperative/var_helper.cc +++ b/paddle/fluid/imperative/var_helper.cc @@ -190,6 +190,59 @@ template framework::proto::VarType::Type GetDataType( template framework::proto::VarType::Type GetDataType( std::shared_ptr var); +/* GetDataLayout */ +template +paddle::experimental::DataLayout GetDataLayout(std::shared_ptr var) { + return var->DataLayout(); +} +template <> +paddle::experimental::DataLayout GetDataLayout( + std::shared_ptr var) { + if (var->Var().IsType()) { + return var->Var().Get().layout(); + } else { + 
PADDLE_THROW(paddle::platform::errors::PermissionDenied( + "Only support framework::LoDTensor, but got %s here, please checkout " + "var type of " + "tensor: %s", + paddle::framework::ToTypeName(framework::ToVarType(var->Var().Type())), + var->name())); + } +} +template paddle::experimental::DataLayout GetDataLayout( + std::shared_ptr var); +template paddle::experimental::DataLayout GetDataLayout( + std::shared_ptr var); + +/* SetDataLayout */ +template +void SetDataLayout(std::shared_ptr var, + const paddle::experimental::DataLayout layout) { + var->SetDataLayout(layout); +} +template <> +void SetDataLayout( + std::shared_ptr var, + const paddle::experimental::DataLayout layout) { + if (var->Var().IsType()) { + var->MutableVar()->GetMutable()->set_layout( + layout); + } else { + PADDLE_THROW(paddle::platform::errors::PermissionDenied( + "Only support framework::LoDTensor, but got %s here, please checkout " + "var type of " + "tensor: %s", + paddle::framework::ToTypeName(framework::ToVarType(var->Var().Type())), + var->name())); + } +} +template void SetDataLayout( + std::shared_ptr var, + const paddle::experimental::DataLayout layout); +template void SetDataLayout( + std::shared_ptr var, + const paddle::experimental::DataLayout layout); + /* CheckCachedKey */ template bool CheckCachedKey(std::shared_ptr var, diff --git a/paddle/fluid/imperative/var_helper.h b/paddle/fluid/imperative/var_helper.h index 7c955799f9fa8..9ce456b1103b3 100644 --- a/paddle/fluid/imperative/var_helper.h +++ b/paddle/fluid/imperative/var_helper.h @@ -63,6 +63,13 @@ framework::proto::VarType::Type GetType(std::shared_ptr var); template framework::proto::VarType::Type GetDataType(std::shared_ptr var); +template +paddle::experimental::DataLayout GetDataLayout(std::shared_ptr var); + +template +void SetDataLayout(std::shared_ptr var, + const paddle::experimental::DataLayout layout); + template const std::shared_ptr& GetVariableWrapper( const std::shared_ptr& var); diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index 1a68bf9af5ece..fae14b41dff54 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -25,6 +25,7 @@ #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/op_base.h" +#include "paddle/phi/common/layout.h" namespace paddle { namespace imperative { @@ -186,6 +187,12 @@ class VariableWrapper { return fwd_data_type_; } + paddle::experimental::DataLayout DataLayout() { return layout_; } + + void SetDataLayout(const paddle::experimental::DataLayout layout) { + layout_ = layout; + } + const platform::Place Place() const { const framework::Tensor* tensor = nullptr; auto place = @@ -357,6 +364,10 @@ class VariableWrapper { // training // NOTE: Now no need to support remove void hook std::vector>> void_hooks_; + + // DataLayout for layoutAutotune + paddle::experimental::DataLayout layout_{ + paddle::experimental::DataLayout::UNDEFINED}; }; } // namespace imperative diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 7badcb395ea70..015f4471a0246 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1077,6 +1077,12 @@ std::unique_ptr CreatePaddlePredictor< process_level_allocator_enabled = true; } + // TODO(Jingzhuangzhuang): Fix trt error when allocator_strategy is + // auto_growth + if 
(config.tensorrt_engine_enabled()) { + gflags.push_back("--allocator_strategy=naive_best_fit"); + } + if (framework::InitGflags(gflags)) { VLOG(3) << "The following gpu analysis configurations only take effect " "for the first predictor: "; @@ -1925,11 +1931,29 @@ bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p, #endif return false; } + void InternalUtils::UpdateConfigInterleaved(paddle_infer::Config *c, bool with_interleaved) { #ifdef PADDLE_WITH_CUDA c->trt_with_interleaved_ = with_interleaved; #endif } + +void InternalUtils::SyncStream(paddle_infer::Predictor *p) { +#ifdef PADDLE_WITH_CUDA + auto *pred = dynamic_cast(p->predictor_.get()); + paddle::platform::DeviceContextPool &pool = + paddle::platform::DeviceContextPool::Instance(); + auto *dev_ctx = reinterpret_cast( + pool.Get(pred->place_)); + cudaStreamSynchronize(dev_ctx->stream()); +#endif +} +void InternalUtils::SyncStream(cudaStream_t stream) { +#ifdef PADDLE_WITH_CUDA + cudaStreamSynchronize(stream); +#endif +} + } // namespace experimental } // namespace paddle_infer diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index d9992f3fbef9d..e96526730fdea 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -38,6 +38,9 @@ namespace paddle_infer { using float16 = paddle::platform::float16; +namespace experimental { +class InternalUtils; +}; } /// /// \file analysis_predictor.h @@ -492,6 +495,7 @@ class AnalysisPredictor : public PaddlePredictor { std::shared_ptr fleet_exe_; std::shared_ptr task_node_; #endif + friend class paddle_infer::experimental::InternalUtils; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/demo_ci/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h index 5ac00fd294f43..b4f40194aa947 100644 --- a/paddle/fluid/inference/api/demo_ci/utils.h +++ b/paddle/fluid/inference/api/demo_ci/utils.h @@ -71,7 +71,8 @@ Record ProcessALine(const std::string& line) { return record; } -void CheckOutput(const std::string& referfile, const PaddleTensor& output) { +void CheckOutput(const std::string& referfile, const PaddleTensor& output, + float threshold = 1e-5) { std::string line; std::ifstream file(referfile); std::getline(file, line); @@ -93,7 +94,7 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) { for (size_t i = 0; i < numel; ++i) { CHECK_LT( fabs(static_cast(output.data.data())[i] - refer.data[i]), - 1e-5); + threshold); } break; } diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index 0b3257da92cd3..818444fbcb648 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -71,7 +71,7 @@ void Main(bool use_gpu) { auto& tensor = output.front(); // compare with reference result - CheckOutput(FLAGS_refer, tensor); + CheckOutput(FLAGS_refer, tensor, 1e-4); // the analysis_output has some diff with native_output, // TODO(luotao): add CheckOutput for analysis_output later. 
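A minimal, illustrative sketch of how the new experimental SyncStream helper is expected to pair with RunWithExternalStream (the predictor object and its construction are assumed and not part of this patch):

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // enqueue inference on the user-owned stream, then block until it completes
  paddle_infer::experimental::InternalUtils::RunWithExternalStream(predictor.get(), stream);
  paddle_infer::experimental::InternalUtils::SyncStream(stream);
  cudaStreamDestroy(stream);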
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 0f26a1076a68c..7461724afb4dd 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -714,4 +714,137 @@ template void Tensor::ORTCopyToCpu(int8_t *data) const; template void Tensor::ORTCopyToCpu(float16 *data) const; #endif +namespace experimental { +template +void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t, + const T *data, + cudaStream_t stream) { + if (t->tensor_ == nullptr) { + PADDLE_ENFORCE_EQ( + t->name_.empty(), false, + paddle::platform::errors::PreconditionNotMet( + "Need to SetName first, so that the corresponding tensor can " + "be retrieved.")); + auto *scope = static_cast(t->scope_); + auto *var = scope->FindVar(t->name_); + PADDLE_ENFORCE_NOT_NULL( + var, paddle::platform::errors::PreconditionNotMet( + "No tensor called [%s] in the runtime scope", t->name_)); + auto *tensor = var->GetMutable(); + t->tensor_ = tensor; + } + + auto *tensor = static_cast(t->tensor_); + PADDLE_ENFORCE_GE(tensor->numel(), 0, + paddle::platform::errors::PreconditionNotMet( + "You should call Tensor::Reshape(const " + "std::vector &shape)" + "function before copying data from cpu.")); + size_t ele_size = tensor->numel() * sizeof(T); + if (t->place_ == PlaceType::kCPU) { + auto *t_data = tensor->mutable_data(paddle::platform::CPUPlace()); + std::memcpy(static_cast(t_data), data, ele_size); + } else if (t->place_ == PlaceType::kGPU) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + paddle::platform::CUDAPlace gpu_place(t->device_); + auto *t_data = tensor->mutable_data(gpu_place); + paddle::memory::Copy(gpu_place, static_cast(t_data), + paddle::platform::CPUPlace(), data, ele_size, stream); +#else + PADDLE_THROW(paddle::platform::errors::Unavailable( + "Can not create tensor with CUDA place because paddle is not compiled " + "with CUDA.")); +#endif + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "CopyFromCpuWithIoStream only supports CPU and GPU now.")); + } +} + +template +void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, T *data, + cudaStream_t stream) { + if (t->tensor_ == nullptr) { + PADDLE_ENFORCE_EQ( + t->name_.empty(), false, + paddle::platform::errors::PreconditionNotMet( + "Need to SetName first, so that the corresponding tensor can " + "be retrieved.")); + auto *scope = static_cast(t->scope_); + auto *var = scope->FindVar(t->name_); + PADDLE_ENFORCE_NOT_NULL( + var, paddle::platform::errors::PreconditionNotMet( + "No tensor called [%s] in the runtime scope", t->name_)); + auto *tensor = var->GetMutable(); + t->tensor_ = tensor; + } + + auto *tensor = static_cast(t->tensor_); + auto ele_num = tensor->numel(); + auto *t_data = tensor->data(); + auto t_place = tensor->place(); + + paddle::framework::Tensor out; + auto mem_allocation = + std::make_shared( + static_cast(data), ele_num * sizeof(T), + paddle::platform::CPUPlace()); + out.ResetHolder(mem_allocation); + + if (paddle::platform::is_cpu_place(t_place)) { +#ifdef PADDLE_WITH_MKLDNN + if (tensor->layout() == paddle::framework::DataLayout::kMKLDNN) + paddle::framework::innerTransDataLayoutFromMKLDNN( + tensor->layout(), paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout(), + *tensor, &out, paddle::platform::CPUPlace(), true); + else + std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); +#else + std::memcpy(static_cast(data), 
t_data, ele_num * sizeof(T)); +#endif + } else if (t->place_ == PlaceType::kGPU) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + paddle::memory::Copy(paddle::platform::CPUPlace(), + static_cast(data), t_place, t_data, + ele_num * sizeof(T), stream); +#else + PADDLE_THROW(paddle::platform::errors::Unavailable( + "Can not create tensor with CUDA place because paddle is not compiled " + "with CUDA.")); +#endif + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "CopyToCpuWithIoStream only supports CPU and GPU now.")); + } +} + +template void InternalUtils::CopyFromCpuWithIoStream( + paddle_infer::Tensor *t, const float *data, cudaStream_t stream); +template void InternalUtils::CopyFromCpuWithIoStream( + paddle_infer::Tensor *t, const int64_t *data, cudaStream_t stream); +template void InternalUtils::CopyFromCpuWithIoStream( + paddle_infer::Tensor *t, const int32_t *data, cudaStream_t stream); +template void InternalUtils::CopyFromCpuWithIoStream( + paddle_infer::Tensor *t, const uint8_t *data, cudaStream_t stream); +template void InternalUtils::CopyFromCpuWithIoStream( + paddle_infer::Tensor *t, const int8_t *data, cudaStream_t stream); +template void InternalUtils::CopyFromCpuWithIoStream( + paddle_infer::Tensor *t, const float16 *data, cudaStream_t stream); + +template void InternalUtils::CopyToCpuWithIoStream( + paddle_infer::Tensor *t, float *data, cudaStream_t stream); +template void InternalUtils::CopyToCpuWithIoStream( + paddle_infer::Tensor *t, int64_t *data, cudaStream_t stream); +template void InternalUtils::CopyToCpuWithIoStream( + paddle_infer::Tensor *t, int32_t *data, cudaStream_t stream); +template void InternalUtils::CopyToCpuWithIoStream( + paddle_infer::Tensor *t, uint8_t *data, cudaStream_t stream); +template void InternalUtils::CopyToCpuWithIoStream( + paddle_infer::Tensor *t, int8_t *data, cudaStream_t stream); +template void InternalUtils::CopyToCpuWithIoStream( + paddle_infer::Tensor *t, float16 *data, cudaStream_t stream); + +} // namespace experimental + } // namespace paddle_infer diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index d25f51e4fd41e..9c48d822b4d0d 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -888,7 +888,11 @@ struct PD_INFER_DECL AnalysisConfig { bool thread_local_stream_{false}; bool use_gpu_fp16_{false}; std::unordered_set gpu_fp16_disabled_op_types_{ - "conv2d_fusion", "conv2d", "roll", "strided_slice"}; + "conv2d_fusion", "conv2d", "roll", "strided_slice", "depthwise_conv2d", + "unfold", "generate_proposals_v2", "nearest_interp_v2", + "bilinear_interp_v2" + "yolo_box", + "multiclass_nms3", "matrix_nms"}; bool use_cudnn_{false}; diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 0f8f9e0a975ba..dc9f7debe5f2f 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -420,8 +420,10 @@ using hipStream_t = struct ihipStream_t*; namespace paddle_infer { class Predictor; +class Tensor; using Config = paddle::AnalysisConfig; namespace experimental { +// Unstable interface, may be modified or deleted in the future. class PD_INFER_DECL InternalUtils { public: // Note: Can only be used under thread_local semantics. 
@@ -429,8 +431,18 @@ class PD_INFER_DECL InternalUtils { cudaStream_t stream); static bool RunWithExternalStream(paddle_infer::Predictor* pred, hipStream_t stream); + static void UpdateConfigInterleaved(paddle_infer::Config* c, bool with_interleaved); + + static void SyncStream(paddle_infer::Predictor* pred); + static void SyncStream(cudaStream_t stream); + template + static void CopyFromCpuWithIoStream(paddle_infer::Tensor* t, const T* data, + cudaStream_t stream); + template + static void CopyToCpuWithIoStream(paddle_infer::Tensor* t, T* data, + cudaStream_t stream); }; } // namespace experimental } // namespace paddle_infer diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 2afe2d32e2f60..6f99ed6e25a28 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -39,6 +39,10 @@ namespace contrib { class TensorUtils; } +namespace experimental { +class InternalUtils; +}; + /// \brief Paddle data type. enum DataType { FLOAT32, @@ -198,6 +202,7 @@ class PD_INFER_DECL Tensor { #endif friend class paddle_infer::contrib::TensorUtils; + friend class paddle_infer::experimental::InternalUtils; #if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST) friend class paddle_infer::InferApiTesterUtils; #endif diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 7824a0f1e29f4..29acf549cbbc3 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -256,19 +256,51 @@ class Pool2dOpConverter : public OpConverter { if (!adaptive) { if (ceil_mode) { - std::vector input_shape_v; - for (int i = 0; i < input_dims; i++) { - input_shape_v.push_back(input_shape.d[i]); + if (nv_ksize.d[0] % nv_strides.d[0] == 0 && + nv_ksize.d[1] % nv_strides.d[1] == 0) { + nvinfer1::DimsHW pre_pad(0, 0); + nvinfer1::DimsHW post_pad(0, 0); + // If ceil mode is true, we will pad the appropriate size to the + // input. + DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, + &post_pad, input_dims); + auto *pad_layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, + pre_pad, post_pad); + + PADDLE_ENFORCE_NOT_NULL( + pad_layer, platform::errors::Fatal( + "Pad layer in poolOp converter could not be " + "created. 
The pointer to pad layer is `NULL`.")); + input1 = pad_layer->getOutput(0); + + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, + nv_pool_type, nv_ksize); + PADDLE_ENFORCE_NOT_NULL( + pool_layer, + platform::errors::Fatal( + "trt pool layer in converter could not be created.")); + pool_layer->setStride(nv_strides); + pool_layer->setPadding(nv_paddings); + if (padding_algorithm == "SAME") { + pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } + pool_layer->setAverageCountExcludesPadding(exclusive); + layer = pool_layer; + } else { + std::vector input_shape_v; + for (int i = 0; i < input_dims; i++) { + input_shape_v.push_back(input_shape.d[i]); + } + plugin::PoolPlugin *plugin = new plugin::PoolPlugin( + ceil_mode, plugin_pool_type, adaptive, exclusive, ksize, strides, + paddings, input_shape_v, real_paddings); + auto *pool_layer = engine_->AddPlugin(&input1, 1, plugin); + PADDLE_ENFORCE_NOT_NULL( + pool_layer, + platform::errors::Fatal( + "trt pool plugin layer in converter could not be created.")); + layer = pool_layer; } - plugin::PoolPlugin *plugin = new plugin::PoolPlugin( - ceil_mode, plugin_pool_type, adaptive, exclusive, ksize, strides, - paddings, input_shape_v, real_paddings); - auto *pool_layer = engine_->AddPlugin(&input1, 1, plugin); - PADDLE_ENFORCE_NOT_NULL( - pool_layer, - platform::errors::Fatal( - "trt pool plugin layer in converter could not be created.")); - layer = pool_layer; } else { #if IS_TRT_VERSION_GE(8000) // Exclude padding pixels from the average mean is not supported well by @@ -299,7 +331,6 @@ class Pool2dOpConverter : public OpConverter { pool_layer->setAverageCountExcludesPadding(exclusive); layer = pool_layer; } - } else { // Average pooling needs to exclude the padding pixels from the average // mean. @@ -327,5 +358,4 @@ class Pool2dOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP_ITSELF(pool2d); REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter); diff --git a/paddle/fluid/operators/batch_norm_op_xpu.cc b/paddle/fluid/operators/batch_norm_op_xpu.cc index d6826e8710e85..da138fb482e5a 100644 --- a/paddle/fluid/operators/batch_norm_op_xpu.cc +++ b/paddle/fluid/operators/batch_norm_op_xpu.cc @@ -1,5 +1,4 @@ /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -38,15 +37,25 @@ class BatchNormXPUKernel : public framework::OpKernel { bool global_stats = test_mode || use_global_stats; const auto &data_layout_str = ctx.Attr("data_layout"); const auto data_layout = framework::StringToDataLayout(data_layout_str); + PADDLE_ENFORCE_EQ(data_layout_str == "NCHW" || data_layout_str == "NHWC", + true, + platform::errors::InvalidArgument( + "The 'data_layout' attribute must be NCHW or NHWC. " + "But recevived 'data_layout' is [%s].", + data_layout_str)); + const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); - int temp = x_dims[3]; - temp = (x_dims.size() != 4) ? 1 : temp; - bool is_nchw = (data_layout == DataLayout::kNCHW); - const int N = x_dims[0]; - const int C = is_nchw ? x_dims[1] : temp; - const int H = is_nchw ? x_dims[2] : x_dims[1]; - const int W = is_nchw ? 
temp : x_dims[2]; + PADDLE_ENFORCE_EQ( + x_dims.size() >= 2 && x_dims.size() <= 5, true, + platform::errors::InvalidArgument( + "The size of input's dimensions should be between 2 and 5" + "But received: the size of input's dimensions is [%d]", + x_dims.size())); + + int N, C, H, W, D; + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + const auto *scale = ctx.Input("Scale"); const auto *bias = ctx.Input("Bias"); const auto *x_data = x->data(); @@ -67,6 +76,7 @@ class BatchNormXPUKernel : public framework::OpKernel { saved_variance->mutable_data(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); + bool is_nchw = data_layout_str == "NCHW"; if (!global_stats) { auto *mean_out_data = mean_out->data(); @@ -83,35 +93,29 @@ class BatchNormXPUKernel : public framework::OpKernel { &mom_cpu); momentum = mom_tensor->data()[0]; } - if (C == 1) { - int r = xpu::batch_norm(dev_ctx.x_context(), x_data, y_data, N, 1, H, - W, epsilon, momentum, scale_data, bias_data, - saved_mean_data, saved_variance_data, - mean_out_data, variance_out_data, true); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External( - "The batch_norm XPU API return wrong value[%d %s]", r, - XPUAPIErrorMsg[r])); - } else { - int r = xpu::batch_norm(dev_ctx.x_context(), x_data, y_data, N, C, H, - W, epsilon, momentum, scale_data, bias_data, - saved_mean_data, saved_variance_data, - mean_out_data, variance_out_data, is_nchw); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External( - "The batch_norm XPU API return wrong value[%d %s]", r, - XPUAPIErrorMsg[r])); - } + + int r = xpu::batch_norm(dev_ctx.x_context(), x_data, y_data, N, C, H, + W, epsilon, momentum, scale_data, bias_data, + saved_mean_data, saved_variance_data, + mean_out_data, variance_out_data, is_nchw); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::External( + "The batch_norm XPU API return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); } else { + PADDLE_ENFORCE_EQ( + data_layout_str == "NCHW", true, + platform::errors::InvalidArgument( + "The batch_norm_infer 'data_layout' attribute must be NCHW. " + "But recevived 'data_layout' is [%s].", + data_layout_str)); const auto *mean = ctx.Input("Mean"); const auto *variance = ctx.Input("Variance"); const auto *mean_data = mean->data(); const auto *variance_data = variance->data(); int r = xpu::batch_norm_infer(dev_ctx.x_context(), x_data, y_data, N, C, H, W, epsilon, scale_data, bias_data, - mean_data, variance_data, true); + mean_data, variance_data, is_nchw); PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, platform::errors::External( @@ -172,6 +176,13 @@ class BatchNormGradXPUKernel : public framework::OpKernel { const float epsilon = ctx.Attr("epsilon"); const auto data_layout = framework::StringToDataLayout(data_layout_str); + PADDLE_ENFORCE_EQ(data_layout_str == "NCHW" || data_layout_str == "NHWC", + true, + platform::errors::InvalidArgument( + "The 'data_layout' attribute must be NCHW or NHWC. " + "But recevived 'data_layout' is [%s].", + data_layout_str)); + auto *d_x = ctx.Output(framework::GradVarName("X")); auto *d_scale = ctx.Output(framework::GradVarName("Scale")); auto *d_bias = ctx.Output(framework::GradVarName("Bias")); @@ -204,13 +215,15 @@ class BatchNormGradXPUKernel : public framework::OpKernel { } const auto &x_dims = x->dims(); - int temp = x_dims[3]; - temp = (x_dims.size() != 4) ? 1 : temp; - bool is_nchw = (data_layout == DataLayout::kNCHW); - const int N = x_dims[0]; - const int C = is_nchw ? 
x_dims[1] : temp; - const int H = is_nchw ? x_dims[2] : x_dims[1]; - const int W = is_nchw ? temp : x_dims[2]; + PADDLE_ENFORCE_EQ( + x_dims.size() >= 2 && x_dims.size() <= 5, true, + platform::errors::InvalidArgument( + "The size of input's dimensions should be between 2 and 5" + "But received: the size of input's dimensions is [%d]", + x_dims.size())); + + int N, C, H, W, D; + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); const auto *x_data = x->data(); const auto *d_y_data = d_y->data(); @@ -235,42 +248,45 @@ class BatchNormGradXPUKernel : public framework::OpKernel { "the size of scale's dimensions is [%d], the dimensions of scale " "is [%s].", scale->dims().size(), scale->dims())); + PADDLE_ENFORCE_EQ( + scale->dims()[0], C, + platform::errors::InvalidArgument( + "The first dimension of scale must equal to Channels[%d]. But " + "received: the first dimension of scale is [%d]", + C, scale->dims()[0])); auto &dev_ctx = ctx.template device_context(); xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - const T *mean_data = nullptr; - const T *inv_var_data = nullptr; + const auto *batch_mean = ctx.Input("SavedMean"); + const auto *batch_inv_std = ctx.Input("SavedVariance"); + const auto *global_mean = ctx.Input("Mean"); + const auto *global_var = ctx.Input("Variance"); // TODO(guozibin): hadle the situation case of N * H * W = 1 - if (!use_global_stats) { - const auto *saved_mean = ctx.Input("SavedMean"); - // SavedVariance have been reverted in forward operator - const auto *saved_inv_variance = ctx.Input("SavedVariance"); - mean_data = saved_mean->data(); - inv_var_data = saved_inv_variance->data(); - } else { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - mean_data = running_mean->data(); - inv_var_data = running_variance->data(); - float *running_inv_var_data = - RAII_GUARD.alloc_l3_or_gm(running_variance->numel()); - float *epsilon_data = RAII_GUARD.alloc_l3_or_gm(1); - int r1 = calculate_inv_var(dev_ctx.x_context(), inv_var_data, epsilon, C, - epsilon_data, running_inv_var_data); - PADDLE_ENFORCE_EQ(r1, XPU_SUCCESS, platform::errors::External( - "XPU API(batch_norm_grad " - "calculate_inv_var function) " - "return wrong value[%d %s]", - r1, XPUAPIErrorMsg[r1])); - inv_var_data = running_inv_var_data; - } if (is_inplace) { + float *global_inv_std_data = nullptr; + if (use_global_stats) { + global_inv_std_data = + RAII_GUARD.alloc_l3_or_gm(global_var->numel()); + float *epsilon_data = RAII_GUARD.alloc_l3_or_gm(1); + int r1 = + calculate_inv_var(dev_ctx.x_context(), global_var->data(), + epsilon, C, epsilon_data, global_inv_std_data); + PADDLE_ENFORCE_EQ(r1, XPU_SUCCESS, platform::errors::External( + "XPU API(batch_norm_grad " + "calculate_inv_var function) " + "return wrong value[%d %s]", + r1, XPUAPIErrorMsg[r1])); + } auto px = *x; + auto *inv_std_data = + use_global_stats ? global_inv_std_data : batch_inv_std->data(); + auto mean_data = use_global_stats ? 
global_mean->data() + : batch_mean->data(); int r2 = calculate_inv_BN_Y( dev_ctx.x_context(), px.mutable_data(ctx.GetPlace()), - scale->data(), bias->data(), mean_data, inv_var_data, N, + scale->data(), bias->data(), mean_data, inv_std_data, N, C, H * W, x->data()); PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, platform::errors::External( "XPU API(batch_norm_grad " @@ -278,19 +294,29 @@ class BatchNormGradXPUKernel : public framework::OpKernel { "return wrong value[%d %s]", r2, XPUAPIErrorMsg[r2])); } - if (!d_x) { - d_x_data = RAII_GUARD.alloc_l3_or_gm(x->numel()); - } - if (!d_scale) { - d_scale_data = RAII_GUARD.alloc_l3_or_gm(C); - } - if (!d_bias_data) { - d_bias_data = RAII_GUARD.alloc_l3_or_gm(C); - } - int r3 = xpu::batch_norm_grad( - dev_ctx.x_context(), x_data, d_y_data, d_x_data, N, C, H, W, scale_data, - mean_data, inv_var_data, d_scale_data, d_bias_data, is_nchw); + int r3; + bool is_nchw = data_layout_str == "NCHW"; + if (use_global_stats) { + r3 = xpu::batch_norm_grad( + dev_ctx.x_context(), x_data, d_y_data, d_x_data, N, C, H, W, + scale_data, nullptr, nullptr, d_scale_data, d_bias_data, is_nchw, + global_mean->data(), global_var->data(), epsilon); + } else { + if (!d_x) { + d_x_data = RAII_GUARD.alloc_l3_or_gm(x->numel()); + } + if (!d_scale) { + d_scale_data = RAII_GUARD.alloc_l3_or_gm(C); + } + if (!d_bias_data) { + d_bias_data = RAII_GUARD.alloc_l3_or_gm(C); + } + r3 = xpu::batch_norm_grad( + dev_ctx.x_context(), x_data, d_y_data, d_x_data, N, C, H, W, + scale_data, batch_mean->data(), batch_inv_std->data(), + d_scale_data, d_bias_data, is_nchw); + } PADDLE_ENFORCE_EQ(r3, XPU_SUCCESS, platform::errors::External( "XPU API(batch_norm_grad) return " "wrong value[%d %s]", diff --git a/paddle/fluid/operators/bmm_op_xpu.cc b/paddle/fluid/operators/bmm_op_xpu.cc new file mode 100644 index 0000000000000..cc18558027982 --- /dev/null +++ b/paddle/fluid/operators/bmm_op_xpu.cc @@ -0,0 +1,211 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
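+// XPU kernels for bmm (batched matrix multiplication) and bmm_grad, built
+// on top of xpu::fc_batched; see MatMulXPUFunction below.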
+ +#ifdef PADDLE_WITH_XPU + +#include +#include +#include "paddle/fluid/operators/matmul_v2_op.h" + +#include "paddle/fluid/operators/xpu_api_wrapper.h" +#include "paddle/fluid/platform/device/device_wrapper.h" + +namespace paddle { +namespace operators { + +template +static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out, + bool trans_x, bool trans_y, + const paddle::framework::ExecutionContext& ctx) { + using XPUType = typename XPUTypeTrait::Type; + const auto& x_dims = x->dims(); + const auto& y_dims = y->dims(); + auto& dev_ctx = + ctx.template device_context(); + + auto mat_dim_a = phi::funcs::CreateMatrixDescriptor( + RowMatrixFromVector(x_dims), 0, trans_x); + auto mat_dim_b = phi::funcs::CreateMatrixDescriptor( + ColumnMatrixFromVector(y_dims), 0, trans_y); + + T* data_c = out->data(); + int m = mat_dim_a.height_; + int n = mat_dim_b.width_; + int k = mat_dim_a.width_; + int batch_size = mat_dim_a.batch_size_; + // batch matmul + int r = xpu::fc_batched( + dev_ctx.x_context(), // Context* ctx, + batch_size, // int batch_size, + mat_dim_a.trans_, // bool x_trans, + mat_dim_b.trans_, // bool w_trans, + m, // int m, + n, // int n, + k, // int k, + 1.0, // float alpha, + reinterpret_cast(x->data()), // const TX* x, + mat_dim_a.stride_, // int stride_a, + reinterpret_cast(y->data()), // const TW* w, + mat_dim_b.stride_, // int stride_b, + 0.0, // float beta, + reinterpret_cast(data_c), // TY* y, + m * n, // int stride_c, + nullptr, // const float* x_maxptr, + nullptr); // const float* w_maxptr + + PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_batched"); +} + +template +class BmmXPUKernel : public framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + if (x->numel() == 0 || y->numel() == 0) { + return; + } + bool trans_x = false; + bool trans_y = false; + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + + PADDLE_ENFORCE_EQ(x_dims.size(), 3, + platform::errors::InvalidArgument( + "Input(X) of BmmOp must be 3-dimensional in BmmOp, " + "but received X's shape: [%s].", + x_dims)); + PADDLE_ENFORCE_EQ(y_dims.size(), 3, + platform::errors::InvalidArgument( + "Input(Y) of BmmOp must be 3-dimensional in BmmOp, " + "but received Y's shape: [%s].", + y_dims)); + PADDLE_ENFORCE_EQ( + x_dims[0], y_dims[0], + platform::errors::InvalidArgument( + "Input(X) and Input(Y) must have the same batch size in BmmOp, " + "but received X's batch size: [%s]," + "Y's batch size [%s]", + x_dims[0], y_dims[0])); + PADDLE_ENFORCE_EQ( + x_dims[2], y_dims[1], + platform::errors::InvalidArgument( + "Input(X)'s width must be equal with Input(Y)'s height in BmmOp," + "but receive X's width: [%s]," + "Y's height: [%s].", + x_dims[2], y_dims[1])); + + if (std::is_same::value) { + MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); + } else { + if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) { + MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); + } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) { + MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); + } else { + MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); + } + } + } +}; + +template +class BmmXPUGradKernel : public framework::OpKernel { + public: + void MatMul(const framework::ExecutionContext& ctx, + const framework::Tensor& a, bool trans_a, + const framework::Tensor& b, bool trans_b, + framework::Tensor* out) const { + 
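+    // Helper shared by the dX and dY computations: runs a batched matmul
+    // with the requested transpose flags, using the same precision dispatch
+    // as the forward kernel (selected via the XPU_PADDLE_FC_INT32 /
+    // XPU_PADDLE_FC_LOCAL_INT16 environment switches).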
out->mutable_data(ctx.GetPlace()); + if (std::is_same::value) { + MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); + } else { + if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) { + MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); + } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) { + MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); + } else { + MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); + } + } + } + + void CalcInputGrad(const framework::ExecutionContext& context, + const framework::Tensor& a, bool trans_a, + const framework::Tensor& b, bool trans_b, + framework::Tensor* out) const { + if (out == nullptr) return; + MatMul(context, a, trans_a, b, trans_b, out); + } + + void Compute(const framework::ExecutionContext& context) const override { + auto x = *context.Input("X"); + auto y = *context.Input("Y"); + auto dout = + *context.Input(framework::GradVarName("Out")); + auto* dx = context.Output(framework::GradVarName("X")); + auto* dy = context.Output(framework::GradVarName("Y")); + ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, false, false); + + framework::DDim dx_dims; + if (dx) { + dx_dims = dx->dims(); + if (dx_dims != x.dims()) { + dx->Resize(x.dims()); + } + } + + framework::DDim dy_dims; + if (dy) { + dy_dims = dy->dims(); + if (dy_dims != y.dims()) { + dy->Resize(y.dims()); + } + } + + CalcInputGrad(context, dout, false, y, true, dx); + CalcInputGrad(context, x, true, dout, false, dy); + + // CalcInputGrad(context, dout, false, false, y, true, false, dx); + // CalcInputGrad(context, x, true, true, dout, false, true, dy); + + if (dx) { + if (dx_dims != x.dims()) { + dx->Resize(dx_dims); + } + } + + if (dy) { + if (dy_dims != y.dims()) { + dy->Resize(dy_dims); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_XPU_KERNEL(bmm, ops::BmmXPUKernel, + ops::BmmXPUKernel); +REGISTER_OP_XPU_KERNEL(bmm_grad, ops::BmmXPUGradKernel, + ops::BmmXPUGradKernel); + +#endif diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index 2406445e6cfa4..862a0d04fbdfe 100644 --- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -3,7 +3,7 @@ include(operators) cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context) cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy device_context parallel_executor transform_type cinn) -SET(CINN_OP_DEPS parallel_executor string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context transform_type) +SET(CINN_OP_DEPS parallel_executor string_helper variable_helper cinn cinn_compiler cinn_op_helper cinn_launch_context transform_type) register_operators(DEPS ${CINN_OP_DEPS}) if (WITH_TESTING) diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc index 358d0fc6d078e..68bc3a0eb5c53 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc @@ -30,7 +30,7 @@ USE_OP_ITSELF(elementwise_add); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); #ifdef PADDLE_WITH_CUDA -PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); #endif namespace paddle::operators { diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index 
b445527322fd6..a660d59fb4c0f 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -33,6 +33,7 @@ #include "paddle/fluid/framework/paddle2cinn/transform_type.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/cinn/cinn_op_helper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" @@ -69,13 +70,6 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, graph.Get>(framework::paddle2cinn::kOutputVars); internal_var_names_ = ExtractInternalVarNames(input_var_names, output_var_names); - // check completeness of output variables in compiled result - for (auto&& var_name : output_var_names) { - PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, - platform::errors::PreconditionNotMet( - "Variable(%s) not applied in CINN", var_name)); - } - // initialize all execution arguments InitializeArguments(); // DEPRECATED(CtfGo): following callback assignment will be deprecated soon @@ -235,7 +229,7 @@ void CinnLaunchContext::InitializeArguments() { cinn_tensor->shape().data().size()); cinn_buffer->type = cinn::runtime::ToRuntimeType(cinn_tensor->type()); VLOG(4) << string::Sprintf( - "Append an argument:name(%s),dims(%s),type(%s)", + "Append an argument:name(%s),dims(%s),type(%s)", arg, framework::DDim(cinn_buffer->dims, cinn_buffer->dimensions).to_str(), cinn_tensor->type()); name2argument_.emplace(arg, cinn_buffer.get()); @@ -400,7 +394,20 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place, std::unordered_map scope_map = { {parallel_executor_->GetLocalScopes().front(), scope}}; parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map); - parallel_executor_->PrepareVariables(scope); + // instead of using the PrepareVariables function of ParallelExecutor to + // initialize all variables, here we only initialize internal variables + // because external variables are already included in parent scope. + for (auto&& var_name : internal_var_names_) { + auto* var = scope->FindVar(var_name); + if (var != nullptr) { + VLOG(5) << "internal variable:" << var_name + << " has been initialized beforehand in global scope, skipped."; + continue; + } + framework::InitializeVariable(scope->Var(var_name), + framework::proto::VarType::LOD_TENSOR); + } + for (auto&& var_name : initialized_beforehand_vars_) { auto* var = scope->GetVar(var_name); auto* buffer = GetCinnBufferOfVar(var_name); diff --git a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc index 15ea9a6926afb..ecbfbf2f92ebf 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include #include +#include "cinn/auto_schedule/auto_tuner.h" #include "cinn/common/target.h" #include "cinn/common/type.h" #include "cinn/hlir/framework/graph_compiler.h" diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index 5263aae03ed3f..024bf2bceb3d0 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -18,7 +18,9 @@ #include #include #include + #include "cinn/common/target.h" +#include "gflags/gflags.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -26,6 +28,7 @@ #include "paddle/fluid/operators/cinn/cinn_launch_context.h" #include "paddle/fluid/operators/cinn/cinn_op_helper.h" +DECLARE_bool(enable_pe_launch_cinn); namespace paddle { namespace operators { @@ -101,34 +104,23 @@ class CinnLaunchOpKernel : public framework::OpKernel { const auto& cinn_compiled_object = CinnCompiler::GetInstance()->Compile( compilation_key, inputs_name2tensor, target, stream); details::DebugCinnCompiledResult(cinn_compiled_object); - auto* launch_context = cinn_compiled_object.launch_context.get(); - // Step 3. check the computational consistency of the subgraph - // before and after the compilation - // 3.1 Input variables: tensors of input variables have - // been initialized before graph compiled, just check the - // equiality between tensors of paddle and cinn. - for (const auto& var_name : input_x_variable_names) { - // some input variables don't need for cinn because they are - // eliminated by optimized passes or some cinn operators use - // less variables - if (!launch_context->IsVariableUsed(var_name)) { - VLOG(4) << "Input variable" << var_name << " not used by cinn"; - continue; - } - launch_context->CheckTensorEquivalent(var_name, - *inputs_name2tensor.at(var_name)); - } - // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. + // Step 3. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. details::SetCinnRuntimeFlags(); - // Step 5. use PE to execute the compiled CINN instructions - // in nodes of the runtime graph - VLOG(4) << "Execute the runtime graph by PE"; - framework::Scope& exec_scope = scope.NewScope(); - auto* pe = launch_context->InitializePE(place, &exec_scope); - pe->RunWithoutFetch(launch_context->GetSkipEagerVars()); + // Step 4. 
Execute the compiled CINN instructions by a PE or + // by the CINN compiled program in sequential order + if (FLAGS_enable_pe_launch_cinn) { + VLOG(4) << "Execute the runtime graph by PE"; + framework::Scope& exec_scope = scope.NewScope(); + auto* pe = launch_context->InitializePE(place, &exec_scope); + pe->RunWithoutFetch(launch_context->GetSkipEagerVars()); + } else { + VLOG(4) << "Execute the compiled executable program"; + launch_context->UpdateCapturedEnv(scope, place); + LaunchCinnExecution(cinn_compiled_object, *launch_context, stream); + } VLOG(4) << "CinnLaunchOp launch execution done."; } }; diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index 585f1caabed05..b0bd043f43247 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -32,56 +32,90 @@ USE_OP(cinn_launch); USE_OP(cinn_instruction_run); USE_OP_ITSELF(elementwise_add); DECLARE_double(eager_delete_tensor_gb); +DECLARE_bool(enable_pe_launch_cinn); +DECLARE_bool(enable_cinn_auto_tune); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); #ifdef PADDLE_WITH_CUDA -PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); #endif namespace paddle::operators { using framework::paddle2cinn::CinnCompiler; -TEST(CinnLaunchOpTest, TestWithElementwiseAdd) { - paddle::framework::InitDevices(); - platform::SetNumThreads(1); - // cache test graph into CinnCompiler - const std::string& test_op_out_name = "cinn_launch_op_out"; - const std::string& add_op_out_name = "add_op_out"; - auto compilation_key = CinnCompiler::GetInstance()->AddGraph( - CreateOnlyElementwiseAddGraph("x", "y", test_op_out_name)); - - // create cinn_launch_op and elementwise_add op - auto cinn_launch_op = paddle::framework::OpRegistry::CreateOp( - "cinn_launch", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}}, - {{"compilation_key", compilation_key}}); - auto elementwise_add_op = paddle::framework::OpRegistry::CreateOp( - "elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}}, - {{"Out", {add_op_out_name}}}, {{}}); - - // Run ops and check the computation results - auto run_and_check_fn = [&](const platform::Place& place) { +class TestCinnLaunchOp : public ::testing::Test { + public: + const char* test_op_out_name = "add_op_out"; + const char* add_op_out_name = "add_op_out"; + std::unique_ptr cinn_launch_op; + std::unique_ptr elementwise_add_op; + + void SetUp() override { + paddle::framework::InitDevices(); + platform::SetNumThreads(1); + // cache test graph into CinnCompiler + auto compilation_key = CinnCompiler::GetInstance()->AddGraph( + CreateOnlyElementwiseAddGraph("x", "y", test_op_out_name)); + + // create cinn_launch_op and elementwise_add op + cinn_launch_op = paddle::framework::OpRegistry::CreateOp( + "cinn_launch", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}}, + {{"compilation_key", compilation_key}}); + elementwise_add_op = paddle::framework::OpRegistry::CreateOp( + "elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}}, + {{"Out", {add_op_out_name}}}, {{}}); + } + + void RunAndCheck(const platform::Place& place) { + // Run ops and check the computation results framework::Scope scope; InitVariablesWithRandomValue({"x", "y"}, {10, 20}, place, &scope); scope.Var(test_op_out_name)->GetMutable(); scope.Var(add_op_out_name)->GetMutable(); - cinn_launch_op->Run(scope, place); elementwise_add_op->Run(scope, place); + cinn_launch_op->Run(scope, place); CompareOpResult(scope.GetVar(test_op_out_name), 
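The test refactor above moves the shared graph/op creation into a fixture so several TEST_F cases can reuse it with different flags. This is the bare googletest fixture pattern it relies on (a toy fixture, link against gtest_main to run it):

#include "gtest/gtest.h"

class ToyFixture : public ::testing::Test {
 protected:
  void SetUp() override { value_ = 42; }    // runs before every TEST_F below
  void TearDown() override { value_ = 0; }  // runs after every TEST_F below
  int value_ = 0;
};

TEST_F(ToyFixture, UsesSharedSetup) { EXPECT_EQ(value_, 42); }
TEST_F(ToyFixture, EachTestGetsFreshState) { EXPECT_EQ(value_, 42); }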
scope.GetVar(add_op_out_name)); - }; - FLAGS_eager_delete_tensor_gb = -1; + } + + void TearDown() override { CinnCompiler::GetInstance()->Clear(); } +}; +TEST_F(TestCinnLaunchOp, TestRunInstructionByPE) { // CPU - run_and_check_fn(platform::CPUPlace()); - run_and_check_fn(platform::CPUPlace()); + RunAndCheck(platform::CPUPlace()); + // the second run on the same place is to check the cache logic + RunAndCheck(platform::CPUPlace()); #ifdef PADDLE_WITH_CUDA // GPU - run_and_check_fn(platform::CUDAPlace()); - run_and_check_fn(platform::CUDAPlace()); + RunAndCheck(platform::CUDAPlace()); + RunAndCheck(platform::CUDAPlace()); #endif } +TEST_F(TestCinnLaunchOp, TestRunInstructionByCinnProgram) { + // set FLAGS_enable_pe_launch_cinn=false to switch to use + // default scheduler of CINN to execute the compiled program + FLAGS_enable_pe_launch_cinn = false; + + RunAndCheck(platform::CPUPlace()); + RunAndCheck(platform::CPUPlace()); +#ifdef PADDLE_WITH_CUDA + // GPU + RunAndCheck(platform::CUDAPlace()); + RunAndCheck(platform::CUDAPlace()); +#endif +} + +TEST_F(TestCinnLaunchOp, TestRunWithAutoTuneEnabled) { + FLAGS_enable_cinn_auto_tune = true; + + // currently only check on cpu, will add a test for gpu after CINN ready + RunAndCheck(platform::CPUPlace()); + RunAndCheck(platform::CPUPlace()); +} + namespace details { // Testing helper function used on CinnLaunchOpKernel in the following: // firstly build test data, then check both expected and illegal situations diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc index 86a6ec2c3a160..3ace825e7b80d 100644 --- a/paddle/fluid/operators/conv_op_npu.cc +++ b/paddle/fluid/operators/conv_op_npu.cc @@ -20,6 +20,29 @@ namespace operators { using Tensor = framework::Tensor; using NPUDeviceContext = platform::NPUDeviceContext; +static void CastToFP16(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& in, + Tensor* out) { + out->mutable_data(ctx.GetPlace()); + NpuOpRunner runner; + runner.SetType("Cast") + .AddInput(in) + .AddOutput(*out) + .AddAttr("dst_type", ACL_FLOAT16) + .Run(stream); +} + +static void CastToFP32(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& in, + Tensor* out) { + out->mutable_data(ctx.GetPlace()); + NpuOpRunner runner; + runner.SetType("Cast") + .AddInput(in) + .AddOutput(*out) + .AddAttr("dst_type", ACL_FLOAT) + .Run(stream); +} template class DepthwiseConvNPUKernel : public framework::OpKernel { @@ -356,18 +379,33 @@ class NPUConvGradOpKernel : public framework::OpKernel { auto stream = ctx.template device_context().stream(); if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); + filter_grad->mutable_data(ctx.GetPlace()); std::vector filter_shape_vec = phi::vectorize(filter->dims()); + Tensor filter_grad_fp32(experimental::DataType::FLOAT32); + filter_grad_fp32.Resize(filter_grad->dims()); + + if (framework::TransToProtoVarType(input->dtype()) == + framework::proto::VarType::FP16) { + CastToFP32(ctx, stream, *filter_grad, &filter_grad_fp32); + } else { + filter_grad_fp32.ShareDataWith(*filter_grad); + } + const auto& runner = NpuOpRunner( "Conv2DBackpropFilterD", {input_tensor, output_grad_tensor}, - {*filter_grad}, {{"filter_size", filter_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); + {filter_grad_fp32}, {{"filter_size", filter_shape_vec}, + {"strides", strides_vec}, + {"pads", paddings}, + {"dilations", 
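The FP16 filter-gradient change above is an instance of a generic "promote, compute, demote" workaround: run the numerically sensitive step in a wider type, then cast back to the storage type. A toy illustration, with float standing in for FP16 and double standing in for FP32 (no NPU/ACL calls):

#include <cstdio>
#include <vector>

std::vector<double> Promote(const std::vector<float>& in) {
  return std::vector<double>(in.begin(), in.end());
}

std::vector<float> Demote(const std::vector<double>& in) {
  return std::vector<float>(in.begin(), in.end());
}

int main() {
  std::vector<float> grad_lowp = {1.5f, -2.25f};
  std::vector<double> grad_fp = Promote(grad_lowp);  // 1) cast up
  for (auto& g : grad_fp) g *= 0.5;                  // 2) compute in the wider type
  grad_lowp = Demote(grad_fp);                       // 3) cast back to storage precision
  std::printf("%f %f\n", grad_lowp[0], grad_lowp[1]);
}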
dilations_vec}, + {"groups", groups}, + {"data_format", data_format}}); runner.Run(stream); + + if (framework::TransToProtoVarType(input->dtype()) == + framework::proto::VarType::FP16) { + CastToFP16(ctx, stream, filter_grad_fp32, filter_grad); + } } if (input_grad) { input_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/cumsum_op_mlu.cc b/paddle/fluid/operators/cumsum_op_mlu.cc new file mode 100644 index 0000000000000..9abe793f4dc26 --- /dev/null +++ b/paddle/fluid/operators/cumsum_op_mlu.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class CumSumMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + int axis = ctx.Attr("axis"); + bool exclusive = ctx.Attr("exclusive"); + bool reverse = ctx.Attr("reverse"); + bool flatten = ctx.Attr("flatten"); + + out->mutable_data(ctx.GetPlace()); + + Tensor* input_ptr = const_cast(x); + Tensor flat_x(x->type()); + if (flatten) { + PADDLE_ENFORCE_EQ( + axis, -1, + platform::errors::InvalidArgument( + "when flatten is true, attr axis must be default %d, but got %d", + -1, axis)); + + flat_x.ShareDataWith(*x); + flat_x.Resize(phi::make_ddim({x->numel()})); + input_ptr = &flat_x; + } + + const int true_axis = (axis < 0) ? 
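Reference semantics for the cumsum kernel above: a negative axis is normalized against the (possibly flattened) rank, and flatten treats the input as one long 1-D tensor. A plain C++ sketch of the 1-D case with the exclusive/reverse attributes:

#include <cstdio>
#include <vector>

int NormalizeAxis(int axis, int rank) { return axis < 0 ? rank + axis : axis; }

std::vector<int> Cumsum1D(const std::vector<int>& x, bool exclusive, bool reverse) {
  const int n = static_cast<int>(x.size());
  std::vector<int> out(n, 0);
  int acc = 0;
  for (int k = 0; k < n; ++k) {
    const int i = reverse ? n - 1 - k : k;
    if (exclusive) { out[i] = acc; acc += x[i]; }   // current element excluded
    else           { acc += x[i]; out[i] = acc; }   // current element included
  }
  return out;
}

int main() {
  std::printf("axis -1 of rank 3 -> %d\n", NormalizeAxis(-1, 3));  // 2
  for (int v : Cumsum1D({1, 2, 3, 4}, /*exclusive=*/false, /*reverse=*/false))
    std::printf("%d ", v);  // 1 3 6 10
  std::printf("\n");
}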
input_ptr->dims().size() + axis : axis; + MLUCnnlTensorDesc input_desc(*input_ptr); + MLUCnnlTensorDesc out_desc(*out); + + MLUCnnl::Cumsum(ctx, true_axis, exclusive, reverse, input_desc.get(), + GetBasePtr(input_ptr), out_desc.get(), GetBasePtr(out)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(cumsum, ops::CumSumMLUKernel, + ops::CumSumMLUKernel, + ops::CumSumMLUKernel); diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index a1fe8a25665ec..702ff3bfd87b0 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -22,7 +22,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/complex_kernel.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/diag_functor.h" diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index 6daf05a9d778d..fe898a6c41c2a 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -21,7 +21,9 @@ #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/complex_kernel.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_divide_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" +#include "paddle/phi/kernels/elementwise_subtract_kernel.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index c28abb916b7a7..d77d4ed036394 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -26,8 +26,8 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" // only can include the headers in paddle/phi/include dirs -#include "paddle/phi/kernels/elementwise_grad_kernel.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_add_grad_kernel.h" +#include "paddle/phi/kernels/elementwise_add_kernel.h" #endif namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc index 47a549dfcde28..98d559df233b3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_mlu.cc @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/elementwise/elementwise_mlu.h" namespace paddle { namespace operators { @@ -23,35 +22,7 @@ template class ElementwiseAddMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - int axis = ctx.Attr("axis"); - const auto& x_dims = x->dims(); - const auto& y_dims = y->dims(); - axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) - : axis); - int max_dim = std::max(x_dims.size(), y_dims.size()); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), - y_dims_array.data(), out_dims_array.data(), max_dim, - axis); - - MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), - ToCnnlDataType(x->type())); - MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), - ToCnnlDataType(y->type())); - MLUCnnlTensorDesc out_desc(*out); - MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_ADD, ToCnnlDataType(), - CNNL_NOT_PROPAGATE_NAN); - - MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), x_desc.get(), GetBasePtr(x), - y_desc.get(), GetBasePtr(y), out_desc.get(), - GetBasePtr(out), ToCnnlDataType()); + MLUOpTensorKernel(ctx, CNNL_OP_TENSOR_ADD); } }; @@ -75,22 +46,8 @@ class ElementwiseAddGradMLUKernel : public framework::OpKernel { if (dx->dims() != dout->dims()) { std::vector dst_dims_vec; std::vector reduce_axes; - auto src_dims = dx->dims(); - auto dout_dims = dout->dims(); - - int src_axis = (src_dims.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis || ax >= src_axis + src_dims.size()) || - (dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) { - reduce_axes.push_back(ax); - } else { - dst_dims_vec.push_back(dout_dims[ax]); - } - } - if (dst_dims_vec.size() == 0) { - // x is scalar - dst_dims_vec.push_back(1); - } + GetReduceAxesAndDstDims(axis, dout->dims(), dx->dims(), &reduce_axes, + &dst_dims_vec); MLUCnnlReduceDesc reduction_desc( reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType(), @@ -109,22 +66,8 @@ class ElementwiseAddGradMLUKernel : public framework::OpKernel { if (dy->dims() != dout->dims()) { std::vector dst_dims_vec; std::vector reduce_axes; - auto src_dims = dy->dims(); - auto dout_dims = dout->dims(); - - int src_axis = (src_dims.size() < dout_dims.size() ? axis : 0); - for (int ax = 0; ax < dout_dims.size(); ++ax) { - if ((ax < src_axis || ax >= src_axis + src_dims.size()) || - (dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) { - reduce_axes.push_back(ax); - } else { - dst_dims_vec.push_back(dout_dims[ax]); - } - } - if (dst_dims_vec.size() == 0) { - // y is scalar - dst_dims_vec.push_back(1); - } + GetReduceAxesAndDstDims(axis, dout->dims(), dy->dims(), &reduce_axes, + &dst_dims_vec); MLUCnnlReduceDesc reduction_desc( reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType(), diff --git a/paddle/fluid/operators/elementwise/elementwise_mlu.h b/paddle/fluid/operators/elementwise/elementwise_mlu.h new file mode 100644 index 0000000000000..156cea81c0f63 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_mlu.h @@ -0,0 +1,207 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_MLU +#include +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +inline void GetReduceAxes(const int axis, const framework::DDim& src_ddims, + const framework::DDim& target_ddims, + std::vector* axes) { + int64_t src_dim_size = src_ddims.size(); + int64_t target_dim_size = target_ddims.size(); + for (int64_t i = 0; i < src_dim_size; ++i) { + if (i < axis || i >= target_dim_size + axis) { + axes->push_back(i); + continue; + } + if (src_ddims[i] > target_ddims[i - axis]) { + axes->push_back(i); + } + } +} + +inline void GetReduceAxesAndDstDims(const int axis, + const framework::DDim& src_ddims, + const framework::DDim& target_ddims, + std::vector* reduce_axes, + std::vector* dst_dims_vec) { + int64_t src_dim_size = src_ddims.size(); + int64_t target_dim_size = target_ddims.size(); + + int src_axis = (target_dim_size < src_dim_size ? axis : 0); + for (int ax = 0; ax < src_dim_size; ++ax) { + if ((ax < src_axis || ax >= src_axis + target_dim_size) || + (src_ddims[ax] > 1 && target_ddims[ax - src_axis] == 1)) { + reduce_axes->push_back(ax); + } else { + dst_dims_vec->push_back(src_ddims[ax]); + } + } + if (dst_dims_vec->size() == 0) { + // target_var is scalar + dst_dims_vec->push_back(1); + } +} + +template +void MLUOpTensorKernel(const framework::ExecutionContext& ctx, + const cnnlOpTensorDesc_t op_tensor_op) { + PADDLE_ENFORCE_EQ( + platform::is_mlu_place(ctx.GetPlace()), true, + platform::errors::Unavailable("This kernel only runs on MLU.")); + PADDLE_ENFORCE_EQ((op_tensor_op == CNNL_OP_TENSOR_ADD) || + (op_tensor_op == CNNL_OP_TENSOR_SUB) || + (op_tensor_op == CNNL_OP_TENSOR_MUL), + true, + platform::errors::Unavailable( + "This kernel of MLU only support ADD, SUB, MUL.")); + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + int axis = ctx.Attr("axis"); + const auto& x_dims = x->dims(); + const auto& y_dims = y->dims(); + axis = + (axis < 0 ? 
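MLUOpTensorKernel above first normalizes the axis attribute and pads both operand shapes to the same rank before broadcasting. A standalone sketch of that shape alignment; AlignShape is a simplified stand-in for the real GetBroadcastDimsArrays, shown only to make the axis arithmetic concrete:

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <vector>

std::vector<int> AlignShape(const std::vector<int>& dims, int axis, int max_rank) {
  std::vector<int> out(max_rank, 1);  // pad missing dimensions with 1
  for (size_t i = 0; i < dims.size(); ++i) out[axis + i] = dims[i];
  return out;
}

int main() {
  std::vector<int> x_dims = {2, 3, 4, 5}, y_dims = {3, 4};
  int axis = -1;
  // same normalization as in the kernel: a negative axis counts from the end
  axis = axis < 0
             ? std::abs(static_cast<int>(x_dims.size()) - static_cast<int>(y_dims.size())) + axis + 1
             : axis;                                       // -> 2
  const int max_rank = std::max<int>(x_dims.size(), y_dims.size());
  for (int d : AlignShape(y_dims, axis, max_rank)) std::printf("%d ", d);  // 1 1 3 4
  std::printf("\n");
}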
(std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis); + int max_dim = std::max(x_dims.size(), y_dims.size()); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), + y_dims_array.data(), out_dims_array.data(), max_dim, + axis); + + MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnlOpTensorDesc op_tensor_desc(op_tensor_op, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + + MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), x_desc.get(), GetBasePtr(x), + y_desc.get(), GetBasePtr(y), out_desc.get(), + GetBasePtr(out), ToCnnlDataType()); +} + +// ------------------ BinaryOp ----------------- +enum BINARY_FUNCTOR { + DIV, + DIVNONAN, +}; + +template +void MLUBinary(const framework::ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t x_desc, const void* x, + const cnnlTensorDescriptor_t y_desc, const void* y, + const cnnlTensorDescriptor_t out_desc, void* out); + +template <> +inline void MLUBinary
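The MLUBinary/MLUUnary helpers introduced in this header use an enum value as a template tag plus full specializations to pick the backend call. The dispatch pattern on its own, with plain arithmetic instead of CNNL calls:

#include <cstdio>

enum BINARY_FUNCTOR { DIV, MUL };

template <BINARY_FUNCTOR F>
void Binary(float a, float b, float* out);  // primary template: declared, never defined

template <>
void Binary<DIV>(float a, float b, float* out) { *out = a / b; }

template <>
void Binary<MUL>(float a, float b, float* out) { *out = a * b; }

int main() {
  float r = 0.f;
  Binary<DIV>(6.f, 3.f, &r);   // resolved at compile time to the DIV specialization
  std::printf("%g\n", r);      // 2
}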
(const framework::ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t x_desc, const void* x, + const cnnlTensorDescriptor_t y_desc, const void* y, + const cnnlTensorDescriptor_t out_desc, void* out) { + MLUCnnl::Div(ctx, prefer, x_desc, x, y_desc, y, out_desc, out); +} + +template +void MLUBinaryOp(const framework::ExecutionContext& ctx) { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + int axis = ctx.Attr("axis"); + const auto& x_dims = x->dims(); + const auto& y_dims = y->dims(); + axis = + (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis); + int max_dim = std::max(x_dims.size(), y_dims.size()); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), + y_dims_array.data(), out_dims_array.data(), max_dim, + axis); + + MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + cnnlComputationPreference_t prefer_type = CNNL_COMPUTATION_HIGH_PRECISION; + MLUBinary(ctx, prefer_type, x_desc.get(), GetBasePtr(x), + y_desc.get(), GetBasePtr(y), out_desc.get(), + GetBasePtr(out)); +} + +// ------------------ UnaryOp ----------------- +enum UNARY_FUNCTOR { + NEG, + RECIPROCAL, +}; + +template +void MLUUnary(const framework::ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t ouput_desc, void* output); + +template <> +inline void MLUUnary(const framework::ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + MLUCnnl::Neg(ctx, input_desc, input, output_desc, output); +} + +template <> +inline void MLUUnary(const framework::ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + MLUCnnl::Reciprocal(ctx, input_desc, input, output_desc, output); +} + +template +void MLUUnaryOp(const framework::ExecutionContext& ctx) { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc x_desc(x, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + cnnlComputationPreference_t prefer_type = CNNL_COMPUTATION_HIGH_PRECISION; + MLUUnary(ctx, prefer_type, x_desc.get(), GetBasePtr(x), + out_desc.get(), GetBasePtr(out)); +} + +} // namespace operators +} // namespace paddle +#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc index a7505890f41d4..33603fd73f49c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" -#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/elementwise/elementwise_mlu.h" namespace paddle { namespace operators { @@ -21,53 +20,11 @@ namespace operators { using Tensor = framework::Tensor; using MLUDeviceContext = platform::MLUDeviceContext; -static void GetReduceAxes(const int axis, const framework::DDim& src_ddims, - const framework::DDim& target_ddims, - std::vector* axes) { - int64_t src_dim_size = src_ddims.size(); - int64_t target_dim_size = target_ddims.size(); - for (int64_t i = 0; i < src_dim_size; ++i) { - if (i < axis || i >= target_dim_size + axis) { - axes->push_back(i); - continue; - } - if (src_ddims[i] > target_ddims[i - axis]) { - axes->push_back(i); - } - } -} - template class ElementwiseMulMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - int axis = ctx.Attr("axis"); - const auto& x_dims = x->dims(); - const auto& y_dims = y->dims(); - axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) - : axis); - int max_dim = std::max(x_dims.size(), y_dims.size()); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), - y_dims_array.data(), out_dims_array.data(), max_dim, - axis); - - MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType()); - MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType()); - MLUCnnlTensorDesc out_desc(*out); - MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), - CNNL_NOT_PROPAGATE_NAN); - - MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), x_desc.get(), GetBasePtr(x), - y_desc.get(), GetBasePtr(y), out_desc.get(), - GetBasePtr(out), ToCnnlDataType()); + MLUOpTensorKernel(ctx, CNNL_OP_TENSOR_MUL); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc new file mode 100644 index 0000000000000..7c3d09effa4b1 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_mlu.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/operators/elementwise/elementwise_mlu.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ElementwiseSubMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + MLUOpTensorKernel(ctx, CNNL_OP_TENSOR_SUB); + } +}; + +template +class ElementwiseSubGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = + ctx.template device_context(); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis); + + MLUCnnlTensorDesc dout_desc(*dout); + + if (dx) { + dx->mutable_data(ctx.GetPlace()); + if (dx->dims() != dout->dims()) { + std::vector dst_dims_vec; + std::vector reduce_axes; + GetReduceAxesAndDstDims(axis, dout->dims(), dx->dims(), &reduce_axes, + &dst_dims_vec); + + MLUCnnlReduceDesc reduction_desc( + reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + MLUCnnlTensorDesc dx_desc(dst_dims_vec.size(), dst_dims_vec.data(), + ToCnnlDataType()); + MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(), + nullptr, dout_desc.get(), GetBasePtr(dout), 0, nullptr, + nullptr, dx_desc.get(), GetBasePtr(dx)); + } else { + framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx); + } + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + Tensor* tmp_dout = const_cast(dout); + if (dy->dims() != dout->dims()) { + std::vector dst_dims_vec; + std::vector reduce_axes; + GetReduceAxesAndDstDims(axis, dout->dims(), dy->dims(), &reduce_axes, + &dst_dims_vec); + + MLUCnnlReduceDesc reduction_desc( + reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + MLUCnnlTensorDesc dy_desc(dst_dims_vec.size(), dst_dims_vec.data(), + ToCnnlDataType()); + MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(), + nullptr, dout_desc.get(), GetBasePtr(dout), 0, nullptr, + nullptr, dy_desc.get(), GetBasePtr(dy)); + tmp_dout = dy; + } + + // call neg op, dy = -dout + MLUCnnlTensorDesc tmp_dout_desc(*tmp_dout); + MLUCnnlTensorDesc dy_desc(*dy); + + MLUUnary(ctx, CNNL_COMPUTATION_HIGH_PRECISION, tmp_dout_desc.get(), + GetBasePtr(tmp_dout), dy_desc.get(), GetBasePtr(dy)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(elementwise_sub, ops::ElementwiseSubMLUKernel, + ops::ElementwiseSubMLUKernel, + ops::ElementwiseSubMLUKernel); + +REGISTER_OP_MLU_KERNEL(elementwise_sub_grad, + ops::ElementwiseSubGradMLUKernel, + ops::ElementwiseSubGradMLUKernel, + ops::ElementwiseSubGradMLUKernel); diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc index ce5c6b701d958..f28aae9eed37b 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc @@ -26,6 +26,9 @@ USE_OP_ITSELF(elementwise_add); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || 
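Reference math for the subtract gradient kernel above: with out = x - y, dx is d(out) reduced to x's shape and dy is -d(out) reduced to y's shape, which is why the kernel reuses the ADD-reduce followed by a negate. Toy case where y is a scalar broadcast against a [2, 2] x:

#include <cstdio>

int main() {
  const float dout[4] = {1.f, 2.f, 3.f, 4.f};
  float dx[4];
  float dy = 0.f;
  for (int i = 0; i < 4; ++i) {
    dx[i] = dout[i];  // d(x - y)/dx = 1
    dy -= dout[i];    // d(x - y)/dy = -1, summed over all broadcast positions
  }
  std::printf("dy = %g\n", dy);  // -10
}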
defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); +#endif namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu index 582f0627b2044..bfa663d59b95c 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu +++ b/paddle/fluid/operators/fake_dequantize_op.cu @@ -17,10 +17,13 @@ limitations under the License. */ namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; +using float16 = paddle::platform::float16; REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsKernel, - ops::FakeDequantizeMaxAbsKernel); + ops::FakeDequantizeMaxAbsKernel, + ops::FakeDequantizeMaxAbsKernel); REGISTER_OP_CUDA_KERNEL( fake_channel_wise_dequantize_max_abs, ops::FakeChannelWiseDequantizeMaxAbsKernel, - ops::FakeChannelWiseDequantizeMaxAbsKernel); + ops::FakeChannelWiseDequantizeMaxAbsKernel, + ops::FakeChannelWiseDequantizeMaxAbsKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 5416ae11c2b56..42361407a0f0a 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -19,17 +19,22 @@ namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; using float16 = paddle::platform::float16; REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max, - ops::FakeQuantizeAbsMaxKernel); + ops::FakeQuantizeAbsMaxKernel, + ops::FakeQuantizeAbsMaxKernel); REGISTER_OP_CUDA_KERNEL(fake_quantize_dequantize_abs_max, ops::FakeQuantizeDequantizeAbsMaxKernel, ops::FakeQuantizeDequantizeAbsMaxKernel); -REGISTER_OP_CUDA_KERNEL(fake_channel_wise_quantize_abs_max, - ops::FakeChannelWiseQuantizeAbsMaxKernel); +REGISTER_OP_CUDA_KERNEL( + fake_channel_wise_quantize_abs_max, + ops::FakeChannelWiseQuantizeAbsMaxKernel, + ops::FakeChannelWiseQuantizeAbsMaxKernel); REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max, - ops::FakeQuantizeRangeAbsMaxKernel); + ops::FakeQuantizeRangeAbsMaxKernel, + ops::FakeQuantizeRangeAbsMaxKernel); REGISTER_OP_CUDA_KERNEL( fake_quantize_moving_average_abs_max, - ops::FakeQuantizeMovingAverageAbsMaxKernel); + ops::FakeQuantizeMovingAverageAbsMaxKernel, + ops::FakeQuantizeMovingAverageAbsMaxKernel); REGISTER_OP_CUDA_KERNEL(moving_average_abs_max_scale, ops::MovingAverageAbsMaxScaleKernel, ops::MovingAverageAbsMaxScaleKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index ae448b7ff2c8b..6c068d25d07a8 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -24,6 +24,16 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +template +struct QuantizeDataType { + using type = T; +}; + +template <> +struct QuantizeDataType { + using type = float; +}; + template __global__ void FindAbsMaxKernel(const T* in, const int n, T* out) { int bid = threadIdx.x + blockIdx.x * blockDim.x; @@ -87,10 +97,12 @@ __global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n, int tid = threadIdx.x; int channel_size = n / c; const T* in_c = in + blockIdx.x * channel_size; - extern __shared__ T shared_max_data[]; + extern __shared__ char* shared_max_data_tmp[]; + auto shared_max_data = reinterpret_cast(shared_max_data_tmp); T local_max_data = T(0); for (int i = tid; i < channel_size; i += blockDim.x) { - T tmp = fabs(in_c[i]); + T tmp = static_cast( + fabs(static_cast::type>(in_c[i]))); if (tmp > local_max_data) { local_max_data = tmp; } @@ -112,7 +124,8 @@ template __global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n, const int cin, const int cout, T* out) { - extern __shared__ T shared_max_data[]; + extern __shared__ char* shared_max_data_tmp[]; + auto shared_max_data = reinterpret_cast(shared_max_data_tmp); int cout_wh_size = n / cin; int wh_size = n / (cin * cout); @@ -121,7 +134,8 @@ __global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n, const T* in_current = in + tid * cout_wh_size + bid * wh_size; T local_max_data = T(0); for (int i = 0; i < wh_size; i++) { - T tmp = fabs(in_current[i]); + T tmp = static_cast( + fabs(static_cast::type>(in_current[i]))); if (tmp > local_max_data) { local_max_data = tmp; } @@ -205,12 +219,14 @@ __global__ void ClipAndQuantKernel(const T* in, const T* scale, T s = scale[0]; T inv_s = inverse(s); + T bin_cnt_t = static_cast(bin_cnt); for (int i = bid; i < n; i += blockDim.x * gridDim.x) { T x = in[i]; T v = x > s ? s : x; v = v < -s ? -s : v; - v = bin_cnt * inv_s * v; - out[i] = round(v); + v = bin_cnt_t * inv_s * v; + out[i] = static_cast( + round(static_cast::type>(v))); } } @@ -230,7 +246,8 @@ __global__ void ClipAndQuantDequantKernel(const T* in, const T* scale, x = x > s ? s : x; x = x < -s ? -s : x; x = bin_cnt_t * inv_s * x; - x = static_cast(round(static_cast(x))); + x = static_cast( + round(static_cast::type>(x))); out[i] = (x * s) / bin_cnt_t; } } @@ -287,13 +304,15 @@ __global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale, T s = scale[blockIdx.x]; T inv_s = inverse(s); + T bin_cnt_t = static_cast(bin_cnt); for (int64_t i = tid; i < channel_size; i += blockDim.x) { T x = in_c[i]; T v = x > s ? s : x; v = v < -s ? -s : v; - v = bin_cnt * inv_s * v; - out_c[i] = round(v); + v = bin_cnt_t * inv_s * v; + out_c[i] = static_cast( + round(static_cast::type>(v))); } } @@ -303,14 +322,16 @@ __global__ void ChannelClipAndQuantKernelQuantAxisN( const T* in, const T* scale, const int bin_cnt, const int64_t n, const int nScale, const int quant_stride, T* out) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; + T bin_cnt_t = static_cast(bin_cnt); for (int64_t i = idx; i < n; i += blockDim.x * gridDim.x) { T s = scale[(i / quant_stride) % nScale]; T inv_s = inverse(s); T x = in[i]; T v = x > s ? s : x; v = v < -s ? -s : v; - v = bin_cnt * inv_s * v; - out[i] = round(v); + v = bin_cnt_t * inv_s * v; + out[i] = static_cast( + round(static_cast::type>(v))); } } @@ -376,7 +397,8 @@ __global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale, scale_arr[idx] = cur; T max = last_scale[0]; out_scale[0] = max < cur ? 
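The QuantizeDataType trait added above exists so that fabs/round are evaluated in float even when the tensor type is FP16. A standalone sketch of the same trait idea; half_t and its fake conversion are placeholders, not platform::float16:

#include <cmath>
#include <cstdio>

struct half_t {                                  // placeholder, not real FP16
  unsigned short bits = 0;
  operator float() const { return bits / 256.f; }  // fake conversion for the demo
};

template <typename T>
struct QuantizeDataType { using type = T; };     // default: compute in T itself

template <>
struct QuantizeDataType<half_t> { using type = float; };  // promote the half type

template <typename T>
float AbsAsComputeType(T v) {
  using CT = typename QuantizeDataType<T>::type;
  return std::fabs(static_cast<CT>(v));          // math runs in the compute type
}

int main() {
  half_t h; h.bits = 512;                                                // ~2.0 here
  std::printf("%g %g\n", AbsAsComputeType(h), AbsAsComputeType(-3.5f));  // 2 3.5
}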
cur : max; - if (fabs(removed - max) < 1e-6) { + if (fabs(static_cast::type>(removed - max)) < + 1e-6) { need_find_max[0] = 1; out_size[0] = it > window_size ? window_size : it; } else { diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index e5ebdad1e4434..61b80219a26b4 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -33,6 +33,7 @@ USE_OP_ITSELF(elementwise_add); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, KPS, ALL_LAYOUT); #endif // get paddle matmul op results as baseline diff --git a/paddle/fluid/operators/fill_any_like_op_mlu.cc b/paddle/fluid/operators/fill_any_like_op_mlu.cc new file mode 100644 index 0000000000000..24f5c3c29970c --- /dev/null +++ b/paddle/fluid/operators/fill_any_like_op_mlu.cc @@ -0,0 +1,72 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class FillAnyLikeMLUKernel : public framework::OpKernel { + public: + using CommonType = typename std::common_type< + float, + typename std::conditional::value, + float, T>::type>::type; + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + float value = ctx.Attr("value"); + + auto common_type_value = static_cast(value); + + PADDLE_ENFORCE_EQ( + (common_type_value >= + static_cast(std::numeric_limits::lowest())) && + (common_type_value <= + static_cast(std::numeric_limits::max())), + true, + platform::errors::InvalidArgument( + "The filled value is out of range for target type, " + "current kernel type is %s, the range should between %f " + "and %f, but now value is %f.", + typeid(T).name(), + static_cast(std::numeric_limits::lowest()), + static_cast(std::numeric_limits::max()), value)); + + PADDLE_ENFORCE_EQ( + std::isnan(value), false, + platform::errors::InvalidArgument("The filled value is NaN.")); + + auto value_t = static_cast(value); + MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value_t, out_desc.get(), + GetBasePtr(out)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(fill_any_like, ops::FillAnyLikeMLUKernel, + ops::FillAnyLikeMLUKernel, + ops::FillAnyLikeMLUKernel, + ops::FillAnyLikeMLUKernel); diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 54e4cbdc16249..6eb5881112f89 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -16,6 +16,8 @@ limitations under the License. 
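The fill_any_like kernel above validates the fill value against the target dtype's representable range before casting it. The same check in isolation, using std::common_type and numeric_limits exactly as the kernel's idea suggests (the FP16 special-casing is dropped here for brevity):

#include <cstdio>
#include <limits>
#include <type_traits>

template <typename T>
bool FitsIn(float value) {
  using Common = typename std::common_type<float, T>::type;
  const auto v = static_cast<Common>(value);
  return v >= static_cast<Common>(std::numeric_limits<T>::lowest()) &&
         v <= static_cast<Common>(std::numeric_limits<T>::max());
}

int main() {
  std::printf("%d\n", FitsIn<signed char>(300.f));  // 0: out of int8 range
  std::printf("%d\n", FitsIn<int>(300.f));          // 1
  std::printf("%d\n", FitsIn<float>(1e30f));        // 1
}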
*/ #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/transpose_op.cu.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/functors.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" namespace paddle { @@ -117,6 +119,18 @@ class FMHARef { v_ptr = k_ptr + k_size; } + { + // NOTE(wangxi): We scale Q with 1/sqrt(Dh) before QK^T, because for + // float16 calculation, INF may appear in QK^T if we do not scale before. + float alpha = 1.0 / sqrt(head_dim_); + auto q_tensor = transpose_2_out_tensor->Slice(0, 1); + auto functor = phi::funcs::ScaleFunctor(alpha); + std::vector ins = {&q_tensor}; + std::vector outs = {&q_tensor}; + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx_, ins, + &outs, functor); + } + // q*k^t, batched_gemm CBLAS_TRANSPOSE transA = CblasNoTrans; CBLAS_TRANSPOSE transB = CblasTrans; @@ -125,7 +139,7 @@ class FMHARef { int gemm_m = seq_len_; int gemm_n = out_seq_len; int gemm_k = head_dim_; - T alpha = static_cast(1.0 / sqrt(head_dim_)); + T alpha = static_cast(1.0); T beta = static_cast(0.0); int64_t stride_a = gemm_m * gemm_k; int64_t stride_b = gemm_k * gemm_n; @@ -300,7 +314,9 @@ class FMHARef { } T* qk_out_grad_data = qk_out_grad_tensor->data(); - alpha = static_cast(1.0 / sqrt(head_dim_)); + // NOTE(wangxi): For we scale Q with 1/sqrt(Dh) in forward, so we set + // alpha = 1.0 in backward. + alpha = static_cast(1.0); // recall batchedgemm(nt) fw: q_ptr * (k_ptr)^t = qk_out // bw: dy (seq_len * head_dim) = (dout)^t * x transA = CblasTrans; @@ -314,6 +330,7 @@ class FMHARef { qk_out_grad_data, q_ptr, beta, k_grad_ptr, gemm_batch_size, stride_a, stride_b); // dx (seq_len * head_dim) = dout * y + alpha = static_cast(1.0 / sqrt(head_dim_)); transA = CblasNoTrans; transB = CblasNoTrans; gemm_m = seq_len_; diff --git a/paddle/fluid/operators/fused_softmax_mask_op.cu b/paddle/fluid/operators/fused_softmax_mask_op.cu index 2ba5c027a4d76..c4ab4de8a64cb 100644 --- a/paddle/fluid/operators/fused_softmax_mask_op.cu +++ b/paddle/fluid/operators/fused_softmax_mask_op.cu @@ -13,6 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. */ // this file is inspired by: // https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/fused_kernels/scaled_masked_softmax.h +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifdef PADDLE_WITH_CUDA #include diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu index 3bebbee1fb7cc..d4c5b8877056f 100644 --- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu @@ -12,6 +12,20 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ // this file is inspired by: // https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h +/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #ifdef PADDLE_WITH_CUDA #include diff --git a/paddle/fluid/operators/gather_op_mlu.cc b/paddle/fluid/operators/gather_op_mlu.cc new file mode 100644 index 0000000000000..220d045952643 --- /dev/null +++ b/paddle/fluid/operators/gather_op_mlu.cc @@ -0,0 +1,75 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class GatherOpMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto axis = ctx.Attr("axis"); + + auto *out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc index_desc(*index); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnl::GatherFunctor(ctx, axis, 0 /*batch_dims*/, x_desc.get(), + GetBasePtr(x), index_desc.get(), GetBasePtr(index), + out_desc.get(), GetBasePtr(out)); + } +}; + +template +class GatherGradOpMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *index = ctx.Input("Index"); + auto *dout = ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc dx_desc(*dx); + auto value = static_cast(0); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value, dx_desc.get(), + GetBasePtr(dx)); + + MLUCnnlTensorDesc index_desc(*index); + MLUCnnlTensorDesc dout_desc(*dout); + const cnnlScatterRefMode_t mode = CNNL_SCATTERREF_UPDATE; + MLUCnnl::ScatterFunctor(ctx, dx_desc.get(), GetBasePtr(dx), dout_desc.get(), + GetBasePtr(dout), index_desc.get(), + GetBasePtr(index), mode); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_MLU_KERNEL(gather, ops::GatherOpMLUKernel, + ops::GatherOpMLUKernel, + ops::GatherOpMLUKernel); + +REGISTER_OP_MLU_KERNEL(gather_grad, ops::GatherGradOpMLUKernel, + ops::GatherGradOpMLUKernel, + ops::GatherGradOpMLUKernel); diff --git 
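Reference semantics for the MLU gather kernels above: the forward pass picks rows by index; the backward pass zero-fills dx and scatters the rows of d(out) back to those indices. Plain C++ sketch over "rows" of length one; whether duplicate indices overwrite or accumulate depends on the scatter mode of the backend, so this toy simply overwrites:

#include <cstdio>
#include <vector>

std::vector<float> Gather(const std::vector<float>& x, const std::vector<int>& idx) {
  std::vector<float> out;
  for (int i : idx) out.push_back(x[i]);
  return out;
}

std::vector<float> GatherGrad(const std::vector<float>& dout,
                              const std::vector<int>& idx, size_t x_size) {
  std::vector<float> dx(x_size, 0.f);                             // Fill(dx, 0)
  for (size_t k = 0; k < idx.size(); ++k) dx[idx[k]] = dout[k];   // scatter d(out) back
  return dx;
}

int main() {
  std::vector<float> x = {10, 20, 30, 40};
  std::vector<int> idx = {3, 1};
  auto out = Gather(x, idx);                         // {40, 20}
  auto dx = GatherGrad({1.f, 2.f}, idx, x.size());   // {0, 2, 0, 1}
  std::printf("%g %g | %g %g %g %g\n", out[0], out[1], dx[0], dx[1], dx[2], dx[3]);
}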
a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index c0a4b88fc76fd..5b5ddddaafb24 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -130,7 +130,9 @@ __forceinline__ __device__ U BlockReduceSum(U val, U *shared) { ##__VA_ARGS__) static __device__ __forceinline__ float real_sqrt(float x) { return sqrtf(x); } -static __device__ __forceinline__ double real_sqrt(double x) { return sqrt(x); } +static __device__ __forceinline__ double real_sqrt(double x) { + return ::sqrt(x); +} template struct PairForLayerNorm { @@ -162,7 +164,7 @@ __inline__ __device__ float rsqrt_(const float val) { template <> __inline__ __device__ double rsqrt_(const double val) { - return rsqrt(val); + return ::rsqrt(val); } #if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) diff --git a/paddle/fluid/operators/logspace_op.cc b/paddle/fluid/operators/logspace_op.cc new file mode 100644 index 0000000000000..1d1653b053679 --- /dev/null +++ b/paddle/fluid/operators/logspace_op.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" + +namespace paddle { +namespace operators { + +class LogspaceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::proto::VarType::Type(ctx.Attr("dtype")), + ctx.GetPlace()); + } +}; + +class LogspaceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Start", + "Exponent of first entry in the sequence. It is a tensor of " + "shape [1], should be of type int32, int64, float32 or float64."); + AddInput("Stop", + "Exponent of last entry in the sequence. It is a tensor of " + "shape [1], should be of type int32, int64, float32 or float64."); + AddInput("Num", + "Number of entry in the sequence. It is a tensor of shape [1], " + "should be of type int32."); + AddInput("Base", + "Base of the logarithm function. It is a tensor of shape [1], " + "should be of type int32, int64, float32 or float64."); + AddAttr("dtype", "The output data type."); + AddOutput("Out", "A sequence of numbers."); + AddComment(R"DOC( + Return fixed number of logarithmical-evenly spaced values within a given + interval. First entry is exponential of Start with base Base, and last + entry is exponential of Stop with base Base. In the case when Num is 1, + only exponential of Start with base Base is returned. If dtype is int32 + or int64, the decimal part of values will be truncated. 
+ Like logspace function of numpy. + )DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(logspace, LogspaceInferShapeFunctor, + PD_INFER_META(phi::LogspaceInferMeta)); +REGISTER_OPERATOR( + logspace, ops::LogspaceOp, ops::LogspaceOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + LogspaceInferShapeFunctor); diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index d6170b7000d63..8ef3d60c0dc0c 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -18,7 +18,8 @@ limitations under the License. */ #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/set_value_op.h" #include "paddle/fluid/operators/svd_helper.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_add_kernel.h" +#include "paddle/phi/kernels/elementwise_subtract_kernel.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/tril_triu_compute.h" #include "paddle/phi/kernels/triangular_solve_kernel.h" diff --git a/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc index 363166518b586..e0775e88f0c80 100644 --- a/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc @@ -47,9 +47,9 @@ class CastMKLDNNKernel : public framework::OpKernel { dev_ctx.GetEngine()); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - x->format(), platform::to_void_cast(x->data())); - auto reorder_dst_memory_p = - reorder_handler.AcquireDstMemory(out, x->format(), dev_ctx.GetPlace()); + x->mem_desc(), platform::to_void_cast(x->data())); + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + out, x->mem_desc(), dev_ctx.GetPlace()); auto reorder_p = reorder_handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); @@ -58,7 +58,7 @@ class CastMKLDNNKernel : public framework::OpKernel { astream.wait(); out->set_layout(framework::DataLayout::kMKLDNN); - out->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); + out->set_mem_desc(reorder_dst_memory_p->get_desc()); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc index fc37022272270..7a81e90e455d3 100644 --- a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc @@ -45,20 +45,19 @@ class ExpandMKLDNNKernel : public paddle::framework::OpKernel { out_new_dims[i] = out_new_dims[i] > 0 ? 
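Reference behaviour of the logspace operator described by the OpMaker above: the exponents run evenly from Start to Stop (a single point when Num is 1) and each output is Base raised to that exponent. A short numeric sketch of that rule, independent of the operator machinery:

#include <cmath>
#include <cstdio>
#include <vector>

std::vector<double> Logspace(double start, double stop, int num, double base) {
  std::vector<double> out;
  if (num <= 0) return out;
  const double step = (num > 1) ? (stop - start) / (num - 1) : 0.0;
  for (int i = 0; i < num; ++i) out.push_back(std::pow(base, start + i * step));
  return out;
}

int main() {
  for (double v : Logspace(0.0, 3.0, 4, 2.0)) std::printf("%g ", v);  // 1 2 4 8
  std::printf("\n");
}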
out_new_dims[i] : x_vec_dims[i]; } - dnnl::memory::format_tag x_format_tag = x->format(); + dnnl::memory::desc x_mem_desc = x->mem_desc(); if (x_vec_dims.size() != out_new_dims.size()) { - x_format_tag = - GetExtendedFormatTag(x_vec_dims, out_new_dims.size(), x_format_tag); + x_mem_desc = GetExtendedMemoryDescriptor(x_mem_desc, x_vec_dims, + out_new_dims.size()); } out->Resize(phi::make_ddim(out_new_dims)); - out->set_format(x_format_tag); paddle::platform::BroadcastDataMKLDNNHandler handler( dnnl::algorithm::binary_add, onednn_engine, ctx.GetPlace(), out, x, - 0.0f, 1.0f, x_vec_dims); + 0.0f, 1.0f, x_mem_desc); auto src_memory_p = handler.AcquireSrcMemory(x); - auto dst_memory_p = handler.AcquireDstMemory(out); + auto dst_memory_p = handler.AcquireDstMemory(out); // acquires zeroed mem auto binary_p = handler.AcquireForwardPrimitive(); const std::unordered_map args = { @@ -70,22 +69,18 @@ class ExpandMKLDNNKernel : public paddle::framework::OpKernel { binary_p->execute(astream, args); astream.wait(); - out->set_layout(paddle::framework::DataLayout::kMKLDNN); - out->set_format(paddle::platform::GetMKLDNNFormat(*dst_memory_p)); + out->set_mem_desc(dst_memory_p->get_desc()); } private: - dnnl::memory::format_tag GetExtendedFormatTag( - std::vector& dims, int new_size, // NOLINT - dnnl::memory::format_tag format_tag) const { - dnnl::memory::desc md(dims, paddle::platform::MKLDNNGetDataType(), - format_tag); + dnnl::memory::desc GetExtendedMemoryDescriptor( + const dnnl::memory::desc& x_mem_desc, + const std::vector& x_vec_dims, int new_size) const { std::vector new_dims(new_size, 1); - std::copy(dims.begin(), dims.end(), - new_dims.begin() + new_size - dims.size()); + std::copy(x_vec_dims.begin(), x_vec_dims.end(), + new_dims.begin() + new_size - x_vec_dims.size()); - dims = std::move(new_dims); - return paddle::platform::GetMKLDNNFormat(md.reshape(dims)); + return x_mem_desc.reshape(new_dims); } }; diff --git a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc index b647264d93f64..2a8627b803a6e 100644 --- a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc @@ -15,26 +15,6 @@ limitations under the License. 
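GetExtendedMemoryDescriptor above reshapes the source memory descriptor so its rank matches the target by prepending 1s; the std::copy trick it uses looks like this in isolation:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<int64_t> ExtendToRank(const std::vector<int64_t>& dims, int new_size) {
  std::vector<int64_t> new_dims(new_size, 1);                      // leading dims become 1
  std::copy(dims.begin(), dims.end(),
            new_dims.begin() + new_size - dims.size());            // right-align the old dims
  return new_dims;
}

int main() {
  for (auto d : ExtendToRank({3, 4}, 4))
    std::printf("%lld ", static_cast<long long>(d));               // 1 1 3 4
  std::printf("\n");
}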
*/ #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/mkldnn_reuse.h" -static dnnl::memory::format_tag get_plain_format_tag( - const paddle::framework::Tensor* tensor) { - auto tensor_dims_size = tensor->dims().size(); - - switch (tensor_dims_size) { - case 1: - return dnnl::memory::format_tag::a; - case 2: - return dnnl::memory::format_tag::ab; - case 3: - return dnnl::memory::format_tag::abc; - case 4: - return dnnl::memory::format_tag::abcd; - case 5: - return dnnl::memory::format_tag::abcde; - } - - return dnnl::memory::format_tag::abcdef; -} - namespace paddle { namespace operators { @@ -105,11 +85,12 @@ class SliceMKLDNNKernel : public framework::OpKernel { onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - x->format(), platform::to_void_cast(x->data())); + x->mem_desc(), platform::to_void_cast(x->data())); auto slice_mem_p = reorder_handler.AcquireSubmemory(slice_dims, offsets, reorder_src_memory_p); auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - out, slice_dims, get_plain_format_tag(x), ctx.GetPlace()); + out, slice_dims, platform::GetPlainMKLDNNFormat(x_vec_dims.size()), + ctx.GetPlace()); auto reorder_p = reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p); @@ -133,9 +114,7 @@ class SliceMKLDNNKernel : public framework::OpKernel { astream.wait(); out->Resize(phi::make_ddim(new_out_dims)); - out->set_layout(framework::DataLayout::kMKLDNN); - out->set_format(platform::GetMKLDNNFormat( - reorder_dst_memory_p->get_desc().reshape(new_out_dims))); + out->set_mem_desc(reorder_dst_memory_p->get_desc().reshape(new_out_dims)); } }; template diff --git a/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc index 29872ad397020..d4dbff537b960 100644 --- a/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc @@ -101,7 +101,7 @@ class SplitMKLDNNKernel : public framework::OpKernel { x_vec_dims, framework::TransToProtoVarType(x->dtype()), x_type, onednn_engine); auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - x->format(), platform::to_void_cast(x->data())); + x->mem_desc(), platform::to_void_cast(x->data())); for (size_t i = 0; i < outs_number; ++i) { auto out_vec_dims = phi::vectorize(outs[i]->dims()); @@ -117,8 +117,7 @@ class SplitMKLDNNKernel : public framework::OpKernel { offset[axis] += num > 0 ? 
x->dims()[axis] / num : sections[i]; - outs[i]->set_layout(framework::DataLayout::kMKLDNN); - outs[i]->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); + outs[i]->set_mem_desc(reorder_dst_memory_p->get_desc()); } astream.wait(); } diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index ecde4db3f334e..793aa2644b548 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -934,9 +934,8 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { beta_ptr = static_cast(&beta_int); } - PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetOpTensorWorkspaceSize_v2( - handle, op_tensor_desc, alpha1_ptr, a_desc, a, alpha2_ptr, b_desc, b, - beta_ptr, output_desc, output, &workspace_size)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetOpTensorWorkspaceSize( + handle, a_desc, b_desc, output_desc, &workspace_size)); auto& dev_ctx = GetDevCtxFromCTX(ctx); Tensor workspace = ctx.AllocateTmpTensor( diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 00ad618329c99..9948c45e24692 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -45,6 +45,22 @@ enum MLULogicMethod { CNNL_LOGIC_OP_OR = 7, }; +const std::map MLUReduceOpMap = { + {"reduce_all", CNNL_REDUCE_AND}, {"reduce_any", CNNL_REDUCE_OR}, + {"reduce_max", CNNL_REDUCE_MAX}, {"reduce_mean", CNNL_REDUCE_AVG}, + {"reduce_min", CNNL_REDUCE_MIN}, {"reduce_sum", CNNL_REDUCE_ADD}, + {"reduce_prod", CNNL_REDUCE_MUL}, +}; + +inline cnnlReduceOp_t GetMLUCnnlReduceOp(const std::string reduce_name) { + auto iter = MLUReduceOpMap.find(reduce_name); + if (iter != MLUReduceOpMap.end()) { + return iter->second; + } + PADDLE_THROW(platform::errors::InvalidArgument( + "Not support reduce op type of MLU Device: %s", reduce_name)); +} + inline const void* GetBasePtr(const Tensor* t) { return t->data(); } inline void* GetBasePtr(Tensor* t) { return t->data(); } diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc index b84a2bc579d3e..54ead6d3df7f0 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc @@ -118,11 +118,11 @@ class MLUMergedMomentumOpKernel : public framework::OpKernel { GetBasePtr(&mu_tensor)); for (size_t idx = 0; idx < n; ++idx) { - RegularizationType regularization_flag = + phi::RegularizationType regularization_flag = regularization_methods.size() > 0 && regularization_methods[idx] == "l2_decay" - ? RegularizationType::kL2DECAY - : RegularizationType::kNONE; + ? 
phi::RegularizationType::kL2DECAY + : phi::RegularizationType::kNONE; T regularization_coeff = static_cast(0.0); if (regularization_coeffs.size() != 0) { regularization_coeff = static_cast(regularization_coeffs[idx]); @@ -135,7 +135,7 @@ class MLUMergedMomentumOpKernel : public framework::OpKernel { auto grad = grads[idx]; Tensor regularized_grad; MLUCnnlTensorDesc param_desc(*param_out); - if (regularization_flag == RegularizationType::kL2DECAY) { + if (regularization_flag == phi::RegularizationType::kL2DECAY) { regularized_grad = ctx.AllocateTmpTensor( param_out->dims(), dev_ctx); MLUCnnlOpTensorDesc op_tensor_desc( diff --git a/paddle/fluid/operators/optimizers/momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/momentum_op_mlu.cc index 71af14fd91c8c..b8fa81b2e7123 100644 --- a/paddle/fluid/operators/optimizers/momentum_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/momentum_op_mlu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/optimizers/momentum_op.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/phi/kernels/impl/momentum_kernel_impl.h" namespace paddle { namespace operators { @@ -27,10 +28,10 @@ class MLUMomentumOpKernel : public framework::OpKernel { std::string regularization_method = ctx.Attr("regularization_method"); auto regularization_coeff = ctx.Attr("regularization_coeff"); - RegularizationType regularization_flag{ - RegularizationType::kNONE}; // disable regularization + phi::RegularizationType regularization_flag{ + phi::RegularizationType::kNONE}; // disable regularization if (regularization_method == "l2_decay") { - regularization_flag = RegularizationType::kL2DECAY; + regularization_flag = phi::RegularizationType::kL2DECAY; } T mu = static_cast(ctx.Attr("mu")); @@ -57,7 +58,7 @@ class MLUMomentumOpKernel : public framework::OpKernel { Tensor regularized_grad; MLUCnnlTensorDesc param_desc(*param); - if (regularization_flag == RegularizationType::kL2DECAY) { + if (regularization_flag == phi::RegularizationType::kL2DECAY) { regularized_grad = ctx.AllocateTmpTensor(param->dims(), dev_ctx); MLUCnnlOpTensorDesc op_tensor_desc( diff --git a/paddle/fluid/operators/pool_op_mlu.cc b/paddle/fluid/operators/pool_op_mlu.cc index fa88d128a9a1d..c1bcf82c33256 100644 --- a/paddle/fluid/operators/pool_op_mlu.cc +++ b/paddle/fluid/operators/pool_op_mlu.cc @@ -116,11 +116,16 @@ class MLUPoolOpKernel : public framework::OpKernel { framework::Tensor extra_device_tensor = ctx.AllocateTmpTensor( {static_cast(extra_input_size)}, dev_ctx); - // TODO(fwg): use Async copy, and add a callback to stream that free - // host - // memory. - framework::TensorCopySync(extra_host_tensor, ctx.GetPlace(), - &extra_device_tensor); + framework::TensorCopy(extra_host_tensor, ctx.GetPlace(), + &extra_device_tensor); + // Increase extra_host_tensor holder_ reference count until copy + // complete. 
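
The pool_op_mlu.cc hunk above switches from TensorCopySync to an asynchronous TensorCopy and keeps the temporary host buffer alive by capturing the tensor by value inside a stream callback. A minimal standalone sketch of that capture-to-extend-lifetime pattern follows; FakeStream, HostTensor, and AsyncCopyToDevice are hypothetical stand-ins, not Paddle or CNRT APIs.

// All names below are hypothetical stand-ins used only to show the
// lifetime-extension pattern from the hunk above.
#include <functional>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>

struct FakeStream {
  std::vector<std::function<void()>> callbacks;
  void AddCallback(std::function<void()> cb) {
    callbacks.push_back(std::move(cb));
  }
  void Synchronize() {  // pretend queued device work finished, then run callbacks
    for (auto& cb : callbacks) cb();
    callbacks.clear();
  }
};

struct HostTensor {
  std::shared_ptr<std::vector<float>> holder;  // refcounted buffer, like Tensor::holder_
};

void AsyncCopyToDevice(const HostTensor& src, FakeStream* stream) {
  // Only enqueues the copy; the source buffer must stay alive until it runs.
  (void)src;
  (void)stream;
}

int main() {
  FakeStream stream;
  {
    HostTensor extra_host_tensor{std::make_shared<std::vector<float>>(1024, 1.0f)};
    AsyncCopyToDevice(extra_host_tensor, &stream);
    // Capture by value: the callback owns a copy of the shared_ptr, so the
    // buffer survives the end of this scope until the callback has run.
    stream.AddCallback([extra_host_tensor]() {
      std::cout << "copy finished, buffer elements: "
                << extra_host_tensor.holder->size() << "\n";
    });
  }  // extra_host_tensor goes out of scope here; the capture keeps the buffer alive
  stream.Synchronize();
  return 0;
}

Because the lambda copies the shared holder, the reference count only drops to zero after the callback runs, which is what AddStreamCallback achieves for extra_host_tensor in the kernel above.
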
+ auto increase_ref_count = [extra_host_tensor]() { + VLOG(4) << "Finished copying extra_host_tensor[" + << GetBasePtr(&extra_host_tensor) + << "] in mlu pooling kernel."; + }; + dev_ctx.AddStreamCallback(increase_ref_count); MLUCnnl::PoolingForward( ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/, in_x_desc.get(), GetBasePtr(in_x), nullptr /*beta*/, diff --git a/paddle/fluid/operators/pull_box_sparse_op.cc b/paddle/fluid/operators/pull_box_sparse_op.cc index 90e4fc9da0d61..22b43910e6967 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.cc +++ b/paddle/fluid/operators/pull_box_sparse_op.cc @@ -132,5 +132,7 @@ REGISTER_OPERATOR(pull_box_sparse, ops::PullBoxSparseOp, ops::PushBoxSparseOpMaker, ops::PushBoxSparseOpMaker); REGISTER_OPERATOR(push_box_sparse, ops::PushBoxSparseOp); -REGISTER_OP_CPU_KERNEL(pull_box_sparse, ops::PullBoxSparseCPUKernel) -REGISTER_OP_CPU_KERNEL(push_box_sparse, ops::PushBoxSparseCPUKernel) +REGISTER_OP_CPU_KERNEL(pull_box_sparse, ops::PullBoxSparseCPUKernel); +REGISTER_OP_CPU_KERNEL(push_box_sparse, ops::PushBoxSparseCPUKernel); +REGISTER_OP_XPU_KERNEL(pull_box_sparse, ops::PullBoxSparseXPUKernel); +REGISTER_OP_XPU_KERNEL(push_box_sparse, ops::PushBoxSparseXPUKernel); diff --git a/paddle/fluid/operators/pull_box_sparse_op.cu b/paddle/fluid/operators/pull_box_sparse_op.cu index 96a1b1c08b79c..e3407dd3b2e8b 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.cu +++ b/paddle/fluid/operators/pull_box_sparse_op.cu @@ -11,7 +11,6 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/operators/pull_box_sparse_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" @@ -38,7 +37,7 @@ class PushBoxSparseCUDAKernel : public framework::OpKernel { }; } // namespace operators } // namespace paddle - namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseCUDAKernel) -REGISTER_OP_CUDA_KERNEL(push_box_sparse, ops::PushBoxSparseCUDAKernel) + +REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseCUDAKernel); +REGISTER_OP_CUDA_KERNEL(push_box_sparse, ops::PushBoxSparseCUDAKernel); diff --git a/paddle/fluid/operators/pull_box_sparse_op.h b/paddle/fluid/operators/pull_box_sparse_op.h index 77021b8961db5..2bde9725abdca 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.h +++ b/paddle/fluid/operators/pull_box_sparse_op.h @@ -114,5 +114,21 @@ class PushBoxSparseCPUKernel : public framework::OpKernel { PushBoxSparseFunctor(ctx); } }; + +template +class PullBoxSparseXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PullBoxSparseFunctor(ctx); + } +}; + +template +class PushBoxSparseXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PushBoxSparseFunctor(ctx); + } +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h index eeb320e3f1a39..0c174b0825c9f 100644 --- a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h +++ b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h @@ -77,10 +77,10 @@ class ReduceMKLDNNKernel : public framework::OpKernel { input_type, onednn_engine); auto reorder_src_memory_p = 
reorder_handler.AcquireSrcMemory( - input->format(), platform::to_void_cast(input->data())); + input->mem_desc(), platform::to_void_cast(input->data())); auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - output, input->format(), ctx.GetPlace()); + output, input->mem_desc(), ctx.GetPlace()); auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, reorder_dst_memory_p); @@ -88,10 +88,8 @@ class ReduceMKLDNNKernel : public framework::OpKernel { reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); - output->set_layout(framework::DataLayout::kMKLDNN); - output->set_format( - platform::GetMKLDNNFormat(reorder_dst_memory_p->get_desc().reshape( - phi::vectorize(output->dims())))); + output->set_mem_desc(reorder_dst_memory_p->get_desc().reshape( + phi::vectorize(output->dims()))); } else { platform::ReductionMKLDNNHandler handler(reduction_type, 0.0f, 0.0f, onednn_engine, ctx.GetPlace(), @@ -107,10 +105,8 @@ class ReduceMKLDNNKernel : public framework::OpKernel { reduction_p->execute(astream, reduction_args); astream.wait(); - output->set_layout(framework::DataLayout::kMKLDNN); - output->set_format( - platform::GetMKLDNNFormat(dst_memory_p->get_desc().reshape( - phi::vectorize(output->dims())))); + output->set_mem_desc(dst_memory_p->get_desc().reshape( + phi::vectorize(output->dims()))); } } }; @@ -128,37 +124,25 @@ class ReduceGradMKLDNNKernel : public framework::OpKernel { bool keep_dim = ctx.Attr("keep_dim"); bool reduce_all = ctx.Attr("reduce_all"); auto dims = ctx.Attr>("dim"); - auto* input_dy = ctx.Input(framework::GradVarName("Out")); - auto* output_dx = ctx.Output(framework::GradVarName("X")); + const auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); - dnnl::memory::format_tag x_format_tag; - auto input_dims = - CalculateReducedDims(output_dx, input_dy, dims, reduce_all, keep_dim); - auto output_dims = phi::vectorize(output_dx->dims()); + const auto input_dims = + CalculateReducedDims(dx, dout, dims, reduce_all, keep_dim); + const auto output_dims = phi::vectorize(dx->dims()); - if (input_dims != output_dims) { - auto input_dy_md = dnnl::memory::desc(phi::vectorize(input_dy->dims()), - platform::MKLDNNGetDataType(), - input_dy->format()); - auto input_dy_ex_md = input_dy_md.reshape(input_dims); - // TODO(jczaja): once MD is stored in Tensor we no longer need to guess - // formats - x_format_tag = platform::GetMKLDNNFormat(input_dy_ex_md); + auto dout_mem_desc = dout->mem_desc(); - } else { - // There was no broadcasting then just simple copy is done - // same format used for input and output - x_format_tag = getPlainFormatTag(output_dx); + if (input_dims != output_dims) { + dout_mem_desc = dout_mem_desc.reshape(input_dims); } - output_dx->set_format(x_format_tag); - platform::BroadcastDataMKLDNNHandler handler( - binary_type, onednn_engine, ctx.GetPlace(), output_dx, input_dy, - scale_x, scale_y, input_dims); + binary_type, onednn_engine, ctx.GetPlace(), dx, dout, scale_x, scale_y, + dout_mem_desc); - const auto src_memory_p = handler.AcquireSrcMemory(input_dy); - const auto dst_memory_p = handler.AcquireDstMemory(output_dx); + const auto src_memory_p = handler.AcquireSrcMemory(dout); + const auto dst_memory_p = handler.AcquireDstMemory(dx); const auto binary_prim = handler.AcquireForwardPrimitive(); const std::unordered_map args = { @@ -170,29 +154,7 @@ class ReduceGradMKLDNNKernel : public framework::OpKernel { binary_prim->execute(astream, args); astream.wait(); - 
output_dx->set_layout(framework::DataLayout::kMKLDNN); - } - - protected: - dnnl::memory::format_tag getPlainFormatTag(const Tensor* tensor) const { - auto tensor_dims_size = tensor->dims().size(); - PADDLE_ENFORCE_EQ( - tensor_dims_size <= 5 && tensor_dims_size >= 1, true, - platform::errors::InvalidArgument( - "Dims for reduction_grad oneDNN op must be in range <1, 5>")); - - switch (tensor_dims_size) { - case 1: - return dnnl::memory::format_tag::a; - case 2: - return dnnl::memory::format_tag::ab; - case 3: - return dnnl::memory::format_tag::abc; - case 4: - return dnnl::memory::format_tag::abcd; - } - - return dnnl::memory::format_tag::abcde; + dx->set_mem_desc(dst_memory_p->get_desc()); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc index 89e578dbdb6b7..6e5fd59c45645 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc @@ -12,9 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" -#include "paddle/fluid/operators/mlu/mlu_baseop.h" -#include "paddle/fluid/platform/device/mlu/device_context.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h" namespace paddle { namespace operators { @@ -23,42 +21,7 @@ template class ReduceMeanMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - output->mutable_data(context.GetPlace()); - - bool reduce_all = context.Attr("reduce_all"); - auto dims = context.Attr>("dim"); - auto input_dims = phi::vectorize(input->dims()); - const auto& input_dim_size = input->dims().size(); - std::vector reduce_dims; - if (reduce_all) { - for (size_t i = 0; i < input_dims.size(); i++) { - reduce_dims.push_back(static_cast(i)); - } - } else { - for (size_t i = 0; i < dims.size(); ++i) { - if (dims[i] < 0) { - reduce_dims.push_back(dims[i] + input_dim_size); - } else { - reduce_dims.push_back(dims[i]); - } - } - } - - MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY, - ToCnnlDataType(input->dtype())); - MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, - ToCnnlDataType(output->dtype())); - - MLUCnnlReduceDesc reduction_desc( - reduce_dims, CNNL_REDUCE_AVG, ToCnnlDataType(), - CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); - - MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(), - nullptr, input_desc.get(), GetBasePtr(input), - 0 /*indices_size*/, nullptr, nullptr, output_desc.get(), - GetBasePtr(output)); + MLUReduceOp(context, "reduce_mean"); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h b/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h new file mode 100644 index 0000000000000..95dda354cae7d --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_op_mlu.h @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_MLU +#include +#include +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" + +namespace paddle { +namespace operators { + +template +void MLUReduceOp(const framework::ExecutionContext& context, + std::string reduce_name) { + PADDLE_ENFORCE_EQ( + platform::is_mlu_place(context.GetPlace()), true, + platform::errors::Unavailable("This kernel only runs on MLU.")); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + + bool reduce_all = context.Attr("reduce_all"); + auto dims = context.Attr>("dim"); + auto input_dims = phi::vectorize(input->dims()); + const auto& input_dim_size = input->dims().size(); + std::vector reduce_dims; + if (reduce_all) { + for (size_t i = 0; i < input_dims.size(); i++) { + reduce_dims.push_back(static_cast(i)); + } + } else { + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) { + reduce_dims.push_back(dims[i] + input_dim_size); + } else { + reduce_dims.push_back(dims[i]); + } + } + } + + MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input->dtype())); + MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output->dtype())); + + cnnlReduceOp_t reduce_op = GetMLUCnnlReduceOp(reduce_name); + MLUCnnlReduceDesc reduction_desc(reduce_dims, reduce_op, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, + CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(), + nullptr, input_desc.get(), GetBasePtr(input), + 0 /*indices_size*/, nullptr, nullptr, output_desc.get(), + GetBasePtr(output)); +} + +} // namespace operators +} // namespace paddle +#endif diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc new file mode 100644 index 0000000000000..fab8bb23b16ac --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_mlu.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
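
The new reduce_op_mlu.h above folds the shared setup (negative-axis normalization, reduce_all expansion, CNNL tensor and reduction descriptors) into MLUReduceOp, dispatched by name through GetMLUCnnlReduceOp. As a hedged illustration of that extension point, and not part of this patch, a further MLU reduce kernel such as reduce_max could be sketched roughly as follows.

// Hypothetical follow-up sketch (not in this patch): with reduce_op_mlu.h,
// another MLU reduce kernel only supplies its name to the shared helper.
#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h"

namespace paddle {
namespace operators {

template <typename T>
class ReduceMaxMLUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    // GetMLUCnnlReduceOp("reduce_max") resolves to CNNL_REDUCE_MAX.
    MLUReduceOp<T>(context, "reduce_max");
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_MLU_KERNEL(reduce_max, ops::ReduceMaxMLUKernel<float>,
                       ops::ReduceMaxMLUKernel<plat::float16>);

The kernels actually registered remain those defined by the patch; the sketch only shows why the helper takes the reduce op name as a string.
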
+ +#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h" + +namespace paddle { +namespace operators { + +template +class ReduceSumMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + MLUReduceOp(context, "reduce_sum"); + } +}; + +template +class ReduceSumGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out_grad = context.Input(framework::GradVarName("Out")); + auto* in_grad = context.Output(framework::GradVarName("X")); + in_grad->mutable_data(context.GetPlace()); + + bool reduce_all = context.Attr("reduce_all"); + auto reduce_dims = context.Attr>("dim"); + auto in_dims = phi::vectorize(in->dims()); + + if (reduce_all) { + reduce_dims.clear(); + for (size_t d = 0; d < in_dims.size(); ++d) { + reduce_dims.push_back(static_cast(d)); + } + } + for (auto& d : reduce_dims) { + if (d < 0) { + d = d + in_dims.size(); + } + } + + Tensor tmp_out(out_grad->dtype()); + auto tmp_output_dims = in_dims; + for (auto d : reduce_dims) { + tmp_output_dims[d] = 1; + } + tmp_out.ShareDataWith(*out_grad); + tmp_out.Resize(phi::make_ddim(tmp_output_dims)); + + MLUCnnlTensorDesc out_desc(tmp_out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc in_grad_desc(*in_grad, CNNL_LAYOUT_ARRAY, + ToCnnlDataType()); + + MLUCnnl::BroadcastTo(context, out_desc.get(), GetBasePtr(&tmp_out), + in_grad_desc.get(), GetBasePtr(in_grad)); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(reduce_sum, ops::ReduceSumMLUKernel, + ops::ReduceSumMLUKernel); +REGISTER_OP_MLU_KERNEL(reduce_sum_grad, ops::ReduceSumGradMLUKernel, + ops::ReduceSumGradMLUKernel); diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index 5b8922505cc08..dc20952903ab2 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -84,10 +84,8 @@ class SplitOp : public framework::OperatorWithKernel { // reorders, because if blocked dimension is not divisible by 8 or // 16(depending on which blocking format is used) submemory cannot be // created, so in that scenario a fallback is needed - auto tmp_md = dnnl::memory::desc( - phi::vectorize(ctx.Input("X")->dims()), - dnnl::memory::data_type::f32, ctx.Input("X")->format()); - if (tmp_md.data.format_desc.blocking.inner_nblks == 0) + const auto x_md = ctx.Input("X")->mem_desc(); + if (x_md.data.format_desc.blocking.inner_nblks == 0) return framework::OpKernelType(input_data_type, ctx.GetPlace(), framework::DataLayout::kMKLDNN, framework::LibraryType::kMKLDNN); diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc index dff5c2d3f3937..04e4d88b008e0 100644 --- a/paddle/fluid/operators/top_k_v2_op_npu.cc +++ b/paddle/fluid/operators/top_k_v2_op_npu.cc @@ -89,7 +89,9 @@ class TopkV2NPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL(top_k_v2, ops::TopkV2NPUKernel, + ops::TopkV2NPUKernel, ops::TopkV2NPUKernel, ops::TopkV2NPUKernel, ops::TopkV2NPUKernel); diff --git a/paddle/fluid/operators/tril_triu_op_xpu.cc b/paddle/fluid/operators/tril_triu_op_xpu.cc index a44ea8ff689b8..70200fe733a5a 100644 --- a/paddle/fluid/operators/tril_triu_op_xpu.cc +++ b/paddle/fluid/operators/tril_triu_op_xpu.cc @@ -43,6 
+43,34 @@ class TrilTriuXPUKernel : public framework::OpKernel { } }; +template +class TrilTriuGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const auto* d_out = + context.Input(framework::GradVarName("Out")); + const auto* dout_data = d_out->data(); + auto* d_x = context.Output(framework::GradVarName("X")); + auto* dx_data = d_x->mutable_data(context.GetPlace()); + + const int diagonal = context.Attr("diagonal"); + const bool lower = context.Attr("lower"); + + auto dy_shape = phi::vectorize(d_out->dims()); + auto& dev_ctx = context.template device_context(); + int r = 0; + if (lower) { + r = xpu::tril(dev_ctx.x_context(), dout_data, dx_data, dy_shape, + diagonal); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "tril_op"); + } else { + r = xpu::triu(dev_ctx.x_context(), dout_data, dx_data, dy_shape, + diagonal); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "triu_op"); + } + } +}; + } // namespace operators } // namespace paddle @@ -50,4 +78,8 @@ namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( tril_triu, ops::TrilTriuXPUKernel, ops::TrilTriuXPUKernel); +REGISTER_OP_XPU_KERNEL( + tril_triu_grad, + ops::TrilTriuGradXPUKernel, + ops::TrilTriuGradXPUKernel); #endif diff --git a/paddle/fluid/operators/unsqueeze_op_mlu.cc b/paddle/fluid/operators/unsqueeze_op_mlu.cc new file mode 100644 index 0000000000000..9f3704fbb7a30 --- /dev/null +++ b/paddle/fluid/operators/unsqueeze_op_mlu.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_MLU +#include +#include + +#include "paddle/fluid/operators/unsqueeze_op.h" +#include "paddle/fluid/platform/device/mlu/device_context.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL( + unsqueeze, ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel); +REGISTER_OP_MLU_KERNEL( + unsqueeze2, ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel); +REGISTER_OP_MLU_KERNEL( + unsqueeze_grad, ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel); +REGISTER_OP_MLU_KERNEL( + unsqueeze2_grad, ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel); +#endif diff --git a/paddle/fluid/platform/device/mlu/mlu_stream.h b/paddle/fluid/platform/device/mlu/mlu_stream.h index 3f4b27e370f2e..b20949f3bfe85 100644 --- a/paddle/fluid/platform/device/mlu/mlu_stream.h +++ b/paddle/fluid/platform/device/mlu/mlu_stream.h @@ -40,7 +40,6 @@ class MLUStream final { template void AddCallback(Callback&& callback) const { - // TODO(mlu): mlu not support AddCallback callback_manager_->AddCallback(callback); } diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 6f4826bd8c39a..357644b62d3ed 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -43,6 +43,8 @@ XPUOpMap& get_kl2_ops() { {"batch_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bmm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bmm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"bce_loss_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"bce_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, @@ -380,6 +382,9 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::FP16, XPUPlace())})}, {"tril_triu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, + {"tril_triu_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, {"tile", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index 9afde00a98be8..99a1eb97de50a 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -97,6 +97,14 @@ XPUOpMap& get_kp_ops() { XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})}, {"equal", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})}, {"not_equal", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})}, + // reduce op + {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_min", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_prod", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_all", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace())})}, + {"reduce_any", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace())})}, }; return s_xpu_kp_kernels; diff --git a/paddle/fluid/platform/fast_divmod.h b/paddle/fluid/platform/fast_divmod.h index 39eefab774dbe..bef551078b332 100644 --- a/paddle/fluid/platform/fast_divmod.h +++ b/paddle/fluid/platform/fast_divmod.h @@ -18,6 +18,9 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/aligned_vector.h" #define INT_BITS 32 +#if defined(__xpu__) +#define __forceinline__ __inline__ +#endif namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index a43eaa41cfe83..f89452853b49b 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -751,6 +751,32 @@ PADDLE_DEFINE_EXPORTED_string(allow_cinn_ops, "", */ PADDLE_DEFINE_EXPORTED_string(deny_cinn_ops, "", "It controls the cinn op subset to be not used."); + +/* + * CINN related FLAG + * Name: FLAGS_enable_pe_launch_cinn + * Since Version: 2.3 + * Value Range: bool, default=true + * Example: FLAGS_enable_pe_launch_cinn=true would execute the CINN compiled + * instructions of a paddle graph with ParallelExecutor, otherwise with the + * CINN compiled runtime program in sequential order. + */ +PADDLE_DEFINE_EXPORTED_bool(enable_pe_launch_cinn, true, + "It controls whether to execute cinn compiled " + "program with ParallelExecutor"); + +/* + * CINN related FLAG + * Name: FLAGS_enable_cinn_auto_tune + * Since Version: 2.3 + * Value Range: bool, default=false + * Example: FLAGS_enable_cinn_auto_tune=true would use CINN with its + * auto-tune feature enabled + */ +PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune, false, + "It controls whether to use cinn with " + "its auto-tune feature enabled"); + #endif DEFINE_int32(record_pool_max_size, 2000000, diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 1254331835bbd..12fa933701ef4 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -625,20 +625,12 @@ class BinaryMKLDNNHandler platform::errors::InvalidArgument( "Wrong layout set for X tensor. Expected: %d (kMKLDNN), Actual: %d", DataLayout::kMKLDNN, x->layout())); - PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Wrong format set for X tensor : %d (undef)", - static_cast(x->format()))); PADDLE_ENFORCE_EQ( y->layout(), DataLayout::kMKLDNN, platform::errors::InvalidArgument( "Wrong layout set for Y tensor. Expected: %d (kMKLDNN), Actual: %d", DataLayout::kMKLDNN, y->layout())); - PADDLE_ENFORCE_NE(y->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Wrong format set for Y tensor : %d (undef)", - static_cast(y->format()))); const auto src_x_tz = phi::vectorize(x->dims()); const auto src_y_tz = phi::vectorize(y->dims()); @@ -648,10 +640,8 @@ class BinaryMKLDNNHandler const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? 
src_x_tz : src_y_tz) : phi::vectorize(z->dims()); - auto src0_md = dnnl::memory::desc( - src_x_tz, platform::MKLDNNGetDataType(), x->format()); - auto src1_md = dnnl::memory::desc( - src_y_tz, platform::MKLDNNGetDataType(), y->format()); + auto src0_md = x->mem_desc(); + auto src1_md = y->mem_desc(); if (rankdiff > 0) { // Second input is of smaller rank than first std::vector dims1_ex(rankdiff, 1); dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)), @@ -730,21 +720,19 @@ class BroadcastDataMKLDNNHandler const dnnl::engine engine, platform::Place cpu_place, const Tensor* out, const Tensor* x, float scale_x, float scale_y, - const std::vector& input_dims) + const dnnl::memory::desc& x_mem_desc) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { PADDLE_ENFORCE_EQ( x->layout(), DataLayout::kMKLDNN, platform::errors::InvalidArgument("Wrong layout set for X tensor.")); - PADDLE_ENFORCE_NE( - x->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument("Wrong format set for X tensor.")); const auto src0_tz = phi::vectorize(out->dims()); - const auto src0_md = dnnl::memory::desc( - src0_tz, platform::MKLDNNGetDataType(), out->format()); - const auto src1_md = dnnl::memory::desc( - input_dims, platform::MKLDNNGetDataType(), out->format()); + const auto src0_md = + dnnl::memory::desc(src0_tz, platform::MKLDNNGetDataType(), + platform::GetPlainMKLDNNFormat(src0_tz.size())); + + const auto src1_md = x_mem_desc; dnnl::primitive_attr attributes; attributes.set_scales(DNNL_ARG_SRC_0, 0, {scale_x}); @@ -777,21 +765,16 @@ class ReductionMKLDNNHandler PADDLE_ENFORCE_EQ( x->layout(), DataLayout::kMKLDNN, platform::errors::InvalidArgument("Wrong layout set for X tensor.")); - PADDLE_ENFORCE_NE( - x->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument("Wrong format set for X tensor.")); - - const auto x_tz = phi::vectorize(x->dims()); - const auto x_md = - dnnl::memory::desc(x_tz, platform::MKLDNNGetDataType(), x->format()); - const auto y_md = - memory::desc(y_tz, platform::MKLDNNGetDataType(), x->format()); + const auto y_md = memory::desc(y_tz, platform::MKLDNNGetDataType(), + dnnl::memory::format_tag::any); if (attr) - this->AcquireForwardPrimitiveDescriptor(attr, algo, x_md, y_md, p, eps); + this->AcquireForwardPrimitiveDescriptor(attr, algo, x->mem_desc(), y_md, + p, eps); else - this->AcquireForwardPrimitiveDescriptor(algo, x_md, y_md, p, eps); + this->AcquireForwardPrimitiveDescriptor(algo, x->mem_desc(), y_md, p, + eps); } }; @@ -911,7 +894,7 @@ class ActivationMKLDNNHandler ActivationMKLDNNHandler(dnnl::algorithm algorithm, const framework::ExecutionContext& ctx, const dnnl::engine engine, Place cpu_place, - const framework::Tensor* in_x) + const framework::Tensor* x) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { @@ -946,25 +929,15 @@ class ActivationMKLDNNHandler } } - PADDLE_ENFORCE(in_x->dims().size() >= 1 || in_x->dims().size() <= 6, - platform::errors::Unimplemented( - "Input dimension size can be 1, 2, 3, 4, " - "5, or 6, but now the dimension size is", - in_x->dims().size())); - - auto src_tz = phi::vectorize(in_x->dims()); - auto src_fmt = src_tz.size() == 2 ? 
MKLDNNMemoryFormat::nc : in_x->format(); - auto md = - dnnl::memory::desc(src_tz, platform::MKLDNNGetDataType(), src_fmt); - this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, - algorithm, md, alpha, beta); + algorithm, x->mem_desc(), alpha, + beta); } ActivationMKLDNNHandler(dnnl::algorithm algorithm, const framework::ExecutionContext& ctx, const dnnl::engine engine, Place cpu_place, - const framework::Tensor* in_x, const Tensor* out_grad) + const framework::Tensor* x, const Tensor* dout) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { @@ -985,23 +958,11 @@ class ActivationMKLDNNHandler : ctx.Attr("max"); } - auto diff_dst_tz = phi::vectorize(out_grad->dims()); - - auto src_fmt = - diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); - auto diff_fmt = - diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : out_grad->format(); - - auto dims = phi::vectorize(in_x->dims()); - auto diff_dst_md = platform::MKLDNNMemDesc( - dims, platform::MKLDNNGetDataType(), diff_fmt); - auto src_md = platform::MKLDNNMemDesc( - dims, platform::MKLDNNGetDataType(), src_fmt); - this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, - algorithm, src_md, alpha, beta); - this->AcquireBackwardPrimitiveDescriptor(algorithm, diff_dst_md, src_md, - alpha, beta); + algorithm, x->mem_desc(), alpha, + beta); + this->AcquireBackwardPrimitiveDescriptor(algorithm, dout->mem_desc(), + x->mem_desc(), alpha, beta); } std::shared_ptr AcquireBackwardSrcMemory( @@ -1036,6 +997,11 @@ class ReorderMKLDNNHandler { dtype_dst_(dtype_dst), engine_(engine) {} + std::shared_ptr AcquireSrcMemory(const dnnl::memory::desc& md, + void* ptr) { + return std::make_shared(md, engine_, ptr); + } + std::shared_ptr AcquireSrcMemory(const MKLDNNMemoryFormat& fmt, void* ptr) { auto md = dnnl::memory::desc(dims_, dtype_, fmt); @@ -1060,6 +1026,22 @@ class ReorderMKLDNNHandler { return std::make_shared(dst_md, engine_, dst_data); } + std::shared_ptr AcquireDstMemory( + framework::Tensor* output, const dnnl::memory::desc& src_md, + platform::Place place) { + if (vtype_dst_ == vtype_) { + auto dst_data = output->mutable_data( + place, framework::TransToPhiDataType(vtype_dst_), src_md.get_size()); + return std::make_shared(src_md, engine_, dst_data); + } else { + auto dst_md = src_md; + dst_md.data.data_type = static_cast(dtype_dst_); + auto dst_data = output->mutable_data( + place, framework::TransToPhiDataType(vtype_dst_), dst_md.get_size()); + return std::make_shared(dst_md, engine_, dst_data); + } + } + std::shared_ptr AcquireDstMemory( framework::Tensor* output, const std::vector& dims, const MKLDNNMemoryFormat& fmt, platform::Place place) { diff --git a/paddle/fluid/platform/mkldnn_utils.h b/paddle/fluid/platform/mkldnn_utils.h new file mode 100644 index 0000000000000..12c48ed412428 --- /dev/null +++ b/paddle/fluid/platform/mkldnn_utils.h @@ -0,0 +1,166 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +// NOTE: +// GetMKLDNNFormat function is here temporarily. It is +// needed because without them forward declaration was causing an error when +// building with "-DWITH_TESTING=ON". This file will be deleted after completing +// md-related refactoring + +namespace paddle { +namespace platform { + +inline dnnl::memory::format_tag GetMKLDNNFormat(dnnl::memory::desc mem_desc) { + auto ndims = mem_desc.data.ndims; + auto strides = mem_desc.data.format_desc.blocking.strides; + auto inner_nblks = mem_desc.data.format_desc.blocking.inner_nblks; + auto inner_blks = mem_desc.data.format_desc.blocking.inner_blks; + auto inner_idxs = mem_desc.data.format_desc.blocking.inner_idxs; + + if (ndims == 1) { + return dnnl::memory::format_tag::x; + } else if (ndims == 2) { + if (inner_nblks == 0) { + if (strides[0] >= strides[1]) { + return dnnl::memory::format_tag::nc; + } else { + return dnnl::memory::format_tag::cn; + } + } + } else if (ndims == 3) { + if (inner_nblks == 0) { + if (strides[0] >= strides[1] && strides[1] >= strides[2]) { + return dnnl::memory::format_tag::ncw; + } else if (strides[1] >= strides[0] && strides[0] >= strides[2]) { + return dnnl::memory::format_tag::ntc; + } else { + return dnnl::memory::format_tag::nwc; + } + } + } else if (ndims == 4) { + if (inner_nblks == 0) { + if (strides[0] >= strides[1] && strides[1] >= strides[2] && + strides[2] >= strides[3]) { + return dnnl::memory::format_tag::nchw; + } else if (strides[2] >= strides[3] && strides[3] >= strides[1] && + strides[1] >= strides[0]) { + return dnnl::memory::format_tag::cdba; + } else if (strides[3] >= strides[2] && strides[2] >= strides[0] && + strides[0] >= strides[1]) { + return dnnl::memory::format_tag::dcab; + } else { + return dnnl::memory::format_tag::nhwc; + } + } else if (inner_nblks == 1) { + if (inner_blks[0] == 16 && inner_idxs[0] == 1) { + return dnnl::memory::format_tag::nChw16c; + } else if (inner_blks[0] == 8 && inner_idxs[0] == 1) { + return dnnl::memory::format_tag::nChw8c; + } else if (inner_blks[0] == 8 && inner_idxs[0] == 0) { + if (strides[0] >= strides[2] && strides[2] >= strides[3] && + strides[3] >= strides[1]) { + return dnnl::memory::format_tag::Acdb8a; + } + } else if (inner_blks[0] == 4 && inner_idxs[0] == 1) { + return dnnl::memory::format_tag::nChw4c; + } else if (inner_blks[0] == 16 && inner_idxs[0] == 0) { + if (strides[0] >= strides[2] && strides[2] >= strides[3] && + strides[3] >= strides[1]) { + return dnnl::memory::format_tag::Acdb16a; + } + } + } else if (inner_nblks == 2) { + if (inner_blks[0] == 16 && inner_blks[1] == 16) { + if (inner_idxs[0] == 1 && inner_idxs[1] == 0) { + return dnnl::memory::format_tag::OIhw16i16o; + } + } else if (inner_blks[0] == 8 && inner_blks[1] == 8) { + if (inner_idxs[0] == 1 && inner_idxs[1] == 0) { + return dnnl::memory::format_tag::OIhw8i8o; + } + } + } + } else if (ndims == 5) { + if (inner_nblks == 0) { + if (strides[0] >= strides[1] && strides[1] >= strides[2] && + strides[2] >= strides[3] && strides[3] >= strides[4]) { + return dnnl::memory::format_tag::abcde; + } else if (strides[0] >= strides[2] && strides[2] >= strides[1] && + strides[1] >= strides[3] && strides[3] >= strides[4]) { + return dnnl::memory::format_tag::acbde; + } else if (strides[0] >= strides[2] && strides[2] >= strides[3] && + strides[3] >= strides[4] && strides[4] >= strides[1]) { + return dnnl::memory::format_tag::acdeb; + } + } else if (inner_nblks == 1) { + if (inner_blks[0] == 8 && inner_idxs[0] == 0) { + if (strides[0] >= strides[2] && strides[2] >= 
strides[3] && + strides[3] >= strides[4] && strides[4] >= strides[1]) { + return dnnl::memory::format_tag::Acdeb8a; + } + if (strides[0] >= strides[1] && strides[1] >= strides[2] && + strides[2] >= strides[3] && strides[3] >= strides[4]) { + return dnnl::memory::format_tag::Abcde8a; + } + } else if (inner_blks[0] == 8 && inner_idxs[0] == 1) { + if (strides[0] >= strides[1] && strides[1] >= strides[2] && + strides[2] >= strides[3] && strides[3] >= strides[4]) { + return dnnl::memory::format_tag::aBcde8b; + } + } else if (inner_blks[0] == 16 && inner_idxs[0] == 0) { + if (strides[0] >= strides[2] && strides[2] >= strides[3] && + strides[3] >= strides[4] && strides[4] >= strides[1]) { + return dnnl::memory::format_tag::Acdeb16a; + } + if (strides[0] >= strides[1] && strides[1] >= strides[2] && + strides[2] >= strides[3] && strides[3] >= strides[4]) { + return dnnl::memory::format_tag::Abcde16a; + } + } else if (inner_blks[0] == 16 && inner_idxs[0] == 1) { + if (strides[0] >= strides[1] && strides[1] >= strides[2] && + strides[2] >= strides[3] && strides[3] >= strides[4]) { + return dnnl::memory::format_tag::aBcde16b; + } + } + } + } else if (ndims == 6) { + if (inner_nblks == 0) { + if (strides[0] >= strides[1] && strides[1] >= strides[2] && + strides[2] >= strides[3] && strides[3] >= strides[4] && + strides[4] >= strides[5]) { + return dnnl::memory::format_tag::abcdef; + } else if (strides[0] >= strides[2] && strides[2] >= strides[1] && + strides[1] >= strides[3] && strides[3] >= strides[4] && + strides[4] >= strides[5]) { + return dnnl::memory::format_tag::acbdef; + } + } + } + // DEBUG CODE - KEEP UNTILL TENSOR.MEMORY_DESC IMPLEMENTED + // std::cout<<"@@@@@@@@@@ UNDEFINED FORMAT @@@@@@@@@@@@@@@@@@@"< #endif +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/enforce.h" +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -135,6 +139,13 @@ void SynchronizeAllDevice() { PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); } #endif +#ifdef PADDLE_WITH_MLU + int count = GetMLUDeviceCount(); + for (int i = 0; i < count; i++) { + SetMLUDeviceId(i); + PADDLE_ENFORCE_MLU_SUCCESS(cnrtSyncDevice()); + } +#endif } static double ToMegaBytes(size_t bytes) { diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 7148afee273fd..6fa326d57bc67 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -80,10 +80,8 @@ void StreamCallbackManager::AddCallback( #endif #if PADDLE_WITH_MLU - VLOG(3) << "MLULaunchCallback at stream: " << stream_ - << " Failed to call MLULaunchCallback, " - << "because mlu not support StreamAddCallback yet. 
" - << "function: " << func; + VLOG(3) << "MLULaunchCallback at stream: " << stream_; + cnrtInvokeHostFunc(stream_, StreamCallbackFunc, func); #endif } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b0ebe5026b5d4..9c509bbd2c455 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -181,10 +181,9 @@ if(WITH_PYTHON) target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS}) add_executable(eager_op_function_generator eager_op_function_generator.cc) target_link_libraries(eager_op_function_generator ${OP_FUNCTION_GENERETOR_DEPS}) - add_executable(kernel_signature_generator kernel_signature_generator.cc) - target_link_libraries(kernel_signature_generator ${OP_FUNCTION_GENERETOR_DEPS}) - if(WIN32) - target_link_libraries(kernel_signature_generator shlwapi.lib) + if(NOT WIN32) + add_executable(kernel_signature_generator kernel_signature_generator.cc) + target_link_libraries(kernel_signature_generator ${OP_FUNCTION_GENERETOR_DEPS}) endif() get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 716cd35f0a614..ab8bf0529dcfc 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -241,49 +241,42 @@ void BindDistributed(py::module *m) { std::shared_ptr>( *m, "ProcessGroupNCCL", ProcessGroup) .def(py::init &, int, int, - int>(), + const platform::CUDAPlace &, int>(), py::arg("store"), py::arg("rank"), py::arg("world_size"), - py::arg("group_id") = 0, py::call_guard()); + py::arg("place"), py::arg("group_id") = 0, + py::call_guard()); +#endif #if defined(PADDLE_WITH_GLOO) && defined(PADDLE_WITH_PSCORE) && \ (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_ASCEND_CL)) py::class_>( *m, "ProcessGroupHeter", ProcessGroup) - .def(py::init &, int, int, int, - int, int, int, int, bool, std::string>(), + .def(py::init &, int, int, +#if defined(PADDLE_WITH_ASCEND_CL) + const platform::NPUPlace &, +#else + const platform::CUDAPlace &, +#endif + int, int, int, int, int, bool, std::string>(), py::arg("store"), py::arg("rank"), py::arg("world_size"), - py::arg("gid") = 0, py::arg("local_rank") = 0, + py::arg("place"), py::arg("gid") = 0, py::arg("local_rank") = 0, py::arg("local_size") = 1, py::arg("gloo_rank") = 0, py::arg("gloo_size") = 1, py::arg("with_switch") = false, py::arg("switch_endpoint") = "", py::call_guard()); #endif -#endif #if defined(PADDLE_WITH_ASCEND_CL) py::class_>( *m, "ProcessGroupHCCL", ProcessGroup) .def(py::init &, int, int, - int>(), - py::arg("store"), py::arg("rank"), py::arg("world_size"), - py::arg("group_id") = 0, py::call_guard()); - -#if defined(PADDLE_WITH_GLOO) && defined(PADDLE_WITH_PSCORE) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_ASCEND_CL)) - py::class_>( - *m, "ProcessGroupHeter", ProcessGroup) - .def(py::init &, int, int, int, - int, int, int, int, bool, std::string>(), + const platform::NPUPlace &, int>(), py::arg("store"), py::arg("rank"), py::arg("world_size"), - py::arg("gid") = 0, py::arg("local_rank") = 0, - py::arg("local_size") = 1, py::arg("gloo_rank") = 0, - py::arg("gloo_rank") = 1, py::arg("with_switch") = false, - py::arg("switch_endpoint") = "", + py::arg("place"), py::arg("group_id") = 0, py::call_guard()); -#endif + #endif py::class_>( *m, "ProcessGroupGloo", ProcessGroup) .def(py::init &, int, - int, int, std::shared_ptr &>(), + int, const platform::CPUPlace &, int, + std::shared_ptr &>(), 
py::call_guard()) .def(py::init([](const std::shared_ptr &store, - int rank, int world_size, int gid) { + int rank, int world_size, + const platform::CPUPlace &place, int gid) { auto opts = GlooOptions::create(); char *ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str()); if (ifname && strlen(ifname) > 1) { @@ -312,10 +307,11 @@ void BindDistributed(py::module *m) { opts->device = ProcessGroupGloo::createDefaultDevice(); } return std::make_shared(store, rank, world_size, - gid, opts); + place, gid, opts); }), py::arg("store"), py::arg("rank"), py::arg("world_size"), - py::arg("group_id") = 0, py::call_guard()) + py::arg("place"), py::arg("group_id") = 0, + py::call_guard()) .def_static("create_default_device", &ProcessGroupGloo::createDefaultDevice); #endif diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index fa66e55e9c53a..8695928205bb0 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -62,7 +62,7 @@ void EmptyTensorInitializer(TensorObject* self, const std::string& name, bool persistable = false, int stop_gradient = -1, framework::proto::VarType::Type dtype = paddle::framework::proto::VarType::FP32, - const std::vector& dims = {}, + const std::vector& dims = {0}, framework::proto::VarType::Type var_type = paddle::framework::proto::VarType::LOD_TENSOR) { auto ddims = phi::make_ddim(dims); @@ -75,7 +75,7 @@ void EmptyTensorInitializer(TensorObject* self, const std::string& name, if (var_type == paddle::framework::proto::VarType::LOD_TENSOR) { // TODO(jiabin): Maybe support LOD later std::shared_ptr dense_tensor = nullptr; - if (dims.empty()) { + if (dims.size() == 1 && dims[0] == 0) { std::shared_ptr allocation_ptr = nullptr; dense_tensor = std::make_shared( allocation_ptr, diff --git a/paddle/fluid/pybind/custom_handwrite_op_funcs.h b/paddle/fluid/pybind/eager_custom_python_api.h similarity index 59% rename from paddle/fluid/pybind/custom_handwrite_op_funcs.h rename to paddle/fluid/pybind/eager_custom_python_api.h index 044c3d5d176e1..c509ab5674930 100644 --- a/paddle/fluid/pybind/custom_handwrite_op_funcs.h +++ b/paddle/fluid/pybind/eager_custom_python_api.h @@ -54,9 +54,53 @@ static PyObject *eager_api_run_program(PyObject *self, PyObject *args, } } +static PyObject *eager_api_final_state_linear(PyObject *self, PyObject *args, + PyObject *kwargs) { + PyThreadState *tstate = nullptr; + try { + auto x = GetTensorFromArgs("linear", "X", args, 0, false); + auto weight = GetTensorFromArgs("linear", "weight", args, 1, false); + auto bias = GetTensorFromArgs("linear", "Bias", args, 2, true); + tstate = PyEval_SaveThread(); + if (bias.initialized()) { + auto mm_out = + matmul_final_state_dygraph_function(x, weight, false, false); + auto out = add_final_state_dygraph_function(bias, mm_out); + PyEval_RestoreThread(tstate); + tstate = nullptr; + return ToPyObject(out); + } else { + auto mm_out = + matmul_final_state_dygraph_function(x, weight, false, false); + PyEval_RestoreThread(tstate); + tstate = nullptr; + return ToPyObject(mm_out); + } + } catch (paddle::platform::EnforceNotMet &exception) { + if (tstate) { + PyEval_RestoreThread(tstate); + } + std::ostringstream sout; + sout << exception.what(); + sout << " [operator < linear > error]"; + exception.set_error_str(sout.str()); + ThrowExceptionToPython(std::current_exception()); + return nullptr; + } catch (...) 
{ + if (tstate) { + PyEval_RestoreThread(tstate); + } + ThrowExceptionToPython(std::current_exception()); + return nullptr; + } +} + static PyMethodDef CustomEagerFinalStateMethods[] = { {"run_program", (PyCFunction)(void (*)(void))eager_api_run_program, METH_VARARGS | METH_KEYWORDS, "C++ interface function for run_program in dygraph."}, - + {"final_state_linear", + (PyCFunction)(void (*)(void))eager_api_final_state_linear, + METH_VARARGS | METH_KEYWORDS, + "C++ interface function for run_program in dygraph."}, {nullptr, nullptr, 0, nullptr}}; diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 542d59318bbad..13fba2baa1d6c 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -41,6 +41,7 @@ limitations under the License. */ #include "paddle/phi/core/sparse_csr_tensor.h" #include "pybind11/detail/internals.h" #pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/framework/python_headers.h" #include "paddle/fluid/memory/allocation/mmap_allocator.h" #include "paddle/fluid/pybind/tensor_py.h" @@ -194,6 +195,17 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, nullptr); if (!self->tensor.impl()->initialized()) { + if (tensor_dims.size() == 0) { + py_dims[0] = 0; + py_strides[0] = 0; + PyObject* array = api.PyArray_NewFromDescr_( + api.PyArray_Type_, api.PyArray_DescrFromType_(numpy_dtype), 1, + py_dims, py_strides, nullptr, + pybind11::detail::npy_api::NPY_ARRAY_ALIGNED_ | + pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_, + nullptr); + return array; + } return array; } @@ -713,15 +725,19 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, break; } } + std::vector slice_axes_tmp(slice_axes.begin(), slice_axes.end()); + std::vector infer_flags_tmp(infer_flags.begin(), + infer_flags.end()); + std::vector decrease_axis_tmp(decrease_axis.begin(), + decrease_axis.end()); + if (op_type == "slice") { - out = slice_dygraph_function(self->tensor, paddle::experimental::Tensor(), - paddle::experimental::Tensor(), {}, {}, - std::move(attrs)); + out = slice_final_state_dygraph_function( + self->tensor, slice_axes_tmp, slice_starts, slice_ends, + infer_flags_tmp, decrease_axis_tmp); } else if (op_type == "strided_slice") { - out = strided_slice_dygraph_function( - self->tensor, paddle::experimental::Tensor(), - paddle::experimental::Tensor(), paddle::experimental::Tensor(), {}, - {}, {}, attrs); + out = strided_slice_final_state_dygraph_function( + self->tensor, slice_axes, slice_starts, slice_ends, slice_strides); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Slice is only support slice and strided_slice, but we got %s which " @@ -776,8 +792,8 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, paddle::framework::TensorFromVector(list_select_idxs, *dev_ctx, idx_tensor.get()); framework::AttributeMap attrs = {{"dim", 0}}; - out = index_select_dygraph_function(self->tensor, select_index, - std::move(attrs)); + out = index_select_final_state_dygraph_function(self->tensor, select_index, + 0); } return ToPyObject(out); @@ -1476,6 +1492,46 @@ static PyObject* tensor__offset(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor__grad_name(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + paddle::experimental::Tensor* grad = + egr::EagerUtils::mutable_grad(self->tensor); + 
PADDLE_ENFORCE_EQ(grad != nullptr, true, + platform::errors::InvalidArgument( + "Detected NULL grad. Please check if you have manually " + "cleared the grad inside autograd_meta")); + return ToPyObject(grad->name()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor__grad_value(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + paddle::experimental::Tensor* grad = + egr::EagerUtils::mutable_grad(self->tensor); + PADDLE_ENFORCE_EQ(grad != nullptr, true, + platform::errors::InvalidArgument( + "Detected NULL grad. Please check if you have manually " + "cleared the grad inside autograd_meta")); + + if (!grad->defined()) { + Py_IncRef(Py_None); + return Py_None; + } + if (grad->is_dense_tensor()) { + auto* grad_tensor = + static_cast(grad->impl().get()); + return ToPyObject(grad_tensor); + } else { + PADDLE_THROW(paddle::platform::errors::Fatal( + "this method is only supported for DenseTensor")); + Py_IncRef(Py_None); + return Py_None; + } + EAGER_CATCH_AND_THROW_RETURN_NULL +} + #if defined(PADDLE_WITH_CUDA) static PyObject* tensor_method__uva(TensorObject* self, PyObject* args, PyObject* kwargs) { @@ -1617,6 +1673,10 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"_offset", (PyCFunction)(void (*)(void))tensor__offset, METH_VARARGS | METH_KEYWORDS, NULL}, + {"_grad_name", (PyCFunction)(void (*)(void))tensor__grad_name, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_grad_value", (PyCFunction)(void (*)(void))tensor__grad_value, + METH_VARARGS | METH_KEYWORDS, NULL}, #if defined(PADDLE_WITH_CUDA) {"_tensor_uva", (PyCFunction)(void (*)(void))tensor_method__uva, METH_VARARGS | METH_KEYWORDS, NULL}, diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index 06d88be9bc8cc..2ac12165c1a66 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -134,7 +134,7 @@ const char* PYBIND_ITEM_TEMPLATE = R"( {"%s", (PyCFunction)(void(*)(void))%s, M // need to be handwritten in CUSTOM_HANDWRITE_OP_FUNC_FILE std::unordered_set CUSTOM_HANDWRITE_OPS_SET = {"run_program"}; const char* CUSTOM_HANDWRITE_OP_FUNC_FILE = - "#include \"paddle/fluid/pybind/custom_handwrite_op_funcs.h\"\n"; + "#include \"paddle/fluid/pybind/eager_custom_python_api.h\"\n"; // clang-format on static inline bool FindInsMap(const std::string& op_type, diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 8fa21ef45f82f..9719963d51da0 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -1013,6 +1013,32 @@ paddle::experimental::Tensor& GetTensorFromPyObject(PyObject* obj) { return reinterpret_cast(obj)->tensor; } +paddle::experimental::Scalar CastNumpy2Scalar(PyObject* obj, + const std::string& op_type, + ssize_t arg_pos) { + PyTypeObject* type = obj->ob_type; + auto type_name = std::string(type->tp_name); + VLOG(1) << "type_name: " << type_name; + if (type_name == "numpy.float64") { + double value = CastPyArg2Double(obj, op_type, arg_pos); + return paddle::experimental::Scalar(value); + } else if (type_name == "numpy.float32") { + float value = CastPyArg2Float(obj, op_type, arg_pos); + return paddle::experimental::Scalar(value); + } else if (type_name == "numpy.int64") { + int64_t value = CastPyArg2Long(obj, op_type, arg_pos); + return paddle::experimental::Scalar(value); + } else if (type_name == "numpy.int32") { + int value = CastPyArg2Int(obj, op_type, arg_pos); + return 
paddle::experimental::Scalar(value); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "numpy.float32/float64, numpy.int32/int64, but got %s", + op_type, arg_pos + 1, type_name)); // NOLINT + } +} + paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, const std::string& op_type, ssize_t arg_pos) { @@ -1027,22 +1053,25 @@ paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, // obj could be: int, float, bool, paddle.Tensor PyTypeObject* type = obj->ob_type; auto type_name = std::string(type->tp_name); - if (type_name == "int") { + VLOG(1) << "type_name: " << type_name; + if (PyBool_Check(obj)) { + bool value = CastPyArg2Boolean(obj, op_type, arg_pos); + return paddle::experimental::Scalar(value); + } else if (PyLong_Check(obj)) { int value = CastPyArg2Int(obj, op_type, arg_pos); return paddle::experimental::Scalar(value); - } else if (type_name == "float") { + } else if (PyFloat_Check(obj)) { float value = CastPyArg2Float(obj, op_type, arg_pos); return paddle::experimental::Scalar(value); - - } else if (type_name == "bool") { - bool value = CastPyArg2Boolean(obj, op_type, arg_pos); - return paddle::experimental::Scalar(value); - - } else if (type_name == "Tensor") { + } else if (IsEagerTensor(obj)) { paddle::experimental::Tensor& value = GetTensorFromPyObject( op_type, "" /*arg_name*/, obj, arg_pos, false /*dispensable*/); return paddle::experimental::Scalar(value); - + } else if (type_name.find("numpy") != std::string::npos) { + return CastNumpy2Scalar(obj, op_type, arg_pos); + } else if (PyObject_CheckLongOrToLong(&obj)) { + int value = CastPyArg2Int(obj, op_type, arg_pos); + return paddle::experimental::Scalar(value); } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " @@ -1072,7 +1101,8 @@ paddle::experimental::IntArray CastPyArg2IntArray(PyObject* obj, // obj could be: int, float, bool, paddle.Tensor PyTypeObject* type = obj->ob_type; auto type_name = std::string(type->tp_name); - if (type_name == "list" || type_name == "tuple") { + if (type_name == "list" || type_name == "tuple" || + type_name == "numpy.ndarray") { std::vector value = CastPyArg2Ints(obj, op_type, arg_pos); return paddle::experimental::IntArray(value); @@ -1151,21 +1181,15 @@ std::vector GetScopePtrListFromArgs( return result; } -paddle::experimental::Place CastPyArg2Place(PyObject* obj, - const std::string& op_type, - ssize_t arg_pos) { +paddle::Place CastPyArg2Place(PyObject* obj, const std::string& op_type, + ssize_t arg_pos) { return CastPyArg2Place(obj, arg_pos); } -paddle::experimental::DataType CastPyArg2DataType(PyObject* obj, - const std::string& op_type, - ssize_t arg_pos) { +paddle::DataType CastPyArg2DataType(PyObject* obj, const std::string& op_type, + ssize_t arg_pos) { if (obj == Py_None) { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "data_type, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + return paddle::experimental::DataType::UNDEFINED; } framework::proto::VarType::Type type = CastPyArg2ProtoType(obj, arg_pos); diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 90c4d727923d0..22c41073c9dd7 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -158,17 +158,19 @@ paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, const std::string& op_type, ssize_t arg_pos); +paddle::experimental::Scalar 
CastNumpy2Scalar(PyObject* obj, + const std::string& op_type, + ssize_t arg_pos); + paddle::experimental::IntArray CastPyArg2IntArray(PyObject* obj, const std::string& op_type, ssize_t arg_pos); -paddle::experimental::Place CastPyArg2Place(PyObject* obj, - const std::string& op_type, - ssize_t arg_pos); +paddle::Place CastPyArg2Place(PyObject* obj, const std::string& op_type, + ssize_t arg_pos); -paddle::experimental::DataType CastPyArg2DataType(PyObject* obj, - const std::string& op_type, - ssize_t arg_pos); +paddle::DataType CastPyArg2DataType(PyObject* obj, const std::string& op_type, + ssize_t arg_pos); paddle::optional GetOptionalTensorFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc index 614689b148cda..50e0daf8508e3 100644 --- a/paddle/fluid/pybind/op_function_common.cc +++ b/paddle/fluid/pybind/op_function_common.cc @@ -174,8 +174,13 @@ void CastPyArg2AttrLong(PyObject* obj, float CastPyArg2Float(PyObject* obj, const std::string& op_type, ssize_t arg_pos) { + return static_cast(CastPyArg2Double(obj, op_type, arg_pos)); +} + +double CastPyArg2Double(PyObject* obj, const std::string& op_type, + ssize_t arg_pos) { if (PyObject_CheckFloatOrToFloat(&obj)) { - return (float)PyFloat_AsDouble(obj); // NOLINT + return PyFloat_AsDouble(obj); // NOLINT } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " diff --git a/paddle/fluid/pybind/op_function_common.h b/paddle/fluid/pybind/op_function_common.h index 33d0e242a027d..debaf8fae17b7 100644 --- a/paddle/fluid/pybind/op_function_common.h +++ b/paddle/fluid/pybind/op_function_common.h @@ -50,6 +50,8 @@ int64_t CastPyArg2Long(PyObject* obj, const std::string& op_type, ssize_t arg_pos); float CastPyArg2Float(PyObject* obj, const std::string& op_type, ssize_t arg_pos); +double CastPyArg2Double(PyObject* obj, const std::string& op_type, + ssize_t arg_pos); std::string CastPyArg2String(PyObject* obj, const std::string& op_type, ssize_t arg_pos); std::vector CastPyArg2Booleans(PyObject* obj, const std::string& op_type, diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc index 42703fc17bde5..2f07a4a40a922 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc @@ -25,6 +25,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" @@ -39,8 +40,10 @@ void BindPSGPUWrapper(py::module* m) { .def(py::init([]() { return framework::PSGPUWrapper::GetInstance(); })) .def("set_slot_vector", &framework::PSGPUWrapper::SetSlotVector, py::call_guard()) +#ifdef PADDLE_WITH_CUDA .def("set_slot_dim_vector", &framework::PSGPUWrapper::SetSlotDimVector, py::call_guard()) +#endif .def("set_slot_offset_vector", &framework::PSGPUWrapper::SetSlotOffsetVector, py::call_guard()) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index a303951d8596d..b135af43ab174 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -167,6 +167,7 @@ limitations under the License. 
*/ #endif #include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/imperative/layout_autotune.h" #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/kernels/autotune/cache.h" @@ -4490,6 +4491,20 @@ All parameter, weight, gradient are variables in Paddle. return res; }); + m.def("enable_layout_autotune", [] { + return paddle::imperative::LayoutAutoTune::Instance() + .EnableLayoutAutoTune(); + }); + + m.def("disable_layout_autotune", [] { + return paddle::imperative::LayoutAutoTune::Instance() + .DisableLayoutAutoTune(); + }); + + m.def("use_layout_autotune", [] { + return paddle::imperative::LayoutAutoTune::Instance().UseLayoutAutoTune(); + }); + BindFleetWrapper(&m); BindIO(&m); diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc index ec6293f9a2bdb..f0bf46567a5bf 100644 --- a/paddle/infrt/api/infrt_api.cc +++ b/paddle/infrt/api/infrt_api.cc @@ -270,6 +270,12 @@ int InfRtPredictor::Init(const InfRtConfig& config) { {::infrt::TargetType::CPU, ::infrt::PrecisionType::FLOAT32, ::infrt::LayoutType::NCHW}}; + if (config.gpu_enabled()) { + valid_places.insert(valid_places.begin(), + ::infrt::Place(::infrt::TargetType::GPU, + ::infrt::PrecisionType::FLOAT32, + ::infrt::LayoutType::NCHW)); + } pass_manager.addPass(CreatePhiOpCvtPass(valid_places)); pass_manager.addPass(CreateInfrtOpFusePass()); } @@ -300,12 +306,19 @@ int InfRtPredictor::Init(const InfRtConfig& config) { } // Load params - auto tensor_map = ::infrt::kernel::phi::LoadCombinedParameters( - config.model_dir(), config.param_dir()); + if (config.gpu_enabled() && !config.tensorrt_enabled()) { + auto tensor_map = ::infrt::kernel::phi::LoadCombinedParamsToGpu( + config.model_dir(), config.param_dir()); + impl_->executor.reset( + new PredictExecutor(module_op, registry, std::move(tensor_map))); + + } else { + auto tensor_map = ::infrt::kernel::phi::LoadCombinedParameters( + config.model_dir(), config.param_dir()); + impl_->executor.reset( + new PredictExecutor(module_op, registry, std::move(tensor_map))); + } - // Create PredictExecutor - impl_->executor.reset( - new PredictExecutor(module_op, registry, std::move(tensor_map))); return 0; } diff --git a/paddle/infrt/api/infrt_api.h b/paddle/infrt/api/infrt_api.h index 231f496bb89d1..fcaed78bdd9ae 100644 --- a/paddle/infrt/api/infrt_api.h +++ b/paddle/infrt/api/infrt_api.h @@ -27,6 +27,7 @@ class InfRtConfig { std::vector shared_libs_; // TODO(wilber): Design an easy-to-use interface. + bool gpu_enabled_{false}; bool tensorrt_enabled_{false}; public: @@ -42,6 +43,9 @@ class InfRtConfig { } const std::vector& shared_libs() const { return shared_libs_; } + void enable_gpu() { gpu_enabled_ = true; } + bool gpu_enabled() const { return gpu_enabled_; } + // TODO(wilber): Design an easy-to-use interface. 
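+  // Usage sketch (hypothetical caller code; see infrt_api_test.cc.in):
+  //   InfRtConfig config;
+  //   config.enable_gpu();  // run kernels and load params on GPU
+  //   auto predictor = CreateInfRtPredictor(config);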
void enable_tensorrt() { tensorrt_enabled_ = true; } void disable_tensorrt() { tensorrt_enabled_ = false; } diff --git a/paddle/infrt/api/infrt_api_test.cc.in b/paddle/infrt/api/infrt_api_test.cc.in index 13635ddaaab2f..f7d1c97603c63 100644 --- a/paddle/infrt/api/infrt_api_test.cc.in +++ b/paddle/infrt/api/infrt_api_test.cc.in @@ -57,6 +57,57 @@ TEST(InfRtPredictor, predictor) { ASSERT_EQ(output->dims(), ::phi::DDim({16, 10})); } +TEST(InfRtPredictor, cpu_predictor) { + std::vector shared_libs; + + InfRtConfig config; + + config.set_model_dir("@CMAKE_BINARY_DIR@/models/resnet50/model.pdmodel"); + config.set_param_dir("@CMAKE_BINARY_DIR@/models/resnet50/model.pdiparams"); + + std::unique_ptr predictor = CreateInfRtPredictor(config); + + ::infrt::backends::CpuPhiAllocator cpu_allocator; + ::phi::DenseTensor* input = predictor->GetInput(0); + input->Resize({2, 3, 256, 256}); + input->AllocateFrom(&cpu_allocator, ::phi::DataType::FLOAT32); + auto* input_data = reinterpret_cast(input->data()); + for (int i = 0; i < input->numel(); i++) input_data[i] = 1.0; + + for(int i = 0; i < 10; i++) { + predictor->Run(); + } + auto start = std::chrono::steady_clock::now(); + for(int i = 0; i < 10; i++) { + predictor->Run(); + } + auto end = std::chrono::steady_clock::now(); + auto msec = std::chrono::duration_cast(end-start); + std::cout <<"One predict period costs " << msec.count()/1000 << "ms.\n"; + + // get and print output tensor + auto* output = predictor->GetOutput(0); + + ASSERT_EQ(output->dims(), ::phi::DDim({2, 1000})); + const std::vector true_vals { + -3.319006264209747314e-01, -1.418896913528442383e+00, + -6.934890151023864746e-01, -1.498023152351379395e+00, + 3.078042864799499512e-01, -1.340998053550720215e+00, + 3.508620023727416992e+00, 2.274388313293457031e+00, + -1.321727275848388672e+00, -8.888689428567886353e-02, + -3.319006264209747314e-01, -1.418896913528442383e+00, + -6.934890151023864746e-01, -1.498023152351379395e+00, + 3.078042864799499512e-01, -1.340998053550720215e+00, + 3.508620023727416992e+00, 2.274388313293457031e+00, + -1.321727275848388672e+00, -8.888689428567886353e-02 + }; + + for (size_t i = 0; i < true_vals.size(); i+=100) { + CHECK_NEAR(output->data()[i*100], true_vals[i], 1e-5); + } +} + + #ifdef INFRT_WITH_TRT TEST(InfRtPredictor, trt_predictor) { std::vector shared_libs; @@ -100,4 +151,67 @@ TEST(InfRtPredictor, trt_predictor) { } #endif +#ifdef INFRT_WITH_GPU +TEST(InfRtPredictor, gpu_predictor) { + std::vector shared_libs; + + InfRtConfig config; + config.enable_gpu(); + + config.set_model_dir("@CMAKE_BINARY_DIR@/models/resnet50/model.pdmodel"); + config.set_param_dir("@CMAKE_BINARY_DIR@/models/resnet50/model.pdiparams"); + + std::unique_ptr predictor = CreateInfRtPredictor(config); + + ::infrt::backends::GpuPhiAllocator gpu_allocator; + + + ::phi::DenseTensor* input = predictor->GetInput(0); + input->Resize({2, 3, 256, 256}); + input->AllocateFrom(&gpu_allocator, ::phi::DataType::FLOAT32); + auto* data = reinterpret_cast(input->data()); + + std::vector input_data(2 * 3 * 256 * 256, 1.0); + cudaMemcpy(data, + input_data.data(), + sizeof(float) * input->numel(), + cudaMemcpyHostToDevice); + + for(int i = 0; i < 10; i++) { + predictor->Run(); + } + auto start = std::chrono::steady_clock::now(); + for(int i = 0; i < 1000; i++) { + predictor->Run(); + } + auto end = std::chrono::steady_clock::now(); + auto msec = std::chrono::duration_cast(end-start); + std::cout <<"One predict period costs " << msec.count()/1000 << "ms.\n"; + + auto* output = predictor->GetOutput(0); 
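+  // The GPU predictor keeps its output in device memory, so copy it back to
+  // the host before comparing against the reference values.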
+ std::vector output_data(output->numel()); + cudaMemcpy(output_data.data(), + output->data(), + sizeof(float) * output->numel(), + cudaMemcpyDeviceToHost); + + ASSERT_EQ(output->dims(), ::phi::DDim({2, 1000})); + const std::vector true_vals { + -3.319006264209747314e-01, -1.418896913528442383e+00, + -6.934890151023864746e-01, -1.498023152351379395e+00, + 3.078042864799499512e-01, -1.340998053550720215e+00, + 3.508620023727416992e+00, 2.274388313293457031e+00, + -1.321727275848388672e+00, -8.888689428567886353e-02, + -3.319006264209747314e-01, -1.418896913528442383e+00, + -6.934890151023864746e-01, -1.498023152351379395e+00, + 3.078042864799499512e-01, -1.340998053550720215e+00, + 3.508620023727416992e+00, 2.274388313293457031e+00, + -1.321727275848388672e+00, -8.888689428567886353e-02 + }; + for (size_t i = 0; i < true_vals.size(); i+=100) { + CHECK_NEAR(output_data[i*100], true_vals[i], 1e-5); + } +} +#endif + } // namespace infrt diff --git a/paddle/infrt/backends/host/phi_allocator.h b/paddle/infrt/backends/host/phi_allocator.h index 6e3bef9299162..810c79509e7b6 100644 --- a/paddle/infrt/backends/host/phi_allocator.h +++ b/paddle/infrt/backends/host/phi_allocator.h @@ -11,6 +11,7 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/memory/malloc.h" #include "paddle/phi/core/allocator.h" #ifdef INFRT_WITH_GPU @@ -40,12 +41,8 @@ class GpuPhiAllocator : public phi::Allocator { static void deleter(phi::Allocation* ptr) { cudaFree(ptr->ptr()); } AllocationPtr Allocate(size_t bytes_size) { - void* ptr; - cudaMalloc(&ptr, bytes_size); - return AllocationPtr( - new phi::Allocation( - ptr, bytes_size, phi::Place(phi::AllocationType::GPU)), - deleter); + return paddle::memory::Alloc(phi::Place(phi::AllocationType::GPU), + bytes_size); } }; #endif diff --git a/paddle/infrt/dialect/init_dialects.cc b/paddle/infrt/dialect/init_dialects.cc index 56c375c72d2bb..8da34bd404be6 100644 --- a/paddle/infrt/dialect/init_dialects.cc +++ b/paddle/infrt/dialect/init_dialects.cc @@ -34,9 +34,8 @@ void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT InfrtDialect, dt::DTDialect, pd::PaddleDialect, - trt::TensorRTDialect + trt::TensorRTDialect, #ifdef INFRT_WITH_PHI - , phi::PHIDenseTensorDialect, phi::PHICPUKernelDialect, phi::PHIGPUKernelDialect, diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index 2078ebb1442ff..7e612be05b1f6 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -40,6 +40,13 @@ def CreateHostInitedDenseTensorOp : PDT_Op<"create_host_inited_dense_tensor.f32" let results = (outs DenseTensor:$output); } +def CreateInitedGpuFLOAT32DenseTensorOp + : PDT_Op<"create_inited_dense_tensor.gpu.f32", [NoSideEffect]> { + let arguments = (ins Context:$context, I64ArrayAttr:$dims, + LayoutAttr:$layout, I64ArrayAttr:$lod, F32Attr:$value); + let results = (outs DenseTensor:$output); +} + def CreateInitedCpuFLOAT32DenseTensorOp : PDT_Op<"create_inited_dense_tensor.cpu.f32", [NoSideEffect]> { let arguments = (ins Context:$context, I64ArrayAttr:$dims, @@ -86,6 +93,14 @@ def PDT_LoadCombinedParamsOp : PDT_Op<"load_combined_params", [NoSideEffect]> { let assemblyFormat = "`(``)`attr-dict"; } +def PDT_LoadCombinedParamsGpuOp : PDT_Op<"load_combined_params_to_gpu", [NoSideEffect]> { + // input path of model params. 
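+  // Takes the same model/params paths as load_combined_params, but the
+  // tensors are deserialized onto GPU via the LoadCombinedParamsToGpu kernel.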
+ let arguments = (ins StrAttr:$model_path, StrAttr:$params_path); + let results = (outs PD_DenseTensorMap:$out); + + let assemblyFormat = "`(``)`attr-dict"; +} + def PDT_TensorMapGetSizeOp : PDT_Op<"tensor_map_get_size", [NoSideEffect]> { let arguments = (ins PD_DenseTensorMap:$map); let results = (outs I32:$size); diff --git a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc index e9b426a5088fc..4bf39d4f66094 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc @@ -76,6 +76,7 @@ class PhiOpConvertPass void getDependentDialects(mlir::DialectRegistry ®istry) const override; private: + void updateInputsAndResults(infrt::TargetType target); void convertStage(); void dispatchStage(); @@ -110,10 +111,50 @@ mlir::LogicalResult PhiOpConvertPass::initialize(mlir::MLIRContext *context) { // Implementation of the PhiOpConvertPass. void PhiOpConvertPass::runOnFunction() { + updateInputsAndResults(valid_places_[0].target); convertStage(); dispatchStage(); } +void PhiOpConvertPass::updateInputsAndResults(infrt::TargetType target) { + mlir::Block &body = getFunction().front(); + auto loc = getFunction().getLoc(); + mlir::Operation &operation = body.front(); + mlir::MLIRContext *context = operation.getContext(); + size_t num_input = body.getNumArguments(); + + // step1. update input cpu tensors into gpu tensors + for (size_t index = 0; index < num_input; index++) { + auto argument = body.getArgument(index); + if (auto t = argument.getType().dyn_cast<::infrt::DenseTensorType>()) { + mlir::Type replace_type = infrt::DenseTensorType::get( + context, target, t.getPrecision(), infrt::LayoutType::NCHW); + getFunction().insertArgument(index, replace_type, {}, loc); + argument.replaceAllUsesWith(getFunction().getArgument(index)); + getFunction().eraseArgument(index + 1); + } + } + // update output tensors + unsigned int num_result = getFunction().getNumResults(); + for (unsigned int index = 0; index < num_result; index++) { + mlir::Type replace_type = + infrt::DenseTensorType::get(context, + target, + infrt::PrecisionType::FLOAT32, + infrt::LayoutType::NCHW); + getFunction().eraseResult(index); + getFunction().insertResult(index, replace_type, {}); + } + // update dense_tensor_map + mlir::Type replace_type = infrt::DenseTensorType::get( + context, target, infrt::PrecisionType::FLOAT32, infrt::LayoutType::NCHW); + + for (auto &op : body.without_terminator()) { + if (op.getName().getIdentifier().str() == "phi_dt.tensor_map_get_tensor") + op.getResult(0).setType(replace_type); + } +} + void PhiOpConvertPass::convertStage() { mlir::Block &body = getFunction().front(); std::vector worklist; @@ -200,6 +241,7 @@ void PhiOpConvertPass::dispatchStage() { mlir::OpBuilder builder(&block, block.begin()); std::map phi_context; + for (infrt::KernelOp kernel_op : worklist) { std::string kernel_name = kernel_op.name().str(); std::vector candidates = @@ -257,15 +299,25 @@ void PhiOpConvertPass::dispatchStage() { for (size_t index = 0; index < phi_kernel_desc.input_types.size(); ++index) { mlir::Value input = kernel_op.getOperand(index); - auto cvt_tensor_type_op = builder.create( - kernel_op.getLoc(), - infrt::DenseTensorType::get( - kernel_op.getContext(), - phi_kernel_desc.input_types[index].target, - phi_kernel_desc.input_types[index].precision, - phi_kernel_desc.input_types[index].layout), - input); - operation_state.addOperands(cvt_tensor_type_op.output()); + if 
(input.getType().dyn_cast<::infrt::DenseTensorType>().getTarget() == + ::infrt::TargetType::CPU && + phi_kernel_desc.input_types[index].target == + ::infrt::TargetType::GPU) { + auto cvt_tensor_type_op = builder.create( + kernel_op.getLoc(), + infrt::DenseTensorType::get( + kernel_op.getContext(), + phi_kernel_desc.input_types[index].target, + phi_kernel_desc.input_types[index].precision, + phi_kernel_desc.input_types[index].layout), + input, + phi_context[infrt::TargetType::GPU], + mlir::BoolAttr::get(kernel_op.getContext(), /*d2h*/ false)); + + operation_state.addOperands(cvt_tensor_type_op.output()); + } else { + operation_state.addOperands(input); + } } for (size_t index = 0; index < phi_kernel_desc.output_types.size(); @@ -280,11 +332,8 @@ void PhiOpConvertPass::dispatchStage() { mlir::Operation *phi_operation = builder.createOperation(operation_state); for (size_t index = 0; index < phi_kernel_desc.output_types.size(); ++index) { - mlir::Value input = phi_operation->getResult(index); - auto cvt_tensor_type_op = builder.create( - kernel_op.getLoc(), kernel_op.getResultTypes()[index], input); kernel_op.getResult(index).replaceAllUsesWith( - cvt_tensor_type_op.output()); + phi_operation->getResult(index)); } kernel_op.erase(); } diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc index 7ffc8de151075..95e25b243f3ab 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" +#include #include "llvm/Support/ErrorHandling.h" +#include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/infrt/common/string.h" #include "paddle/infrt/dialect/phi/data_type.h" #include "paddle/infrt/kernel/phi/context_kernels.h" @@ -22,24 +24,13 @@ #include "paddle/infrt/tensor/tensor_map.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/allocator.h" #include "paddle/phi/core/dense_tensor.h" #ifdef INFRT_WITH_GPU #include #endif -namespace paddle { -namespace platform { -using DeviceContext = ::phi::DeviceContext; -} // namespace platform -namespace framework { -using LoDTensor = ::phi::DenseTensor; -void DeserializeFromStream(std::istream& is, - LoDTensor* tensor, - const platform::DeviceContext& dev_ctx); -} -} // namespace paddle - namespace infrt { namespace kernel { namespace phi { @@ -71,7 +62,7 @@ ::phi::DenseTensor CreateInitedDenseTensorF32( ::phi::make_ddim(dims.get()), ConvertLayoutToPhi(layout.get()), {})); - float* a_data = dense_tensor.mutable_data(::phi::CPUPlace()); + float* a_data = dense_tensor.mutable_data(context.GetPlace()); for (int64_t i = 0; i < dense_tensor.numel(); ++i) { a_data[i] = value.get(); } @@ -198,6 +189,12 @@ ::infrt::phi::DenseTensorMap LoadParameters(const std::string& file_path) { auto pb_proto_prog = paddle::LoadProgram(model_path); auto main_block = pb_proto_prog->blocks(0); + ::phi::CPUContext ctx; + auto allocator = std::make_unique(); + const auto* allocator_ptr = allocator.get(); + ctx.SetAllocator(allocator_ptr); + ctx.SetHostAllocator(allocator_ptr); + ctx.SetZeroAllocator(allocator_ptr); for (auto& var : main_block.vars()) { if (var.name() == "feed" || var.name() == "fetch" || !var.persistable()) continue; @@ -207,9 +204,7 @@ ::infrt::phi::DenseTensorMap LoadParameters(const std::string& file_path) { case ::paddle::framework::proto::VarType_Type_LOD_TENSOR: { 
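+      // DeserializeFromStream fills the tensor through ctx.HostAlloc, so the
+      // CPU context set up above must have its allocators wired before this point.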
std::unique_ptr<::phi::DenseTensor> tensor{ std::make_unique<::phi::DenseTensor>()}; - ::phi::CPUContext ctx; - ::paddle::framework::DeserializeFromStream( - param_file, tensor.get(), ctx); + ::infrt::paddle::DeserializeFromStream(param_file, tensor.get(), ctx); map.SetDenseTensor(var.name(), std::move(tensor)); } break; default: { @@ -249,13 +244,55 @@ ::infrt::phi::DenseTensorMap LoadCombinedParameters( } } + ::phi::CPUContext ctx; + auto allocator = std::make_unique(); + const auto* allocator_ptr = allocator.get(); + ctx.SetAllocator(allocator_ptr); + ctx.SetHostAllocator(allocator_ptr); + ctx.SetZeroAllocator(allocator_ptr); + for (auto& var : tmp) { + std::unique_ptr<::phi::DenseTensor> tensor{ + std::make_unique<::phi::DenseTensor>()}; + ::infrt::paddle::DeserializeFromStream(param_file, tensor.get(), ctx); + map.SetDenseTensor(var, std::move(tensor)); + } + + return map; +} + +::infrt::phi::DenseTensorMap LoadCombinedParamsToGpu( + const std::string& model_path, const std::string& params_path) { + ::infrt::phi::DenseTensorMap map; + + auto pb_proto_prog = paddle::LoadProgram(model_path); + auto main_block = pb_proto_prog->blocks(0); + + std::ifstream param_file(params_path, std::ios::binary); + + std::set tmp; + for (auto& var : main_block.vars()) { + if (var.name() == "feed" || var.name() == "fetch" || !var.persistable()) { + continue; + } + if (var.type().type() == + ::paddle::framework::proto::VarType_Type_LOD_TENSOR) { + tmp.emplace(var.name()); + } else { + llvm_unreachable("the tensor type is illegal."); + } + } + +#ifdef INFRT_WITH_GPU + ::phi::GPUContext ctx; + ctx.PartialInitWithoutAllocator(); + for (auto& var : tmp) { std::unique_ptr<::phi::DenseTensor> tensor{ std::make_unique<::phi::DenseTensor>()}; - ::phi::CPUContext ctx; ::paddle::framework::DeserializeFromStream(param_file, tensor.get(), ctx); map.SetDenseTensor(var, std::move(tensor)); } +#endif return map; } diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h index c401fb99978a3..573b8f102ec7c 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -73,6 +73,9 @@ ::infrt::phi::DenseTensorMap LoadCombinedParams( ::infrt::phi::DenseTensorMap LoadCombinedParameters( const std::string& model_path, const std::string& params_path); +::infrt::phi::DenseTensorMap LoadCombinedParamsToGpu( + const std::string& model_path, const std::string& params_path); + int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map); #ifdef INFRT_WITH_GPU diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc index 848ff28faffc7..fa51ab3566d91 100644 --- a/paddle/infrt/kernel/phi/registry.cc +++ b/paddle/infrt/kernel/phi/registry.cc @@ -68,6 +68,9 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) { registry->AddKernel("phi_dt.load_params", INFRT_KERNEL(infrt::kernel::phi::LoadParams), {"path"}); + registry->AddKernel("phi_dt.load_combined_params_to_gpu", + INFRT_KERNEL(infrt::kernel::phi::LoadCombinedParamsToGpu), + {"model_path", "params_path"}); registry->AddKernel("phi_dt.load_combined_params", INFRT_KERNEL(infrt::kernel::phi::LoadCombinedParams), {"model_path", "params_path"}); diff --git a/paddle/infrt/paddle/model_parser.cc b/paddle/infrt/paddle/model_parser.cc index f3de1a630451c..da4f8b6420b22 100644 --- a/paddle/infrt/paddle/model_parser.cc +++ b/paddle/infrt/paddle/model_parser.cc @@ -22,6 +22,10 @@ #include "paddle/infrt/common/target.h" #include 
"paddle/infrt/common/type.h" +#ifdef INFRT_WITH_PHI +#include "paddle/phi/common/data_type.h" +#endif + namespace infrt { namespace paddle { @@ -170,5 +174,96 @@ void LoadParam(const std::string &path, _Variable *out, const Target &target) { LoadLoDTensor(fin, out, target); } +#ifdef INFRT_WITH_PHI +namespace framework_proto = ::paddle::framework::proto; + +inline ::phi::DataType PhiDataType(framework_proto::VarType::Type type) { + using Type = framework_proto::VarType::Type; + switch (static_cast(type)) { + case Type::VarType_Type_BOOL: + return ::phi::DataType::BOOL; + case Type::VarType_Type_INT8: + return ::phi::DataType::INT8; + case Type::VarType_Type_UINT8: + return ::phi::DataType::UINT8; + case Type::VarType_Type_INT16: + return ::phi::DataType::INT16; + case Type::VarType_Type_INT32: + return ::phi::DataType::INT32; + case Type::VarType_Type_INT64: + return ::phi::DataType::INT64; + case Type::VarType_Type_SIZE_T: + return ::phi::DataType::UINT64; + case Type::VarType_Type_FP16: + return ::phi::DataType::FLOAT16; + case Type::VarType_Type_FP32: + return ::phi::DataType::FLOAT32; + case Type::VarType_Type_FP64: + return ::phi::DataType::FLOAT64; + default: + LOG(FATAL) << "unknown data type " << type; + } + return ::phi::DataType::UNDEFINED; +} + +inline void TensorFromStream(std::istream &is, + ::phi::DenseTensor *tensor, + const ::phi::CPUContext &ctx) { + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + CHECK_EQ(version, 0U); + framework_proto::VarType::TensorDesc desc; + { // int32_t size + // proto buffer + int32_t size = -1; + is.read(reinterpret_cast(&size), sizeof(size)); + CHECK_EQ(is.good(), true); + CHECK_GE(size, 0); + std::unique_ptr buf(new char[size]); + is.read(reinterpret_cast(buf.get()), size); + CHECK_EQ(desc.ParseFromArray(buf.get(), size), true); + } + { // read tensor + std::vector dims; + dims.reserve(static_cast(desc.dims().size())); + std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); + tensor->Resize(::phi::make_ddim(dims)); + void *buf; + size_t size = tensor->numel() * SizeOfType(desc.data_type()); + ctx.HostAlloc(tensor, PhiDataType(desc.data_type()), size); + buf = tensor->data(); + is.read(static_cast(buf), size); + } +} + +void DeserializeFromStream(std::istream &is, + ::phi::DenseTensor *tensor, + const ::phi::CPUContext &dev_ctx) { + { + // the 1st field, unit32_t version for LoDTensor + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + CHECK_EQ(version, 0U); + } + { + // the 2st field, LoD information + uint64_t lod_level; + is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); + auto &lod = *tensor->mutable_lod(); + lod.resize(lod_level); + for (uint64_t i = 0; i < lod_level; ++i) { + uint64_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::vector tmp(size / sizeof(size_t)); + is.read(reinterpret_cast(tmp.data()), + static_cast(size)); + lod[i] = tmp; + } + } + // the 3st filed, Tensor + TensorFromStream(is, tensor, dev_ctx); +} +#endif + } // namespace paddle } // namespace infrt diff --git a/paddle/infrt/paddle/model_parser.h b/paddle/infrt/paddle/model_parser.h index 373f77033dcef..5f039ad5d3ad8 100644 --- a/paddle/infrt/paddle/model_parser.h +++ b/paddle/infrt/paddle/model_parser.h @@ -25,6 +25,11 @@ #include "paddle/infrt/paddle/scope.h" #include "paddle/infrt/paddle/tensor.h" +#ifdef INFRT_WITH_PHI +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#endif + namespace infrt { namespace paddle { 
namespace framework_proto = ::paddle::framework::proto; @@ -53,5 +58,11 @@ void TensorFromStream( const common::Target& target = common::DefaultHostTarget()); void ReadBinaryFile(const std::string& filename, std::string* contents); +#ifdef INFRT_WITH_PHI +void DeserializeFromStream(std::istream& is, + ::phi::DenseTensor* tensor, + const ::phi::CPUContext& dev_ctx); +#endif + } // namespace paddle } // namespace infrt diff --git a/paddle/infrt/tests/dialect/phi/phi_pass.mlir b/paddle/infrt/tests/dialect/phi/phi_pass.mlir index 0d9e312ce0bfd..6c5a98f45ce44 100644 --- a/paddle/infrt/tests/dialect/phi/phi_pass.mlir +++ b/paddle/infrt/tests/dialect/phi/phi_pass.mlir @@ -1,15 +1,15 @@ // RUN: infrtopt -phi-op-convert=valid-targets=CPU-FP32-NCHW -infrt-op-fuse %s // CHECK-LABEL: @ops -func @ops(%a:!infrt.lod_tensor, %b:!infrt.lod_tensor) { - %g = "pd.elementwise_add"(%a, %b) {axis=1:si32} : (!infrt.lod_tensor, !infrt.lod_tensor) -> tensor - %h = "pd.abs"(%g):(tensor) -> tensor - infrt.return %h:tensor +func @ops(%a:!infrt.dense_tensor, %b:!infrt.dense_tensor) { + %g = "pd.elementwise_add"(%a, %b) {axis=1:si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %h = "pd.abs"(%g):(!infrt.dense_tensor) -> !infrt.dense_tensor + infrt.return %h:!infrt.dense_tensor } // CHECK-LABEL: @op_execute -func @op_execute(%a:!infrt.lod_tensor, %b:!infrt.lod_tensor, %c:!infrt.lod_tensor) -> !infrt.lod_tensor { - %g = "pd.elementwise_add"(%a, %b) {axis=1:si32} : (!infrt.lod_tensor, !infrt.lod_tensor) -> tensor - %h = "pd.abs"(%g):(tensor) -> tensor - infrt.return %h:tensor +func @op_execute(%a:!infrt.dense_tensor, %b:!infrt.dense_tensor, %c:!infrt.dense_tensor) -> !infrt.dense_tensor { + %g = "pd.elementwise_add"(%a, %b) {axis=1:si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %h = "pd.abs"(%g):(!infrt.dense_tensor) -> !infrt.dense_tensor + infrt.return %h:!infrt.dense_tensor } diff --git a/paddle/infrt/tests/models/efficientnet-b4/model.py b/paddle/infrt/tests/models/efficientnet-b4/model.py new file mode 100644 index 0000000000000..c660c3a46749e --- /dev/null +++ b/paddle/infrt/tests/models/efficientnet-b4/model.py @@ -0,0 +1,26 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# url: https://aistudio.baidu.com/aistudio/projectdetail/3756986?forkThirdPart=1 +from net import EfficientNet +from paddle.jit import to_static +from paddle.static import InputSpec +import paddle +import sys + +model = EfficientNet.from_name('efficientnet-b4') +net = to_static( + model, input_spec=[InputSpec( + shape=[None, 3, 256, 256], name='x')]) +paddle.jit.save(net, sys.argv[1]) diff --git a/paddle/infrt/tests/models/efficientnet-b4/net/__init__.py b/paddle/infrt/tests/models/efficientnet-b4/net/__init__.py new file mode 100644 index 0000000000000..d4e557829ae2c --- /dev/null +++ b/paddle/infrt/tests/models/efficientnet-b4/net/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .efficientnet import EfficientNet diff --git a/paddle/infrt/tests/models/efficientnet-b4/net/efficientnet.py b/paddle/infrt/tests/models/efficientnet-b4/net/efficientnet.py new file mode 100644 index 0000000000000..a9956fcdc8862 --- /dev/null +++ b/paddle/infrt/tests/models/efficientnet-b4/net/efficientnet.py @@ -0,0 +1,284 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from .utils import (round_filters, round_repeats, drop_connect, + get_same_padding_conv2d, get_model_params, + efficientnet_params, load_pretrained_weights) + + +class MBConvBlock(nn.Layer): + """ + Mobile Inverted Residual Bottleneck Block + + Args: + block_args (namedtuple): BlockArgs, see above + global_params (namedtuple): GlobalParam, see above + + Attributes: + has_se (bool): Whether the block contains a Squeeze and Excitation layer. 
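+        id_skip (bool): Whether the skip connection and drop connect are
+            applied (taken from block_args).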
+ """ + + def __init__(self, block_args, global_params): + super().__init__() + self._block_args = block_args + self._bn_mom = global_params.batch_norm_momentum + self._bn_eps = global_params.batch_norm_epsilon + self.has_se = (self._block_args.se_ratio is not None) and ( + 0 < self._block_args.se_ratio <= 1) + self.id_skip = block_args.id_skip # skip connection and drop connect + + # Get static or dynamic convolution depending on image size + Conv2d = get_same_padding_conv2d(image_size=global_params.image_size) + + # Expansion phase + inp = self._block_args.input_filters # number of input channels + oup = self._block_args.input_filters * self._block_args.expand_ratio # number of output channels + if self._block_args.expand_ratio != 1: + self._expand_conv = Conv2d( + in_channels=inp, + out_channels=oup, + kernel_size=1, + bias_attr=False) + self._bn0 = nn.BatchNorm2D( + num_features=oup, momentum=self._bn_mom, epsilon=self._bn_eps) + + # Depthwise convolution phase + k = self._block_args.kernel_size + s = self._block_args.stride + self._depthwise_conv = Conv2d( + in_channels=oup, + out_channels=oup, + groups=oup, # groups makes it depthwise + kernel_size=k, + stride=s, + bias_attr=False) + self._bn1 = nn.BatchNorm2D( + num_features=oup, momentum=self._bn_mom, epsilon=self._bn_eps) + + # Squeeze and Excitation layer, if desired + if self.has_se: + num_squeezed_channels = max(1, + int(self._block_args.input_filters * + self._block_args.se_ratio)) + self._se_reduce = Conv2d( + in_channels=oup, + out_channels=num_squeezed_channels, + kernel_size=1) + self._se_expand = Conv2d( + in_channels=num_squeezed_channels, + out_channels=oup, + kernel_size=1) + + # Output phase + final_oup = self._block_args.output_filters + self._project_conv = Conv2d( + in_channels=oup, + out_channels=final_oup, + kernel_size=1, + bias_attr=False) + self._bn2 = nn.BatchNorm2D( + num_features=final_oup, momentum=self._bn_mom, epsilon=self._bn_eps) + self._swish = nn.Hardswish() + + def forward(self, inputs, drop_connect_rate=None): + """ + :param inputs: input tensor + :param drop_connect_rate: drop connect rate (float, between 0 and 1) + :return: output of block + """ + + # Expansion and Depthwise Convolution + x = inputs + if self._block_args.expand_ratio != 1: + x = self._swish(self._bn0(self._expand_conv(inputs))) + x = self._swish(self._bn1(self._depthwise_conv(x))) + + # Squeeze and Excitation + if self.has_se: + x_squeezed = F.adaptive_avg_pool2d(x, 1) + x_squeezed = self._se_expand( + self._swish(self._se_reduce(x_squeezed))) + x = F.sigmoid(x_squeezed) * x + + x = self._bn2(self._project_conv(x)) + + # Skip connection and drop connect + input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters + if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters: + if drop_connect_rate: + x = drop_connect( + x, prob=drop_connect_rate, training=self.training) + x = x + inputs # skip connection + return x + + def set_swish(self, memory_efficient=True): + """Sets swish function as memory efficient (for training) or standard (for export)""" + self._swish = nn.Hardswish() if memory_efficient else nn.Swish() + + +class EfficientNet(nn.Layer): + """ + An EfficientNet model. 
Most easily loaded with the .from_name or .from_pretrained methods + + Args: + blocks_args (list): A list of BlockArgs to construct blocks + global_params (namedtuple): A set of GlobalParams shared between blocks + + Example: + model = EfficientNet.from_pretrained('efficientnet-b0') + + """ + + def __init__(self, blocks_args=None, global_params=None): + super().__init__() + assert isinstance(blocks_args, list), 'blocks_args should be a list' + assert len(blocks_args) > 0, 'block args must be greater than 0' + self._global_params = global_params + self._blocks_args = blocks_args + + # Get static or dynamic convolution depending on image size + Conv2d = get_same_padding_conv2d(image_size=global_params.image_size) + + # Batch norm parameters + bn_mom = self._global_params.batch_norm_momentum + bn_eps = self._global_params.batch_norm_epsilon + + # Stem + in_channels = 3 # rgb + out_channels = round_filters( + 32, self._global_params) # number of output channels + self._conv_stem = Conv2d( + in_channels, out_channels, kernel_size=3, stride=2, bias_attr=False) + self._bn0 = nn.BatchNorm2D( + num_features=out_channels, momentum=bn_mom, epsilon=bn_eps) + + # Build blocks + self._blocks = nn.LayerList([]) + for block_args in self._blocks_args: + + # Update block input and output filters based on depth multiplier. + block_args = block_args._replace( + input_filters=round_filters(block_args.input_filters, + self._global_params), + output_filters=round_filters(block_args.output_filters, + self._global_params), + num_repeat=round_repeats(block_args.num_repeat, + self._global_params)) + + # The first block needs to take care of stride and filter size increase. + self._blocks.append(MBConvBlock(block_args, self._global_params)) + if block_args.num_repeat > 1: + block_args = block_args._replace( + input_filters=block_args.output_filters, stride=1) + for _ in range(block_args.num_repeat - 1): + self._blocks.append( + MBConvBlock(block_args, self._global_params)) + + # Head + in_channels = block_args.output_filters # output of final block + out_channels = round_filters(1280, self._global_params) + self._conv_head = Conv2d( + in_channels, out_channels, kernel_size=1, bias_attr=False) + self._bn1 = nn.BatchNorm2D( + num_features=out_channels, momentum=bn_mom, epsilon=bn_eps) + + # Final linear layer + self._avg_pooling = nn.AdaptiveAvgPool2D(1) + self._dropout = nn.Dropout(self._global_params.dropout_rate) + self._fc = nn.Linear(out_channels, self._global_params.num_classes) + self._swish = nn.Hardswish() + + def set_swish(self, memory_efficient=True): + """Sets swish function as memory efficient (for training) or standard (for export)""" + self._swish = nn.Hardswish() if memory_efficient else nn.Swish() + for block in self._blocks: + block.set_swish(memory_efficient) + + def extract_features(self, inputs): + """ Returns output of the final convolution layer """ + + # Stem + x = self._swish(self._bn0(self._conv_stem(inputs))) + + # Blocks + for idx, block in enumerate(self._blocks): + drop_connect_rate = self._global_params.drop_connect_rate + if drop_connect_rate: + drop_connect_rate *= float(idx) / len(self._blocks) + x = block(x, drop_connect_rate=drop_connect_rate) + + # Head + x = self._swish(self._bn1(self._conv_head(x))) + + return x + + def forward(self, inputs): + """ Calls extract_features to extract features, applies final linear layer, and returns logits. 
""" + bs = inputs.shape[0] + # Convolution layers + x = self.extract_features(inputs) + + # Pooling and final linear layer + x = self._avg_pooling(x) + x = paddle.reshape(x, (bs, -1)) + x = self._dropout(x) + x = self._fc(x) + return x + + @classmethod + def from_name(cls, model_name, override_params=None): + cls._check_model_name_is_valid(model_name) + blocks_args, global_params = get_model_params(model_name, + override_params) + return cls(blocks_args, global_params) + + @classmethod + def from_pretrained(cls, + model_name, + advprop=False, + num_classes=1000, + in_channels=3): + model = cls.from_name( + model_name, override_params={'num_classes': num_classes}) + load_pretrained_weights( + model, model_name, load_fc=(num_classes == 1000), advprop=advprop) + if in_channels != 3: + Conv2d = get_same_padding_conv2d( + image_size=model._global_params.image_size) + out_channels = round_filters(32, model._global_params) + model._conv_stem = Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=2, + bias_attr=False) + return model + + @classmethod + def get_image_size(cls, model_name): + cls._check_model_name_is_valid(model_name) + _, _, res, _ = efficientnet_params(model_name) + return res + + @classmethod + def _check_model_name_is_valid(cls, model_name): + """ Validates model name. """ + valid_models = ['efficientnet-b' + str(i) for i in range(9)] + if model_name not in valid_models: + raise ValueError('model_name should be one of: ' + ', '.join( + valid_models)) diff --git a/paddle/infrt/tests/models/efficientnet-b4/net/utils.py b/paddle/infrt/tests/models/efficientnet-b4/net/utils.py new file mode 100644 index 0000000000000..3bf8b4eb73022 --- /dev/null +++ b/paddle/infrt/tests/models/efficientnet-b4/net/utils.py @@ -0,0 +1,385 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import math +from functools import partial +import collections + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +# Parameters for the entire model (stem, all blocks, and head) +GlobalParams = collections.namedtuple('GlobalParams', [ + 'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate', 'num_classes', + 'width_coefficient', 'depth_coefficient', 'depth_divisor', 'min_depth', + 'drop_connect_rate', 'image_size' +]) + +# Parameters for an individual model block +BlockArgs = collections.namedtuple('BlockArgs', [ + 'kernel_size', 'num_repeat', 'input_filters', 'output_filters', + 'expand_ratio', 'id_skip', 'stride', 'se_ratio' +]) + +# Change namedtuple defaults +GlobalParams.__new__.__defaults__ = (None, ) * len(GlobalParams._fields) +BlockArgs.__new__.__defaults__ = (None, ) * len(BlockArgs._fields) + + +def round_filters(filters, global_params): + """ Calculate and round number of filters based on depth multiplier. 
""" + multiplier = global_params.width_coefficient + if not multiplier: + return filters + divisor = global_params.depth_divisor + min_depth = global_params.min_depth + filters *= multiplier + min_depth = min_depth or divisor + new_filters = max(min_depth, + int(filters + divisor / 2) // divisor * divisor) + if new_filters < 0.9 * filters: # prevent rounding by more than 10% + new_filters += divisor + return int(new_filters) + + +def round_repeats(repeats, global_params): + """ Round number of filters based on depth multiplier. """ + multiplier = global_params.depth_coefficient + if not multiplier: + return repeats + return int(math.ceil(multiplier * repeats)) + + +def drop_connect(inputs, prob, training): + """Drop input connection""" + if not training: + return inputs + keep_prob = 1.0 - prob + inputs_shape = paddle.shape(inputs) + random_tensor = keep_prob + paddle.rand(shape=[inputs_shape[0], 1, 1, 1]) + binary_tensor = paddle.floor(random_tensor) + output = inputs / keep_prob * binary_tensor + return output + + +def get_same_padding_conv2d(image_size=None): + """ Chooses static padding if you have specified an image size, and dynamic padding otherwise. + Static padding is necessary for ONNX exporting of models. """ + if image_size is None: + return Conv2dDynamicSamePadding + else: + return partial(Conv2dStaticSamePadding, image_size=image_size) + + +class Conv2dDynamicSamePadding(nn.Conv2D): + """ 2D Convolutions like TensorFlow, for a dynamic image size """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + dilation=1, + groups=1, + bias_attr=None): + super().__init__( + in_channels, + out_channels, + kernel_size, + stride, + 0, + dilation, + groups, + bias_attr=bias_attr) + self.stride = self._stride if len( + self._stride) == 2 else [self._stride[0]] * 2 + + def forward(self, x): + ih, iw = x.shape[-2:] + kh, kw = self.weight.shape[-2:] + sh, sw = self.stride + oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) + pad_h = max((oh - 1) * self.stride[0] + + (kh - 1) * self._dilation[0] + 1 - ih, 0) + pad_w = max((ow - 1) * self.stride[1] + + (kw - 1) * self._dilation[1] + 1 - iw, 0) + if pad_h > 0 or pad_w > 0: + x = F.pad(x, [ + pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2 + ]) + return F.conv2d(x, self.weight, self.bias, self.stride, self._padding, + self._dilation, self._groups) + + +class Conv2dStaticSamePadding(nn.Conv2D): + """ 2D Convolutions like TensorFlow, for a fixed image size""" + + def __init__(self, + in_channels, + out_channels, + kernel_size, + image_size=None, + **kwargs): + if 'stride' in kwargs and isinstance(kwargs['stride'], list): + kwargs['stride'] = kwargs['stride'][0] + super().__init__(in_channels, out_channels, kernel_size, **kwargs) + self.stride = self._stride if len( + self._stride) == 2 else [self._stride[0]] * 2 + + # Calculate padding based on image size and save it + assert image_size is not None + ih, iw = image_size if type( + image_size) == list else [image_size, image_size] + kh, kw = self.weight.shape[-2:] + sh, sw = self.stride + oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) + pad_h = max((oh - 1) * self.stride[0] + + (kh - 1) * self._dilation[0] + 1 - ih, 0) + pad_w = max((ow - 1) * self.stride[1] + + (kw - 1) * self._dilation[1] + 1 - iw, 0) + if pad_h > 0 or pad_w > 0: + self.static_padding = nn.Pad2D([ + pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2 + ]) + else: + self.static_padding = Identity() + + def forward(self, x): + x = self.static_padding(x) + x = F.conv2d(x, 
self.weight, self.bias, self.stride, self._padding, + self._dilation, self._groups) + return x + + +class Identity(nn.Layer): + def __init__(self, ): + super().__init__() + + def forward(self, x): + return x + + +def efficientnet_params(model_name): + """ Map EfficientNet model name to parameter coefficients. """ + params_dict = { + # Coefficients: width,depth,resolution,dropout + 'efficientnet-b0': (1.0, 1.0, 224, 0.2), + 'efficientnet-b1': (1.0, 1.1, 240, 0.2), + 'efficientnet-b2': (1.1, 1.2, 260, 0.3), + 'efficientnet-b3': (1.2, 1.4, 300, 0.3), + 'efficientnet-b4': (1.4, 1.8, 380, 0.4), + 'efficientnet-b5': (1.6, 2.2, 456, 0.4), + 'efficientnet-b6': (1.8, 2.6, 528, 0.5), + 'efficientnet-b7': (2.0, 3.1, 600, 0.5), + 'efficientnet-b8': (2.2, 3.6, 672, 0.5), + 'efficientnet-l2': (4.3, 5.3, 800, 0.5), + } + return params_dict[model_name] + + +class BlockDecoder(object): + """ Block Decoder for readability, straight from the official TensorFlow repository """ + + @staticmethod + def _decode_block_string(block_string): + """ Gets a block through a string notation of arguments. """ + assert isinstance(block_string, str) + + ops = block_string.split('_') + options = {} + for op in ops: + splits = re.split(r'(\d.*)', op) + if len(splits) >= 2: + key, value = splits[:2] + options[key] = value + + # Check stride + assert (('s' in options and len(options['s']) == 1) or + (len(options['s']) == 2 and options['s'][0] == options['s'][1])) + + return BlockArgs( + kernel_size=int(options['k']), + num_repeat=int(options['r']), + input_filters=int(options['i']), + output_filters=int(options['o']), + expand_ratio=int(options['e']), + id_skip=('noskip' not in block_string), + se_ratio=float(options['se']) if 'se' in options else None, + stride=[int(options['s'][0])]) + + @staticmethod + def _encode_block_string(block): + """Encodes a block to a string.""" + args = [ + 'r%d' % block.num_repeat, 'k%d' % block.kernel_size, 's%d%d' % + (block.strides[0], block.strides[1]), 'e%s' % block.expand_ratio, + 'i%d' % block.input_filters, 'o%d' % block.output_filters + ] + if 0 < block.se_ratio <= 1: + args.append('se%s' % block.se_ratio) + if block.id_skip is False: + args.append('noskip') + return '_'.join(args) + + @staticmethod + def decode(string_list): + """ + Decodes a list of string notations to specify blocks inside the network. + + :param string_list: a list of strings, each string is a notation of block + :return: a list of BlockArgs namedtuples of block args + """ + assert isinstance(string_list, list) + blocks_args = [] + for block_string in string_list: + blocks_args.append(BlockDecoder._decode_block_string(block_string)) + return blocks_args + + @staticmethod + def encode(blocks_args): + """ + Encodes a list of BlockArgs to a list of strings. + + :param blocks_args: a list of BlockArgs namedtuples of block args + :return: a list of strings, each string is a notation of block + """ + block_strings = [] + for block in blocks_args: + block_strings.append(BlockDecoder._encode_block_string(block)) + return block_strings + + +def efficientnet(width_coefficient=None, + depth_coefficient=None, + dropout_rate=0.2, + drop_connect_rate=0.2, + image_size=None, + num_classes=1000): + """ Get block arguments according to parameter and coefficients. 
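+    For example, the first block string below, 'r1_k3_s11_e1_i32_o16_se0.25',
+    decodes via BlockDecoder to num_repeat=1, kernel_size=3, stride=[1],
+    expand_ratio=1, input_filters=32, output_filters=16, se_ratio=0.25.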
""" + blocks_args = [ + 'r1_k3_s11_e1_i32_o16_se0.25', + 'r2_k3_s22_e6_i16_o24_se0.25', + 'r2_k5_s22_e6_i24_o40_se0.25', + 'r3_k3_s22_e6_i40_o80_se0.25', + 'r3_k5_s11_e6_i80_o112_se0.25', + 'r4_k5_s22_e6_i112_o192_se0.25', + 'r1_k3_s11_e6_i192_o320_se0.25', + ] + blocks_args = BlockDecoder.decode(blocks_args) + + global_params = GlobalParams( + batch_norm_momentum=0.99, + batch_norm_epsilon=1e-3, + dropout_rate=dropout_rate, + drop_connect_rate=drop_connect_rate, + num_classes=num_classes, + width_coefficient=width_coefficient, + depth_coefficient=depth_coefficient, + depth_divisor=8, + min_depth=None, + image_size=image_size, ) + + return blocks_args, global_params + + +def get_model_params(model_name, override_params): + """ Get the block args and global params for a given model """ + if model_name.startswith('efficientnet'): + w, d, s, p = efficientnet_params(model_name) + blocks_args, global_params = efficientnet( + width_coefficient=w, + depth_coefficient=d, + dropout_rate=p, + image_size=s) + else: + raise NotImplementedError('model name is not pre-defined: %s' % + model_name) + if override_params: + global_params = global_params._replace(**override_params) + return blocks_args, global_params + + +url_map = { + 'efficientnet-b0': + '/home/aistudio/data/weights/efficientnet-b0-355c32eb.pdparams', + 'efficientnet-b1': + '/home/aistudio/data/weights/efficientnet-b1-f1951068.pdparams', + 'efficientnet-b2': + '/home/aistudio/data/weights/efficientnet-b2-8bb594d6.pdparams', + 'efficientnet-b3': + '/home/aistudio/data/weights/efficientnet-b3-5fb5a3c3.pdparams', + 'efficientnet-b4': + '/home/aistudio/data/weights/efficientnet-b4-6ed6700e.pdparams', + 'efficientnet-b5': + '/home/aistudio/data/weights/efficientnet-b5-b6417697.pdparams', + 'efficientnet-b6': + '/home/aistudio/data/weights/efficientnet-b6-c76e70fd.pdparams', + 'efficientnet-b7': + '/home/aistudio/data/weights/efficientnet-b7-dcc49843.pdparams', +} + +url_map_advprop = { + 'efficientnet-b0': + '/home/aistudio/data/weights/adv-efficientnet-b0-b64d5a18.pdparams', + 'efficientnet-b1': + '/home/aistudio/data/weights/adv-efficientnet-b1-0f3ce85a.pdparams', + 'efficientnet-b2': + '/home/aistudio/data/weights/adv-efficientnet-b2-6e9d97e5.pdparams', + 'efficientnet-b3': + '/home/aistudio/data/weights/adv-efficientnet-b3-cdd7c0f4.pdparams', + 'efficientnet-b4': + '/home/aistudio/data/weights/adv-efficientnet-b4-44fb3a87.pdparams', + 'efficientnet-b5': + '/home/aistudio/data/weights/adv-efficientnet-b5-86493f6b.pdparams', + 'efficientnet-b6': + '/home/aistudio/data/weights/adv-efficientnet-b6-ac80338e.pdparams', + 'efficientnet-b7': + '/home/aistudio/data/weights/adv-efficientnet-b7-4652b6dd.pdparams', + 'efficientnet-b8': + '/home/aistudio/data/weights/adv-efficientnet-b8-22a8fe65.pdparams', +} + + +def load_pretrained_weights(model, + model_name, + weights_path=None, + load_fc=True, + advprop=False): + """Loads pretrained weights from weights path or download using url. + Args: + model (Module): The whole model of efficientnet. + model_name (str): Model name of efficientnet. + weights_path (None or str): + str: path to pretrained weights file on the local disk. + None: use pretrained weights downloaded from the Internet. + load_fc (bool): Whether to load pretrained weights for fc layer at the end of the model. + advprop (bool): Whether to load pretrained weights + trained with advprop (valid when weights_path is None). 
+ """ + + # AutoAugment or Advprop (different preprocessing) + url_map_ = url_map_advprop if advprop else url_map + state_dict = paddle.load(url_map_[model_name]) + + if load_fc: + model.set_state_dict(state_dict) + else: + state_dict.pop('_fc.weight') + state_dict.pop('_fc.bias') + model.set_state_dict(state_dict) + + print('Loaded pretrained weights for {}'.format(model_name)) diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index e4a97e2c16f16..2b0aea9e1ec47 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -166,7 +166,7 @@ class PADDLE_API Tensor final { * * @return phi::DDim */ - phi::DDim dims() const; + const phi::DDim& dims() const; /** * @brief Return the shape (dimensions) of Tensor. @@ -260,7 +260,7 @@ class PADDLE_API Tensor final { * * @return Place */ - Place place() const; + const Place& place() const; /** * @brief Determine whether the tensor device is CPU @@ -421,7 +421,7 @@ class PADDLE_API Tensor final { * @param blocking, Should we copy this in sync way. * @return Tensor */ - Tensor copy_to(Place place, bool blocking) const; + Tensor copy_to(const Place& place, bool blocking) const; /** * @brief Transfer the source Tensor to current Tensor. diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index d4e92ded324da..65cb37d414299 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -36,11 +36,17 @@ inline bool NeedTransformDataType(const DataType& input, inline bool NeedTransformPlace(const paddle::platform::Place& input, const Backend& target, const TransformFlag& transform_flag) { - bool ret = - input.GetType() == AllocationType::GPUPINNED || - (transform_flag.need_trans_backend() && target != Backend::ALL_BACKEND && - phi::TransToPhiBackend(input) != - (target != Backend::GPUDNN ? target : Backend::GPU)); + // NOTE(dev): The default value of TransformFlag is True, if it is set with + // False + // somewhere such as api.yaml or backward.yaml that means we should skip data + // transform. Because "stop_transform_" has highest priority. + if (!transform_flag.need_trans_backend()) { + return false; + } + bool ret = input.GetType() == AllocationType::GPUPINNED || + (target != Backend::ALL_BACKEND && + phi::TransToPhiBackend(input) != + (target != Backend::GPUDNN ? 
target : Backend::GPU)); return ret; } diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 67c1b711fc997..be0a937c91e4f 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -110,13 +110,10 @@ int64_t Tensor::numel() const { return impl_->numel(); } int64_t Tensor::size() const { return impl_->numel(); } -phi::DDim Tensor::dims() const { return impl_->dims(); } +const phi::DDim &Tensor::dims() const { return impl_->dims(); } std::vector Tensor::shape() const { auto dims = impl_->dims(); - if (dims.size() == 1 && dims.at(0) == 0) { - return {}; - } return phi::vectorize(dims); } @@ -161,7 +158,7 @@ bool Tensor::is_string_tensor() const { } /* Part 3: Device and Backend methods */ -Place Tensor::place() const { +const Place &Tensor::place() const { PADDLE_ENFORCE_NOT_NULL( impl_, phi::errors::PermissionDenied( diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index 463b72d0dbf5b..5285392b4a6ac 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -27,13 +27,13 @@ namespace paddle { namespace experimental { // declare cast api Tensor cast(const Tensor &x, DataType out_dtype); -Tensor copy_to(const Tensor &x, Place place, bool blocking); +Tensor copy_to(const Tensor &x, const Place &place, bool blocking); Tensor Tensor::cast(DataType target_type) const { return experimental::cast(*this, target_type); } -Tensor Tensor::copy_to(Place place, bool blocking) const { +Tensor Tensor::copy_to(const Place &place, bool blocking) const { return experimental::copy_to(*this, place, blocking); } diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index bfa45869f5ff6..3e1787cb12cfa 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -47,6 +47,7 @@ enum class Backend : uint8_t { GPU, XPU, // XPU currently does not exist at the same time as CUDA NPU, // NPU currently does not exist at the same time as CUDA + MLU, // MLU currently does not exist at the same time as CUDA // the third library backend MKLDNN, @@ -114,6 +115,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { case Backend::NPU: os << "NPU"; break; + case Backend::MLU: + os << "MLU"; + break; case Backend::MKLDNN: os << "MKLDNN"; break; @@ -154,6 +158,8 @@ inline Backend StringToBackend(const char* backend_cstr) { return Backend::XPU; } else if (s == std::string("NPU")) { return Backend::NPU; + } else if (s == std::string("MLU")) { + return Backend::MLU; } else if (s == std::string("MKLDNN")) { return Backend::MKLDNN; } else if (s == std::string("GPUDNN")) { diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h index ed9fb7876425d..199ee81f27200 100644 --- a/paddle/phi/common/place.h +++ b/paddle/phi/common/place.h @@ -213,9 +213,6 @@ std::ostream& operator<<(std::ostream&, const Place&); namespace paddle { namespace experimental { using AllocationType = phi::AllocationType; -using Place = phi::Place; -using CPUPlace = phi::CPUPlace; -using GPUPlace = phi::GPUPlace; using GPUPinnedPlace = phi::GPUPinnedPlace; using XPUPlace = phi::XPUPlace; using NPUPlace = phi::NPUPlace; diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index 23574e98fbf17..dcf1826012c13 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -16,7 +16,7 @@ cc_library(tensor_base SRCS tensor_base.cc allocator.cc DEPS phi_enforce) cc_library(tensor_meta SRCS tensor_meta.cc DEPS phi_enforce) cc_library(lod_utils SRCS 
lod_utils.cc DEPS phi_enforce) -cc_library(dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc DEPS fluid_convert_utils tensor_meta tensor_base) +cc_library(dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc DEPS convert_utils fluid_convert_utils tensor_meta tensor_base) cc_library(sparse_coo_tensor SRCS sparse_coo_tensor.cc DEPS tensor_meta tensor_base) cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_base) cc_library(string_tensor SRCS string_tensor.cc DEPS convert_utils tensor_meta tensor_base) diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index 4fa11ac7860ef..4388bd1f751cf 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -40,6 +40,8 @@ Backend TransToPhiBackend(const phi::Place& place) { return Backend::NPU; } else if (allocation_type == phi::AllocationType::IPU) { return Backend::IPU; + } else if (allocation_type == phi::AllocationType::MLU) { + return Backend::MLU; } else if (allocation_type == phi::AllocationType::CUSTOM) { return static_cast( static_cast(Backend::NUM_BACKENDS) + @@ -119,4 +121,24 @@ const std::string& TransToFluidOpName(const std::string& phi_kernel_name) { return phi_kernel_name; } +#ifdef PADDLE_WITH_MKLDNN +dnnl::memory::data_type TransToMKLDNNDataType( + const paddle::experimental::DataType& dtype) { + switch (dtype) { + case DataType::FLOAT32: + return dnnl::memory::data_type::f32; + case DataType::BFLOAT16: + return dnnl::memory::data_type::bf16; + case DataType::INT8: + return dnnl::memory::data_type::s8; + case DataType::UINT8: + return dnnl::memory::data_type::u8; + case DataType::INT32: + return dnnl::memory::data_type::s32; + default: + return dnnl::memory::data_type::undef; + } +} +#endif + } // namespace phi diff --git a/paddle/phi/core/compat/convert_utils.h b/paddle/phi/core/compat/convert_utils.h index 5982ab0deff83..9d5f30d41273c 100644 --- a/paddle/phi/core/compat/convert_utils.h +++ b/paddle/phi/core/compat/convert_utils.h @@ -20,6 +20,10 @@ limitations under the License. 
*/ #include "paddle/phi/common/place.h" #include "paddle/phi/core/tensor_meta.h" +#ifdef PADDLE_WITH_MKLDNN +#include "dnnl.hpp" +#endif + namespace phi { const std::string& TransToPhiKernelName(const std::string& fluid_op_name); @@ -28,4 +32,9 @@ const std::string& TransToFluidOpName(const std::string& phi_kernel_name); Backend TransToPhiBackend(const phi::Place& place); phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id = true); +#ifdef PADDLE_WITH_MKLDNN +dnnl::memory::data_type TransToMKLDNNDataType( + const paddle::experimental::DataType& dtype); +#endif + } // namespace phi diff --git a/paddle/phi/core/dense_tensor.cc b/paddle/phi/core/dense_tensor.cc index 1bfe29bc9d3ba..2b9a5f5e0ea0c 100644 --- a/paddle/phi/core/dense_tensor.cc +++ b/paddle/phi/core/dense_tensor.cc @@ -57,6 +57,7 @@ DenseTensor::DenseTensor(const DenseTensor& other) : meta_(other.meta()) { #ifdef PADDLE_WITH_MKLDNN format_ = other.format_; + mem_desc_ = other.mem_desc_; #endif } @@ -66,6 +67,7 @@ DenseTensor& DenseTensor::operator=(const DenseTensor& other) { inplace_version_counter_ = other.inplace_version_counter_; #ifdef PADDLE_WITH_MKLDNN format_ = other.format_; + mem_desc_ = other.mem_desc_; #endif return *this; } @@ -74,6 +76,10 @@ DenseTensor& DenseTensor::operator=(DenseTensor&& other) { meta_ = std::move(other.meta_); std::swap(holder_, other.holder_); std::swap(inplace_version_counter_, other.inplace_version_counter_); +#ifdef PADDLE_WITH_MKLDNN + format_ = other.format_; + mem_desc_ = other.mem_desc_; +#endif return *this; } diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h index ef91319e1c961..9861bd68e4a9e 100644 --- a/paddle/phi/core/dense_tensor.h +++ b/paddle/phi/core/dense_tensor.h @@ -207,6 +207,9 @@ following codes there. * this field. */ dnnl::memory::format_tag format_ = dnnl::memory::format_tag::undef; + + /// \brief memory descriptor of tensor which have layout set as kMKLDNN + dnnl::memory::desc mem_desc_; #endif #ifndef PADDLE_WITH_CUSTOM_KERNEL diff --git a/paddle/phi/core/dense_tensor.inl b/paddle/phi/core/dense_tensor.inl index c6ca3c00cb558..93513067a268b 100644 --- a/paddle/phi/core/dense_tensor.inl +++ b/paddle/phi/core/dense_tensor.inl @@ -20,6 +20,7 @@ limitations under the License. */ Will be adjusted/removed/moved in the near future */ + public: /* @jim19930609: Remove dependency on protobuf after Tensor Unification. */ @@ -127,7 +128,14 @@ following codes there. #ifdef PADDLE_WITH_MKLDNN public: -inline dnnl::memory::format_tag format() const { return format_; } + dnnl::memory::desc mem_desc() const; + +inline void set_mem_desc(const dnnl::memory::desc& mem_desc) { + mem_desc_ = mem_desc; + meta_.layout = DataLayout::kMKLDNN; +} + +dnnl::memory::format_tag format() const; inline void set_format(const dnnl::memory::format_tag format) { format_ = format; diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc index 5ee83089589e8..46c45837a5372 100644 --- a/paddle/phi/core/dense_tensor_impl.cc +++ b/paddle/phi/core/dense_tensor_impl.cc @@ -21,6 +21,10 @@ limitations under the License. 
*/ #include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/compat/convert_utils.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_utils.h" +#endif + namespace phi { /* --------------------------- */ /* From framework::Tensor */ @@ -354,6 +358,19 @@ std::vector DenseTensor::Chunk(int64_t chunks, return Split(split_size, axis); } +#ifdef PADDLE_WITH_MKLDNN +dnnl::memory::desc DenseTensor::mem_desc() const { + return mem_desc_ ? mem_desc_ + : dnnl::memory::desc(phi::vectorize(meta_.dims), + phi::TransToMKLDNNDataType(meta_.dtype), + format_); +} + +dnnl::memory::format_tag DenseTensor::format() const { + return mem_desc_ ? paddle::platform::GetMKLDNNFormat(mem_desc_) : format_; +} +#endif + DenseTensor& DenseTensor::ShareDataWith(const DenseTensor& src) { src.check_memory_size(); // Preserve LoD diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 6cf805bc1a127..519d21b323fc2 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -1489,6 +1489,43 @@ void InterpolateInferMeta( } } +void LogspaceInferMeta(const MetaTensor& start, + const MetaTensor& stop, + const MetaTensor& number, + const MetaTensor& base, + MetaTensor* out) { + auto s_dims = start.dims(); + PADDLE_ENFORCE_EQ( + (s_dims.size() == 1) && (s_dims[0] == 1), + true, + phi::errors::InvalidArgument("The shape of Input(Start) must be [1]," + "but received input shape is [%s].", + s_dims)); + auto e_dims = stop.dims(); + PADDLE_ENFORCE_EQ( + (e_dims.size() == 1) && (e_dims[0] == 1), + true, + phi::errors::InvalidArgument("The shape of Input(Stop) must be [1]," + "but received input shape is [%s].", + e_dims)); + auto num_dims = number.dims(); + PADDLE_ENFORCE_EQ( + (num_dims.size() == 1) && (num_dims[0] == 1), + true, + phi::errors::InvalidArgument("The shape of Input(Num) must be [1]," + "but received input shape is [%s].", + num_dims)); + auto b_dims = base.dims(); + PADDLE_ENFORCE_EQ( + (b_dims.size() == 1) && (b_dims[0] == 1), + true, + phi::errors::InvalidArgument("The shape of Input(Base) must be [1]," + "but received input shape is [%s].", + b_dims)); + out->set_dims(phi::make_ddim({-1})); + out->set_dtype(start.dtype()); +} + void MeshgridInferMeta(const std::vector& inputs, std::vector outputs) { const size_t inputs_num = inputs.size(); diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 557855219bb51..65b5819b602ba 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -228,6 +228,12 @@ void InterpolateInferMeta( MetaTensor* output, MetaConfig config = MetaConfig()); +void LogspaceInferMeta(const MetaTensor& start, + const MetaTensor& stop, + const MetaTensor& number, + const MetaTensor& base, + MetaTensor* out); + void MeshgridInferMeta(const std::vector& inputs, std::vector outputs); diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 7b50a37ac149f..e3e1211e3ece8 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1284,24 +1284,33 @@ void Pad3dInferMeta(const MetaTensor& x, "5, but received %d. 
", x_dim.size())); - std::vector out_dims(x_dim.size()); + std::vector out_dims(x_dim.size(), -1); out_dims[0] = x_dim[0]; + auto& paddings = paddings_int_array.GetData(); + if (data_format == "NCDHW") { + out_dims[1] = x_dim[1]; + } else { + out_dims[4] = x_dim[4]; + } if (paddings_int_array.FromTensor()) { if (config.is_runtime) { PADDLE_ENFORCE_EQ( - paddings_int_array.GetData().size(), + paddings.size(), 6, errors::InvalidArgument("Shape of Input(Paddings) should be equal to " "[6], but received [%d].", - paddings_int_array.GetData().size())); + paddings.size())); + if (data_format == "NCDHW") { + out_dims[2] = x_dim[2] + paddings[4] + paddings[5]; + out_dims[3] = x_dim[3] + paddings[2] + paddings[3]; + out_dims[4] = x_dim[4] + paddings[0] + paddings[1]; + } else { + out_dims[1] = x_dim[1] + paddings[4] + paddings[5]; + out_dims[2] = x_dim[2] + paddings[2] + paddings[3]; + out_dims[3] = x_dim[3] + paddings[0] + paddings[1]; + } } - out_dims[1] = x_dim[1]; - out_dims[2] = x_dim[2]; - out_dims[3] = x_dim[3]; - out_dims[4] = x_dim[4]; } else { - auto paddings = paddings_int_array.GetData(); - PADDLE_ENFORCE_EQ( paddings.size(), 6, @@ -1309,7 +1318,6 @@ void Pad3dInferMeta(const MetaTensor& x, "Size of paddings should be equal to 6, but received %d.", static_cast(paddings.size()))); if (data_format == "NCDHW") { - out_dims[1] = x_dim[1]; // channel out_dims[2] = ((!config.is_runtime) && (x_dim[2] < 0)) ? x_dim[2] : (x_dim[2] + paddings[4] + paddings[5]); // depth @@ -1322,8 +1330,6 @@ void Pad3dInferMeta(const MetaTensor& x, ? x_dim[4] : (x_dim[4] + paddings[0] + paddings[1]); // width } else { // NDHWC - out_dims[4] = x_dim[4]; // channel - out_dims[1] = ((!config.is_runtime) && (x_dim[1] < 0)) ? x_dim[1] : (x_dim[1] + paddings[4] + paddings[5]); // depth diff --git a/paddle/phi/kernels/cpu/elementwise_add_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_add_grad_kernel.cc new file mode 100644 index 0000000000000..f8a89b997b413 --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_add_grad_kernel.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/elementwise_add_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void AddGradFunc(const CPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { + ElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy); + } else { + ElemwiseExplicitGradCompute, IdentityGrad>( + dev_ctx, + x, + y, + out, + dout, + axis, + dx, + dy, + IdentityGrad(), + IdentityGrad()); + } +} + +template +void AddGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + phi::AddGradImpl(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc); +} + +template +void AddDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* ddout) { + phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); +} + +template +void AddTripleGradKernel(const Context& dev_ctx, + const DenseTensor& ddx, + const DenseTensor& ddy, + const DenseTensor& d_ddout, + int axis, + DenseTensor* d_ddx, + DenseTensor* d_ddy) { + phi::AddGradImpl( + dev_ctx, ddx, ddy, d_ddout, axis, d_ddx, d_ddy, AddGradFunc); +} + +} // namespace phi + +PD_REGISTER_KERNEL(add_grad, + CPU, + ALL_LAYOUT, + phi::AddGradKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(add_double_grad, + CPU, + ALL_LAYOUT, + phi::AddDoubleGradKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(add_triple_grad, + CPU, + ALL_LAYOUT, + phi::AddTripleGradKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/elementwise_add_kernel.cc b/paddle/phi/kernels/cpu/elementwise_add_kernel.cc new file mode 100644 index 0000000000000..6070264547249 --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_add_kernel.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +// Create the definition of Add +DEFINE_CPU_ELEMENTWISE_OP(Add) + +template +void AddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + AddRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 +// using bfloat16 = ::phi::dtype::bfloat16; + +PD_REGISTER_KERNEL(add_raw, + CPU, + ALL_LAYOUT, + phi::AddRawKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128) {} + +PD_REGISTER_KERNEL(add, + CPU, + ALL_LAYOUT, + phi::AddKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/phi/kernels/cpu/elementwise_divide_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_divide_grad_kernel.cc new file mode 100644 index 0000000000000..b6541ec0e6818 --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_divide_grad_kernel.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/elementwise_divide_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void DivideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + funcs::ElementwiseGradPreProcess(dout, dx); + phi::funcs::ElemwiseGradCompute, DivGradDY>( + dev_ctx, x, y, out, dout, axis, dx, dy, DivGradDX(), DivGradDY()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(divide_grad, + CPU, + ALL_LAYOUT, + phi::DivideGradKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(divide_double_grad, + CPU, + ALL_LAYOUT, + phi::DivideDoubleGradKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc b/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc new file mode 100644 index 0000000000000..d380621818b35 --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +template +void DivideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + // allocate memory for out + dev_ctx.template Alloc(out); + if (x.dims() == y.dims() && std::is_floating_point::value) { + SameDimsElementwiseCompute>()( + dev_ctx, x, y, out); + } else { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + if (x_dims.size() >= y_dims.size()) { + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::DivideFunctor(), out); + } else { + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::InverseDivideFunctor(), out); + } + } +} + +template +void DivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + DivideRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 +// using bfloat16 = ::phi::dtype::bfloat16; + +PD_REGISTER_KERNEL(divide_raw, + CPU, + ALL_LAYOUT, + phi::DivideRawKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PD_REGISTER_KERNEL(divide, + CPU, + ALL_LAYOUT, + phi::DivideKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index 804b6449876e5..ee384cc75193c 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -23,118 +23,6 @@ namespace phi { -template -void AddGradFunc(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int axis = -1) { - if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - ElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy); - } else { - ElemwiseExplicitGradCompute, IdentityGrad>( - dev_ctx, - x, - y, - out, - dout, - axis, - dx, - dy, - IdentityGrad(), - IdentityGrad()); - } -} - -template -void AddGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - phi::AddGradImpl(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc); -} - -template -void AddDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, - int axis, - DenseTensor* ddout) { - phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, 
axis, ddout); -} - -template -void AddTripleGradKernel(const Context& dev_ctx, - const DenseTensor& ddx, - const DenseTensor& ddy, - const DenseTensor& d_ddout, - int axis, - DenseTensor* d_ddx, - DenseTensor* d_ddy) { - phi::AddGradImpl( - dev_ctx, ddx, ddy, d_ddout, axis, d_ddx, d_ddy, AddGradFunc); -} - -template -void SubtractGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - // skip out - auto* out = &dout; - ElementwiseSubGrad(dev_ctx, x, y, *out, dout, dx, dy, axis); -} - -template -void SubtractDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - paddle::optional ddx, - paddle::optional ddy, - const DenseTensor& dout, - int axis, - DenseTensor* ddout) { - phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); -} - -template -void DivideGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - funcs::ElementwiseGradPreProcess(dout, dx); - phi::funcs::ElemwiseGradCompute, DivGradDY>( - dev_ctx, x, y, out, dout, axis, dx, dy, DivGradDX(), DivGradDY()); -} - -template -void MultiplyGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - funcs::ElementwiseGradPreProcess(dout, dx); - auto* out = &dout; // out is not necessary - phi::funcs::ElemwiseGradCompute, MulGradDY>( - dev_ctx, x, y, *out, dout, axis, dx, dy, MulGradDX(), MulGradDY()); -} - template void MaximumGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -163,129 +51,6 @@ void MinimumGradKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL(add_grad, - CPU, - ALL_LAYOUT, - phi::AddGradKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(add_double_grad, - CPU, - ALL_LAYOUT, - phi::AddDoubleGradKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(add_triple_grad, - CPU, - ALL_LAYOUT, - phi::AddTripleGradKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(subtract_grad, - CPU, - ALL_LAYOUT, - phi::SubtractGradKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(subtract_double_grad, - CPU, - ALL_LAYOUT, - phi::SubtractDoubleGradKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(divide_grad, - CPU, - ALL_LAYOUT, - phi::DivideGradKernel, - float, - double, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(divide_double_grad, - CPU, - ALL_LAYOUT, - phi::DivideDoubleGradKernel, - float, - double, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(multiply_grad, - CPU, - ALL_LAYOUT, - phi::MultiplyGradKernel, - float, - double, - int, - int64_t, - bool, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(multiply_double_grad, - CPU, - ALL_LAYOUT, - phi::MultiplyDoubleGradKernel, - float, - double, - int, - int64_t, - bool, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - 
-PD_REGISTER_KERNEL(multiply_triple_grad, - CPU, - ALL_LAYOUT, - phi::MultiplyTripleGradKernel, - float, - double, - int, - int64_t, - bool, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - PD_REGISTER_KERNEL(fmax_grad, CPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc index 0cd236c9a8f04..286b0d0ffaad9 100644 --- a/paddle/phi/kernels/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc @@ -21,54 +21,6 @@ namespace phi { -#define DEFINE_CPU_ELEMENTWISE_OP(name) \ - template \ - void name##RawKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - dev_ctx.template Alloc(out); \ - if (x.dims() == y.dims()) { \ - SameDimsElementwiseCompute>()( \ - dev_ctx, x, y, out); \ - } else { \ - auto x_dims = x.dims(); \ - auto y_dims = y.dims(); \ - if (x_dims.size() >= y_dims.size()) { \ - funcs::ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, funcs::name##Functor(), out); \ - } else { \ - funcs::ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, funcs::Inverse##name##Functor(), out); \ - } \ - } \ - } - -template -void DivideRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - // allocate memory for out - dev_ctx.template Alloc(out); - if (x.dims() == y.dims() && std::is_floating_point::value) { - SameDimsElementwiseCompute>()( - dev_ctx, x, y, out); - } else { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - if (x_dims.size() >= y_dims.size()) { - funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::DivideFunctor(), out); - } else { - funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::InverseDivideFunctor(), out); - } - } -} - template void MaximumRawKernel(const Context& dev_ctx, const DenseTensor& x, @@ -155,15 +107,6 @@ void ElementwiseHeavisideRawKernel(const Context& dev_ctx, dev_ctx, x, y, axis, funcs::ElementwiseHeavisideFunctor(), out); } -// Create the definition of Add -DEFINE_CPU_ELEMENTWISE_OP(Add) - -// Create the definition of Subtract -DEFINE_CPU_ELEMENTWISE_OP(Subtract) - -// Create the definition of Multiply -DEFINE_CPU_ELEMENTWISE_OP(Multiply) - } // namespace phi using complex64 = ::phi::dtype::complex; @@ -178,51 +121,6 @@ PD_REGISTER_KERNEL( PD_REGISTER_KERNEL( fmin, CPU, ALL_LAYOUT, phi::FMinKernel, float, double, int, int64_t) {} -PD_REGISTER_KERNEL(add_raw, - CPU, - ALL_LAYOUT, - phi::AddRawKernel, - float, - double, - int16_t, - int, - int64_t, - complex64, - complex128) {} -PD_REGISTER_KERNEL(subtract_raw, - CPU, - ALL_LAYOUT, - phi::SubtractRawKernel, - float, - double, - int16_t, - int, - int64_t, - complex64, - complex128, - phi::dtype::bfloat16) {} -PD_REGISTER_KERNEL(divide_raw, - CPU, - ALL_LAYOUT, - phi::DivideRawKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PD_REGISTER_KERNEL(multiply_raw, - CPU, - ALL_LAYOUT, - phi::MultiplyRawKernel, - float, - double, - int, - int64_t, - bool, - complex64, - complex128, - phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(maximum_raw, CPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/cpu/elementwise_multiply_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_multiply_grad_kernel.cc new file mode 100644 index 0000000000000..6055541c805f0 --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_multiply_grad_kernel.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/elementwise_multiply_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void MultiplyGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + funcs::ElementwiseGradPreProcess(dout, dx); + auto* out = &dout; // out is not necessary + phi::funcs::ElemwiseGradCompute, MulGradDY>( + dev_ctx, x, y, *out, dout, axis, dx, dy, MulGradDX(), MulGradDY()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiply_grad, + CPU, + ALL_LAYOUT, + phi::MultiplyGradKernel, + float, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(multiply_double_grad, + CPU, + ALL_LAYOUT, + phi::MultiplyDoubleGradKernel, + float, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(multiply_triple_grad, + CPU, + ALL_LAYOUT, + phi::MultiplyTripleGradKernel, + float, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc b/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc new file mode 100644 index 0000000000000..2424a5330109c --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_multiply_kernel.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +// Create the definition of Multiply +DEFINE_CPU_ELEMENTWISE_OP(Multiply) + +template +void MultiplyKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + MultiplyRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 +// using bfloat16 = ::phi::dtype::bfloat16; + +PD_REGISTER_KERNEL(multiply_raw, + CPU, + ALL_LAYOUT, + phi::MultiplyRawKernel, + float, + double, + int, + int64_t, + bool, + complex64, + complex128, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(multiply, + CPU, + ALL_LAYOUT, + phi::MultiplyKernel, + float, + double, + int, + int64_t, + bool, + complex64, + complex128, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc new file mode 100644 index 0000000000000..c785eacb9a8bc --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_subtract_grad_kernel.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/elementwise_subtract_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void SubtractGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + // skip out + auto* out = &dout; + ElementwiseSubGrad(dev_ctx, x, y, *out, dout, dx, dy, axis); +} + +template +void SubtractDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + paddle::optional ddx, + paddle::optional ddy, + const DenseTensor& dout, + int axis, + DenseTensor* ddout) { + phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); +} + +} // namespace phi + +PD_REGISTER_KERNEL(subtract_grad, + CPU, + ALL_LAYOUT, + phi::SubtractGradKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(subtract_double_grad, + CPU, + ALL_LAYOUT, + phi::SubtractDoubleGradKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc b/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc new file mode 100644 index 0000000000000..0e97852ac33e1 --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_subtract_kernel.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +// Create the definition of Subtract +DEFINE_CPU_ELEMENTWISE_OP(Subtract) + +template +void SubtractKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + SubtractRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 +// using bfloat16 = ::phi::dtype::bfloat16; + +PD_REGISTER_KERNEL(subtract_raw, + CPU, + ALL_LAYOUT, + phi::SubtractRawKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128, + phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(subtract, + CPU, + ALL_LAYOUT, + phi::SubtractKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc b/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc index e18848af0dc08..b4321a85ab2ee 100644 --- a/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_sample_neighbors_kernel.cc @@ -39,17 +39,42 @@ void SampleUniqueNeighbors( } } +template +void SampleUniqueNeighborsWithEids( + bidiiter src_begin, + bidiiter src_end, + bidiiter eid_begin, + bidiiter eid_end, + int num_samples, + std::mt19937& rng, + std::uniform_int_distribution& dice_distribution) { + int left_num = std::distance(src_begin, src_end); + for (int i = 0; i < num_samples; i++) { + bidiiter r1 = src_begin, r2 = eid_begin; + int random_step = dice_distribution(rng) % left_num; + std::advance(r1, random_step); + std::advance(r2, random_step); + std::swap(*src_begin, *r1); + std::swap(*eid_begin, *r2); + ++src_begin; + ++eid_begin; + --left_num; + } +} + template void SampleNeighbors(const T* row, const T* col_ptr, + const T* eids, const T* input, std::vector* output, std::vector* output_count, + std::vector* output_eids, int sample_size, - int bs) { - // Allocate the memory of output - // Collect the neighbors size + int bs, + bool return_eids) { std::vector> out_src_vec; + std::vector> out_eids_vec; // `sample_cumsum_sizes` record the start position and end position // after sampling. std::vector sample_cumsum_sizes(bs + 1); @@ -65,10 +90,18 @@ void SampleNeighbors(const T* row, std::vector out_src; out_src.resize(cap); out_src_vec.emplace_back(out_src); + if (return_eids) { + std::vector out_eids; + out_eids.resize(cap); + out_eids_vec.emplace_back(out_eids); + } } output_count->resize(bs); output->resize(total_neighbors); + if (return_eids) { + output_eids->resize(total_neighbors); + } std::random_device rd; std::mt19937 rng{rd()}; @@ -85,15 +118,28 @@ void SampleNeighbors(const T* row, int cap = end - begin; if (sample_size < cap) { std::copy(row + begin, row + end, out_src_vec[i].begin()); - // TODO(daisiming): Check whether is correct. 
- SampleUniqueNeighbors(out_src_vec[i].begin(), - out_src_vec[i].end(), - sample_size, - rng, - dice_distribution); + if (return_eids) { + std::copy(eids + begin, eids + end, out_eids_vec[i].begin()); + SampleUniqueNeighborsWithEids(out_src_vec[i].begin(), + out_src_vec[i].end(), + out_eids_vec[i].begin(), + out_eids_vec[i].end(), + sample_size, + rng, + dice_distribution); + } else { + SampleUniqueNeighbors(out_src_vec[i].begin(), + out_src_vec[i].end(), + sample_size, + rng, + dice_distribution); + } *(output_count->data() + i) = sample_size; } else { std::copy(row + begin, row + end, out_src_vec[i].begin()); + if (return_eids) { + std::copy(eids + begin, eids + end, out_eids_vec[i].begin()); + } *(output_count->data() + i) = cap; } } @@ -107,6 +153,11 @@ void SampleNeighbors(const T* row, std::copy(out_src_vec[i].begin(), out_src_vec[i].begin() + k, output->data() + sample_cumsum_sizes[i]); + if (return_eids) { + std::copy(out_eids_vec[i].begin(), + out_eids_vec[i].begin() + k, + output_eids->data() + sample_cumsum_sizes[i]); + } } } @@ -131,8 +182,35 @@ void GraphSampleNeighborsKernel( std::vector output; std::vector output_count; - SampleNeighbors( - row_data, col_ptr_data, x_data, &output, &output_count, sample_size, bs); + + if (return_eids) { + const T* eids_data = eids.get_ptr()->data(); + std::vector output_eids; + SampleNeighbors(row_data, + col_ptr_data, + eids_data, + x_data, + &output, + &output_count, + &output_eids, + sample_size, + bs, + return_eids); + out_eids->Resize({static_cast(output_eids.size())}); + T* out_eids_data = dev_ctx.template Alloc(out_eids); + std::copy(output_eids.begin(), output_eids.end(), out_eids_data); + } else { + SampleNeighbors(row_data, + col_ptr_data, + nullptr, + x_data, + &output, + &output_count, + nullptr, + sample_size, + bs, + return_eids); + } out->Resize({static_cast(output.size())}); T* out_data = dev_ctx.template Alloc(out); std::copy(output.begin(), output.end(), out_data); diff --git a/paddle/phi/kernels/cpu/logspace_kernel.cc b/paddle/phi/kernels/cpu/logspace_kernel.cc new file mode 100644 index 0000000000000..fbb31057a35ae --- /dev/null +++ b/paddle/phi/kernels/cpu/logspace_kernel.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/logspace_kernel.h" + +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/data_type_transform.h" + +namespace phi { + +template +void LogspaceKernel(const Context& ctx, + const DenseTensor& start, + const DenseTensor& stop, + const DenseTensor& number, + const DenseTensor& base, + DataType dtype, + DenseTensor* out) { + int32_t num = number.data()[0]; + auto start_t = phi::funcs::TransDataType(ctx, start, dtype); + auto stop_t = phi::funcs::TransDataType(ctx, stop, dtype); + auto base_t = phi::funcs::TransDataType(ctx, base, dtype); + + T start_data = start_t.template data()[0]; + T stop_data = stop_t.template data()[0]; + T base_data = base_t.template data()[0]; + PADDLE_ENFORCE_GT( + num, + 0, + phi::errors::InvalidArgument("The num of logspace op should be larger " + "than 0, but received num is %d", + num)); + + out->Resize(phi::make_ddim({num})); + T* out_data = ctx.template Alloc(out); + + if (num > 1) { + // step should be of double type for all types + double step = (static_cast(stop_data - start_data)) / (num - 1); + int half_num = num / 2; + for (int i = 0; i < num; ++i) { + if (i < half_num) { + out_data[i] = + static_cast(std::pow(base_data, start_data + step * i)); + } else { + out_data[i] = static_cast( + std::pow(base_data, stop_data - step * (num - i - 1))); + } + } + } else { + out_data[0] = static_cast(std::pow(base_data, start_data)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(logspace, + CPU, + ALL_LAYOUT, + phi::LogspaceKernel, + float, + int32_t, + int64_t, + double) {} diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc index 77c763171088c..3bfc07319e98d 100644 --- a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc @@ -17,7 +17,7 @@ #include #include #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/compare_functors.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/elementwise_add_grad_kernel.h b/paddle/phi/kernels/elementwise_add_grad_kernel.h new file mode 100644 index 0000000000000..9b754cfefe365 --- /dev/null +++ b/paddle/phi/kernels/elementwise_add_grad_kernel.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void AddGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + +template +void AddDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* ddout); + +template +void AddTripleGradKernel(const Context& dev_ctx, + const DenseTensor& ddx, + const DenseTensor& ddy, + const DenseTensor& d_ddout, + int axis, + DenseTensor* d_ddx, + DenseTensor* d_ddy); + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_add_kernel.h b/paddle/phi/kernels/elementwise_add_kernel.h new file mode 100644 index 0000000000000..3245c450aaebe --- /dev/null +++ b/paddle/phi/kernels/elementwise_add_kernel.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/binary.h" + +namespace phi { +template +void AddRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void AddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +DenseTensor Add(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + AddKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_divide_grad_kernel.h b/paddle/phi/kernels/elementwise_divide_grad_kernel.h new file mode 100644 index 0000000000000..6d29dae99a131 --- /dev/null +++ b/paddle/phi/kernels/elementwise_divide_grad_kernel.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void DivideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + +template +void DivideDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dx, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* dy, + DenseTensor* dout, + DenseTensor* ddout); + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_divide_kernel.h b/paddle/phi/kernels/elementwise_divide_kernel.h new file mode 100644 index 0000000000000..5555b69fde1de --- /dev/null +++ b/paddle/phi/kernels/elementwise_divide_kernel.h @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/binary.h" + +namespace phi { + +template +void DivideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void DivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +DenseTensor Divide(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + DivideKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_grad_kernel.h b/paddle/phi/kernels/elementwise_grad_kernel.h index a1c806beb9a40..b1e6ecaee6746 100644 --- a/paddle/phi/kernels/elementwise_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_grad_kernel.h @@ -19,111 +19,6 @@ limitations under the License. 
*/ namespace phi { -template -void AddGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy); - -template -void AddDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, - int axis, - DenseTensor* ddout); - -template -void AddTripleGradKernel(const Context& dev_ctx, - const DenseTensor& ddx, - const DenseTensor& ddy, - const DenseTensor& d_ddout, - int axis, - DenseTensor* d_ddx, - DenseTensor* d_ddy); - -template -void SubtractGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy); - -template -void SubtractDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - paddle::optional ddx, - paddle::optional ddy, - const DenseTensor& dout, - int axis, - DenseTensor* ddout); - -template -void DivideGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy); - -template -void DivideDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dx, - paddle::optional ddx, - paddle::optional ddy, - int axis, - DenseTensor* dy, - DenseTensor* dout, - DenseTensor* ddout); - -template -void MultiplyGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy); - -template -void MultiplyDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, - int axis, - DenseTensor* dx, - DenseTensor* dy, - DenseTensor* ddout); - -template -void MultiplyTripleGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, - const DenseTensor& d_dx, - const DenseTensor& d_dy, - paddle::optional d_ddout, - int axis, - DenseTensor* d_x, - DenseTensor* d_y, - DenseTensor* d_dout, - DenseTensor* d_ddx, - DenseTensor* d_ddy); - template void ElementwiseFMaxGradKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/elementwise_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc index 8c7a8c88f4630..a8aebd952af50 100644 --- a/paddle/phi/kernels/elementwise_kernel.cc +++ b/paddle/phi/kernels/elementwise_kernel.cc @@ -19,42 +19,6 @@ namespace phi { -template -void AddKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - int axis = -1; - AddRawKernel(dev_ctx, x, y, axis, out); -} - -template -void SubtractKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - int axis = -1; - SubtractRawKernel(dev_ctx, x, y, axis, out); -} - -template -void DivideKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - int axis = -1; - DivideRawKernel(dev_ctx, x, y, axis, out); -} - -template -void MultiplyKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - int axis = -1; - MultiplyRawKernel(dev_ctx, x, y, axis, out); -} - template void MaximumKernel(const Context& dev_ctx, const DenseTensor& x, @@ -114,51 +78,6 @@ void ElementwiseHeavisideKernel(const 
Context& dev_ctx, using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; -PD_REGISTER_KERNEL(add, - CPU, - ALL_LAYOUT, - phi::AddKernel, - float, - double, - int16_t, - int, - int64_t, - complex64, - complex128) {} -PD_REGISTER_KERNEL(subtract, - CPU, - ALL_LAYOUT, - phi::SubtractKernel, - float, - double, - int16_t, - int, - int64_t, - complex64, - complex128, - phi::dtype::bfloat16) {} -PD_REGISTER_KERNEL(divide, - CPU, - ALL_LAYOUT, - phi::DivideKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PD_REGISTER_KERNEL(multiply, - CPU, - ALL_LAYOUT, - phi::MultiplyKernel, - float, - double, - int, - int64_t, - bool, - complex64, - complex128, - phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(maximum, CPU, ALL_LAYOUT, @@ -200,57 +119,6 @@ PD_REGISTER_KERNEL(elementwise_pow, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(add, - GPU, - ALL_LAYOUT, - phi::AddKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(subtract, - GPU, - ALL_LAYOUT, - phi::SubtractKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::float16, - complex64, - complex128, - phi::dtype::bfloat16) {} -PD_REGISTER_KERNEL(divide, - GPU, - ALL_LAYOUT, - phi::DivideKernel, - float, - double, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(multiply, - GPU, - ALL_LAYOUT, - phi::MultiplyKernel, - float, - double, - int, - int64_t, - bool, - phi::dtype::float16, - phi::dtype::bfloat16, - complex64, - complex128) {} PD_REGISTER_KERNEL(maximum, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/elementwise_kernel.h b/paddle/phi/kernels/elementwise_kernel.h index da888a68b137c..a39da52e7e3b5 100644 --- a/paddle/phi/kernels/elementwise_kernel.h +++ b/paddle/phi/kernels/elementwise_kernel.h @@ -33,58 +33,6 @@ void FMinKernel(const Context& dev_ctx, int axis, DenseTensor* out); -template -void AddRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void AddKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -void SubtractRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void SubtractKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -void DivideRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void DivideKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -void MultiplyRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void MultiplyKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - template void MaximumRawKernel(const Context& dev_ctx, const DenseTensor& x, @@ -163,50 +111,6 @@ void ElementwiseHeavisideKernel(const Context& dev_ctx, const DenseTensor& y, DenseTensor* out); -template -DenseTensor Add(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - AddKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -template 
-DenseTensor Subtract(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - SubtractKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -template -DenseTensor Divide(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - DivideKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -template -DenseTensor Multiply(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - MultiplyKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - template DenseTensor Maximum(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/elementwise_multiply_grad_kernel.h b/paddle/phi/kernels/elementwise_multiply_grad_kernel.h new file mode 100644 index 0000000000000..517948a50d1b1 --- /dev/null +++ b/paddle/phi/kernels/elementwise_multiply_grad_kernel.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void MultiplyGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + +template +void MultiplyDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* dx, + DenseTensor* dy, + DenseTensor* ddout); + +template +void MultiplyTripleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + const DenseTensor& d_dx, + const DenseTensor& d_dy, + paddle::optional d_ddout, + int axis, + DenseTensor* d_x, + DenseTensor* d_y, + DenseTensor* d_dout, + DenseTensor* d_ddx, + DenseTensor* d_ddy); + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_multiply_kernel.h b/paddle/phi/kernels/elementwise_multiply_kernel.h new file mode 100644 index 0000000000000..608ae95d2ba4b --- /dev/null +++ b/paddle/phi/kernels/elementwise_multiply_kernel.h @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/binary.h" + +namespace phi { + +template +void MultiplyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void MultiplyKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +DenseTensor Multiply(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + MultiplyKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_subtract_grad_kernel.h b/paddle/phi/kernels/elementwise_subtract_grad_kernel.h new file mode 100644 index 0000000000000..7be91b4b9f4cd --- /dev/null +++ b/paddle/phi/kernels/elementwise_subtract_grad_kernel.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { +template +void SubtractGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + +template +void SubtractDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + paddle::optional ddx, + paddle::optional ddy, + const DenseTensor& dout, + int axis, + DenseTensor* ddout); + +} // namespace phi diff --git a/paddle/phi/kernels/elementwise_subtract_kernel.h b/paddle/phi/kernels/elementwise_subtract_kernel.h new file mode 100644 index 0000000000000..1f6c4383df5d8 --- /dev/null +++ b/paddle/phi/kernels/elementwise_subtract_kernel.h @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
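With the monolithic declarations now removed from elementwise_kernel.h, downstream code includes only the per-op headers it actually uses. A hedged sketch of the include migration (the consuming file is illustrative; the header paths are the ones added or referenced by this patch):

    // Before this patch, one header pulled in every elementwise op:
    //   #include "paddle/phi/kernels/elementwise_kernel.h"
    // After the split, include just the ops in use, mirroring the edits to
    // cholesky_solve_grad, determinant_grad and eigh_grad further down:
    #include "paddle/phi/kernels/elementwise_add_kernel.h"
    #include "paddle/phi/kernels/elementwise_divide_kernel.h"
    #include "paddle/phi/kernels/elementwise_multiply_kernel.h"
    #include "paddle/phi/kernels/elementwise_subtract_kernel.h"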
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/binary.h" + +namespace phi { + +template +void SubtractRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void SubtractKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +DenseTensor Subtract(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + SubtractKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +} // namespace phi diff --git a/paddle/phi/kernels/funcs/aligned_vector.h b/paddle/phi/kernels/funcs/aligned_vector.h index d71a61f107a7a..14a9560b841fa 100644 --- a/paddle/phi/kernels/funcs/aligned_vector.h +++ b/paddle/phi/kernels/funcs/aligned_vector.h @@ -15,6 +15,9 @@ limitations under the License. */ #pragma once #include #include "paddle/phi/core/hostdevice.h" +#if defined(__xpu__) +#define CHAR_BIT 8 +#endif namespace phi { diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index b414dfc5d6e84..42fee14488373 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -33,10 +33,14 @@ namespace cub = hipcub; #endif +#ifndef PADDLE_WITH_XPU_KP #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" +#endif + +#include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/utils/array.h" @@ -183,7 +187,7 @@ struct IndexCalculator { strides = details::VectorToArray(full_strides); reduce_strides = details::VectorToArray(cal_strides); #ifndef PADDLE_WITH_XPU_KP - std::vector cal_divmoders; // namespace + std::vector cal_divmoders; // fast divmod for (auto i : cal_strides) { cal_divmoders.push_back(kps::details::FastDivMod(i)); @@ -325,9 +329,10 @@ struct ReduceConfig { // step4: set the block and grid for launch kernel SetBlockDim(); - +#ifndef PADDLE_WITH_XPU_KP // step5: limit the grid to prevent thead overflow paddle::platform::LimitGridDim(dev_ctx, &grid); +#endif } // when should_reduce_again is true, we need malloc temp space for temp data diff --git a/paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h b/paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h new file mode 100644 index 0000000000000..26b8549aaafdc --- /dev/null +++ b/paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/kernels/funcs/sparse/flatten_indices.h" + +namespace phi { +namespace funcs { +namespace sparse { + +template +__global__ void FlattenIndicesKernel(const IntT* indices, + const IntT* sparse_offsets, + const int64_t non_zero_num, + const int64_t sparse_dim, + IntT* out) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + phi::funcs::sparse::FlattenIndices(indices, + sparse_offsets, + non_zero_num, + sparse_dim, + tid, + gridDim.x * blockDim.x, + out); +} + +template +__global__ void IndexToCoordinateKernel(const IntT* indexs, + const Dim dims, + const int64_t non_zero_num, + const int64_t sparse_dim, + IntT* indices) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + IndexToCoordinate(indexs, + dims, + non_zero_num, + sparse_dim, + tid, + gridDim.x * blockDim.x, + indices); +} + +} // namespace sparse +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/flatten_indices.h b/paddle/phi/kernels/funcs/sparse/flatten_indices.h new file mode 100644 index 0000000000000..ca212e4366ec4 --- /dev/null +++ b/paddle/phi/kernels/funcs/sparse/flatten_indices.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/phi/core/ddim.h" + +namespace phi { +namespace funcs { +namespace sparse { + +template +inline const IntT HOSTDEVICE CoordinateToIndex(const IntT* indices, + const IntT* sparse_offsets, + const int64_t non_zero_num, + const int64_t sparse_dim, + const int i) { + IntT index = 0; + for (IntT j = 0; j < sparse_dim; j++) { + index += indices[j * non_zero_num + i] * sparse_offsets[j]; + } + return index; +} + +template +inline void HOSTDEVICE FlattenIndices(const IntT* indices, + const IntT* sparse_offsets, + const int64_t non_zero_num, + const int64_t sparse_dim, + const int64_t start, + const int64_t stride, + IntT* out) { + for (int64_t i = start; i < non_zero_num; i += stride) { + out[i] = + CoordinateToIndex(indices, sparse_offsets, non_zero_num, sparse_dim, i); + } +} + +// 1. 
indices.dims().size() == 2 +template +inline void CalcOffsetsPerDim(const DDim& dims, + const int64_t sparse_dim, + IntT* offsets) { + IntT offset = 1; + for (IntT i = sparse_dim - 1; i >= 0; i--) { + offsets[i] = offset; + offset *= dims[i]; + } +} + +template +inline void HOSTDEVICE IndexToCoordinate(const IntT index, + const Dim& dims, + const int64_t non_zero_num, + const int64_t sparse_dim, + const int indices_offset, + IntT* indices) { + IntT tmp_index = index; + for (int j = sparse_dim - 1; j >= 0; j--) { + indices[j * non_zero_num + indices_offset] = tmp_index % dims[j]; + tmp_index /= dims[j]; + } +} + +template +inline void HOSTDEVICE IndexToCoordinate(const IntT* indexs, + const Dim& dims, + const int64_t non_zero_num, + const int64_t sparse_dim, + const int64_t start, + const int64_t stride, + IntT* indices) { + for (int64_t i = start; i < non_zero_num; i += stride) { + IntT tmp_index = indexs[i]; + IndexToCoordinate(tmp_index, dims, non_zero_num, sparse_dim, i, indices); + } +} + +} // namespace sparse +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h new file mode 100644 index 0000000000000..9ed7cef12a148 --- /dev/null +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace phi { +namespace funcs { +namespace sparse { + +/** + * brief: scatter add + * input: the inputs + * unique_value: refer to UpdateIndexKernel notes + * out_index: the output feature index + * non_zero_num: the number of output features + * rulebook_len: the length of rulebook + * channels: the output channel size + * out: the outputs +**/ +template +__global__ void ScatterKernel(const T* input, + const int* unique_value, + const int* out_index, + const int non_zero_num, + const int rulebook_len, + const int channels, + T* out, + const bool subm = false) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) { + int indices_i = i / channels; + int channels_i = i - indices_i * channels; + + int start = unique_value[indices_i]; + int end = indices_i == non_zero_num - 1 ? rulebook_len + : unique_value[indices_i + 1]; + // max(end-start) = kernel_size + T sum = static_cast(0); + if (subm) { + sum = out[indices_i * channels + channels_i]; + } + for (int j = start; j < end; j++) { + const int out_feature_i = out_index[j]; + sum += input[out_feature_i * channels + channels_i]; + } + out[indices_i * channels + channels_i] = sum; + } +} + +} // namespace sparse +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/utils.cu.h b/paddle/phi/kernels/funcs/sparse/utils.cu.h new file mode 100644 index 0000000000000..074fe1ca42049 --- /dev/null +++ b/paddle/phi/kernels/funcs/sparse/utils.cu.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace phi { +namespace funcs { +namespace sparse { + +// brief: calculation the distance between start and end +template +__global__ void DistanceKernel(const T* start, const T* end, T* distance) { + if (threadIdx.x == 0) { + *distance = end - start; + } +} + +} // namespace sparse +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu index 6feee512cc9f4..385ddb5e521a2 100644 --- a/paddle/phi/kernels/gpu/arg_min_max_kernel.cu +++ b/paddle/phi/kernels/gpu/arg_min_max_kernel.cu @@ -259,6 +259,7 @@ PD_REGISTER_KERNEL(arg_min, GPU, ALL_LAYOUT, phi::ArgMinKernel, + phi::dtype::float16, float, double, int32_t, @@ -270,6 +271,7 @@ PD_REGISTER_KERNEL(arg_max, GPU, ALL_LAYOUT, phi::ArgMaxKernel, + phi::dtype::float16, float, double, int32_t, diff --git a/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu new file mode 100644 index 0000000000000..8dd4d0184c267 --- /dev/null +++ b/paddle/phi/kernels/gpu/elementwise_add_grad_kernel.cu @@ -0,0 +1,118 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
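Backing up to funcs/sparse/flatten_indices.h introduced above: a self-contained, host-side sketch of the round trip those helpers perform (CalcOffsetsPerDim-style offsets, CoordinateToIndex, then IndexToCoordinate). The shape and values are illustrative only, not taken from the patch:

    #include <cassert>
    #include <cstdint>

    int main() {
      // A 3x4 sparse tensor with two non-zeros, indices stored dim-major as in
      // the COO DenseTensor layout: row 0 = dim-0 coords, row 1 = dim-1 coords.
      const int64_t sparse_dim = 2, non_zero_num = 2;
      const int64_t dims[2] = {3, 4};
      const int64_t indices[4] = {/*dim0*/ 1, 2, /*dim1*/ 0, 3};

      // CalcOffsetsPerDim: offsets[i] is the product of the trailing dims.
      int64_t offsets[2];
      int64_t offset = 1;
      for (int64_t i = sparse_dim - 1; i >= 0; --i) {
        offsets[i] = offset;
        offset *= dims[i];
      }  // offsets == {4, 1}

      // CoordinateToIndex for the second non-zero (i == 1): (2, 3) -> 11.
      int64_t index = 0;
      for (int64_t j = 0; j < sparse_dim; ++j) {
        index += indices[j * non_zero_num + 1] * offsets[j];
      }
      assert(index == 11);

      // IndexToCoordinate reverses the mapping: 11 -> (2, 3).
      int64_t recovered[2];
      int64_t tmp = index;
      for (int64_t j = sparse_dim - 1; j >= 0; --j) {
        recovered[j] = tmp % dims[j];
        tmp /= dims[j];
      }
      assert(recovered[0] == 2 && recovered[1] == 3);
      return 0;
    }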
+ +#include "paddle/phi/kernels/elementwise_add_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void AddGradFunc(const GPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { + ElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy); + } else { + DefaultElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy, axis); + } +} + +template +void AddGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + phi::AddGradImpl(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc); +} + +template +void AddDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + const DenseTensor& dout, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* ddout) { + phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); +} + +template +void AddTripleGradKernel(const Context& dev_ctx, + const DenseTensor& ddx, + const DenseTensor& ddy, + const DenseTensor& d_ddout, + int axis, + DenseTensor* d_ddx, + DenseTensor* d_ddy) { + phi::AddGradImpl( + dev_ctx, ddx, ddy, d_ddout, axis, d_ddx, d_ddy, AddGradFunc); +} + +} // namespace phi + +PD_REGISTER_KERNEL(add_grad, + GPU, + ALL_LAYOUT, + phi::AddGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(add_double_grad, + GPU, + ALL_LAYOUT, + phi::AddDoubleGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(add_triple_grad, + GPU, + ALL_LAYOUT, + phi::AddTripleGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/elementwise_divide_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_divide_grad_kernel.cu new file mode 100644 index 0000000000000..57bf6da4060d3 --- /dev/null +++ b/paddle/phi/kernels/gpu/elementwise_divide_grad_kernel.cu @@ -0,0 +1,86 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/elementwise_divide_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void DivideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + const auto place = dev_ctx.GetPlace(); + if (dx != nullptr && dy != nullptr) { + std::vector ins = {&dout, &out, &y}; + GetGradXAndYOut( + dev_ctx, + place, + axis, + ins, + dout, + dx, + dy, + funcs::DivGradXYFunctor()); + } else if (dx != nullptr && dy == nullptr) { + std::vector ins = {&dout, &y}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dx, funcs::DivGradXFunctor()); + } else if (dy != nullptr && dx == nullptr) { + std::vector ins = {&dout, &out, &y}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dy, funcs::DivGradYFunctor()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(divide_grad, + GPU, + ALL_LAYOUT, + phi::DivideGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(divide_double_grad, + GPU, + ALL_LAYOUT, + phi::DivideDoubleGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index a2b9f868578b8..3e7430fd84eaf 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -26,128 +26,6 @@ namespace phi { -template -void AddGradFunc(const GPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int axis = -1) { - if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - ElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy); - } else { - DefaultElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy, axis); - } -} - -template -void AddGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - phi::AddGradImpl(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc); -} - -template -void AddDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - const DenseTensor& dout, - paddle::optional ddx, - paddle::optional ddy, - int axis, - DenseTensor* ddout) { - phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); -} - -template -void AddTripleGradKernel(const Context& dev_ctx, - const DenseTensor& ddx, - const DenseTensor& ddy, - const DenseTensor& d_ddout, - int axis, - DenseTensor* d_ddx, - DenseTensor* d_ddy) { - phi::AddGradImpl( - dev_ctx, ddx, ddy, d_ddout, axis, d_ddx, d_ddy, AddGradFunc); -} - -template -void SubtractGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - // skip out - auto* out = &dout; - if (dx != nullptr && dy != nullptr && (dx->dims() == 
dy->dims())) { - elementwise_sub_grad(dev_ctx, x, y, *out, dout, dx, dy); - } else { - default_elementwise_sub_grad(dev_ctx, x, y, *out, dout, dx, dy, axis); - } -} - -template -void SubtractDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& y, - paddle::optional ddx, - paddle::optional ddy, - const DenseTensor& dout, - int axis, - DenseTensor* ddout) { - phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); -} - -template -void DivideGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - const auto place = dev_ctx.GetPlace(); - if (dx != nullptr && dy != nullptr) { - std::vector ins = {&dout, &out, &y}; - GetGradXAndYOut( - dev_ctx, - place, - axis, - ins, - dout, - dx, - dy, - funcs::DivGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - std::vector ins = {&dout, &y}; - GetGradXOrYOut( - dev_ctx, place, axis, ins, dout, dx, funcs::DivGradXFunctor()); - } else if (dy != nullptr && dx == nullptr) { - std::vector ins = {&dout, &out, &y}; - GetGradXOrYOut( - dev_ctx, place, axis, ins, dout, dy, funcs::DivGradYFunctor()); - } -} - -template -void MultiplyGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy) { - funcs::ElementwiseGradPreProcess(dout, dx); - ElementwiseMulGrad(dev_ctx, x, y, dout, dx, dy, axis); -} - template void MaximumGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -211,138 +89,6 @@ void MinimumGradKernel(const Context& dev_ctx, } } // namespace phi -PD_REGISTER_KERNEL(add_grad, - GPU, - ALL_LAYOUT, - phi::AddGradKernel, - float, - double, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(add_double_grad, - GPU, - ALL_LAYOUT, - phi::AddDoubleGradKernel, - float, - double, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(add_triple_grad, - GPU, - ALL_LAYOUT, - phi::AddTripleGradKernel, - float, - double, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(subtract_grad, - GPU, - ALL_LAYOUT, - phi::SubtractGradKernel, - float, - double, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(subtract_double_grad, - GPU, - ALL_LAYOUT, - phi::SubtractDoubleGradKernel, - float, - double, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(divide_grad, - GPU, - ALL_LAYOUT, - phi::DivideGradKernel, - float, - phi::dtype::float16, - phi::dtype::bfloat16, - double, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(divide_double_grad, - GPU, - ALL_LAYOUT, - phi::DivideDoubleGradKernel, - float, - phi::dtype::float16, - phi::dtype::bfloat16, - double, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(multiply_grad, - GPU, - ALL_LAYOUT, - phi::MultiplyGradKernel, - float, - phi::dtype::float16, - double, - int, - int64_t, - bool, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(multiply_double_grad, - GPU, - ALL_LAYOUT, - phi::MultiplyDoubleGradKernel, - float, - 
phi::dtype::float16, - double, - int, - int64_t, - bool, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL(multiply_triple_grad, - GPU, - ALL_LAYOUT, - phi::MultiplyTripleGradKernel, - float, - phi::dtype::float16, - double, - int, - int64_t, - bool, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} PD_REGISTER_KERNEL(fmax_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/elementwise_multiply_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_multiply_grad_kernel.cu new file mode 100644 index 0000000000000..3442d7f028539 --- /dev/null +++ b/paddle/phi/kernels/gpu/elementwise_multiply_grad_kernel.cu @@ -0,0 +1,82 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/elementwise_multiply_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void MultiplyGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + funcs::ElementwiseGradPreProcess(dout, dx); + ElementwiseMulGrad(dev_ctx, x, y, dout, dx, dy, axis); +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiply_grad, + GPU, + ALL_LAYOUT, + phi::MultiplyGradKernel, + float, + phi::dtype::float16, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(multiply_double_grad, + GPU, + ALL_LAYOUT, + phi::MultiplyDoubleGradKernel, + float, + phi::dtype::float16, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(multiply_triple_grad, + GPU, + ALL_LAYOUT, + phi::MultiplyTripleGradKernel, + float, + phi::dtype::float16, + double, + int, + int64_t, + bool, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu new file mode 100644 index 0000000000000..20f3b73e4094f --- /dev/null +++ b/paddle/phi/kernels/gpu/elementwise_subtract_grad_kernel.cu @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/elementwise_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" +#include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" + +namespace phi { + +template +void SubtractGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + // skip out + auto* out = &dout; + if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { + elementwise_sub_grad(dev_ctx, x, y, *out, dout, dx, dy); + } else { + default_elementwise_sub_grad(dev_ctx, x, y, *out, dout, dx, dy, axis); + } +} + +template +void SubtractDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + paddle::optional ddx, + paddle::optional ddy, + const DenseTensor& dout, + int axis, + DenseTensor* ddout) { + phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); +} + +} // namespace phi + +PD_REGISTER_KERNEL(subtract_grad, + GPU, + ALL_LAYOUT, + phi::SubtractGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(subtract_double_grad, + GPU, + ALL_LAYOUT, + phi::SubtractDoubleGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu index 1757b6b98dbf9..af616963b499a 100644 --- a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu @@ -62,9 +62,11 @@ __global__ void SampleKernel(const uint64_t rand_seed, const T* nodes, const T* row, const T* col_ptr, + const T* eids, T* output, + T* output_eids, int* output_ptr, - int* output_idxs) { + bool return_eids) { assert(blockDim.x == WARP_SIZE); assert(blockDim.y == BLOCK_WARPS); @@ -94,10 +96,13 @@ __global__ void SampleKernel(const uint64_t rand_seed, if (deg <= k) { for (int idx = threadIdx.x; idx < deg; idx += WARP_SIZE) { output[out_row_start + idx] = row[in_row_start + idx]; + if (return_eids) { + output_eids[out_row_start + idx] = eids[in_row_start + idx]; + } } } else { for (int idx = threadIdx.x; idx < k; idx += WARP_SIZE) { - output_idxs[out_row_start + idx] = idx; + output[out_row_start + idx] = idx; } #ifdef PADDLE_WITH_CUDA __syncwarp(); @@ -111,7 +116,7 @@ __global__ void SampleKernel(const uint64_t rand_seed, #endif if (num < k) { atomicMax(reinterpret_cast( // NOLINT - output_idxs + out_row_start + num), + output + out_row_start + num), static_cast(idx)); // NOLINT } } @@ -120,8 +125,11 @@ __global__ void SampleKernel(const uint64_t rand_seed, #endif for (int idx = 
threadIdx.x; idx < k; idx += WARP_SIZE) { - T perm_idx = output_idxs[out_row_start + idx] + in_row_start; + T perm_idx = output[out_row_start + idx] + in_row_start; output[out_row_start + idx] = row[perm_idx]; + if (return_eids) { + output_eids[out_row_start + idx] = eids[perm_idx]; + } } } @@ -148,16 +156,17 @@ template void SampleNeighbors(const Context& dev_ctx, const T* row, const T* col_ptr, + const T* eids, const thrust::device_ptr input, thrust::device_ptr output, thrust::device_ptr output_count, + thrust::device_ptr output_eids, int sample_size, int bs, - int total_sample_num) { + int total_sample_num, + bool return_eids) { thrust::device_vector output_ptr; - thrust::device_vector output_idxs; output_ptr.resize(bs); - output_idxs.resize(total_sample_num); thrust::exclusive_scan( output_count, output_count + bs, output_ptr.begin(), 0); @@ -176,18 +185,26 @@ void SampleNeighbors(const Context& dev_ctx, thrust::raw_pointer_cast(input), row, col_ptr, + eids, thrust::raw_pointer_cast(output), + thrust::raw_pointer_cast(output_eids), thrust::raw_pointer_cast(output_ptr.data()), - thrust::raw_pointer_cast(output_idxs.data())); + return_eids); } -template +template __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, int k, const int64_t num_rows, const T* in_rows, T* src, const T* dst_count) { + assert(blockDim.x == WARP_SIZE); + assert(blockDim.y == BLOCK_WARPS); + + int64_t out_row = blockIdx.x * TILE_SIZE + threadIdx.y; + const int64_t last_row = + min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); #ifdef PADDLE_WITH_HIP hiprandState rng; hiprand_init( @@ -197,20 +214,19 @@ __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, curand_init( rand_seed * gridDim.x + blockIdx.x, threadIdx.y + threadIdx.x, 0, &rng); #endif - CUDA_KERNEL_LOOP(out_row, num_rows) { + + while (out_row < last_row) { const T row = in_rows[out_row]; const T in_row_start = dst_count[row]; const int deg = dst_count[row + 1] - in_row_start; int split; - T tmp; - if (k < deg) { if (deg < 2 * k) { split = k; } else { split = deg - k; } - for (int idx = deg - 1; idx >= split; idx--) { + for (int idx = split + threadIdx.x; idx <= deg - 1; idx += WARP_SIZE) { #ifdef PADDLE_WITH_HIP const int num = hiprand(&rng) % (idx + 1); #else @@ -222,7 +238,11 @@ __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, static_cast( // NOLINT src[in_row_start + idx]))); } +#ifdef PADDLE_WITH_CUDA + __syncwarp(); +#endif } + out_row += BLOCK_WARPS; } } @@ -232,9 +252,12 @@ __global__ void GatherEdge(int k, const T* in_rows, const T* src, const T* dst_count, + const T* eids, T* outputs, + T* output_eids, int* output_ptr, - T* perm_data) { + T* perm_data, + bool return_eids) { assert(blockDim.x == WARP_SIZE); assert(blockDim.y == BLOCK_WARPS); @@ -250,8 +273,10 @@ __global__ void GatherEdge(int k, if (deg <= k) { for (int idx = threadIdx.x; idx < deg; idx += WARP_SIZE) { - const T in_idx = in_row_start + idx; - outputs[out_row_start + idx] = src[in_idx]; + outputs[out_row_start + idx] = src[in_row_start + idx]; + if (return_eids) { + output_eids[out_row_start + idx] = eids[in_row_start + idx]; + } } } else { int split = k; @@ -267,6 +292,10 @@ __global__ void GatherEdge(int k, for (int idx = begin + threadIdx.x; idx < end; idx += WARP_SIZE) { outputs[out_row_start + idx - begin] = src[perm_data[in_row_start + idx]]; + if (return_eids) { + output_eids[out_row_start + idx - begin] = + eids[perm_data[in_row_start + idx]]; + } } } out_row += BLOCK_WARPS; @@ -277,49 +306,48 @@ template void 
FisherYatesSampleNeighbors(const Context& dev_ctx, const T* row, const T* col_ptr, + const T* eids, T* perm_data, const thrust::device_ptr input, thrust::device_ptr output, thrust::device_ptr output_count, + thrust::device_ptr output_eids, int sample_size, int bs, - int total_sample_num) { + int total_sample_num, + bool return_eids) { thrust::device_vector output_ptr; output_ptr.resize(bs); thrust::exclusive_scan( output_count, output_count + bs, output_ptr.begin(), 0); -#ifdef PADDLE_WITH_HIP - int block = 256; -#else - int block = 1024; -#endif - int max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; - int grid_tmp = (bs + block - 1) / block; - int grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; + constexpr int WARP_SIZE = 32; + constexpr int BLOCK_WARPS = 128 / WARP_SIZE; + constexpr int TILE_SIZE = BLOCK_WARPS * 16; + const dim3 block(WARP_SIZE, BLOCK_WARPS); + const dim3 grid((bs + TILE_SIZE - 1) / TILE_SIZE); - FisherYatesSampleKernel<<>>( + FisherYatesSampleKernel<<>>( 0, sample_size, bs, thrust::raw_pointer_cast(input), perm_data, col_ptr); - constexpr int GATHER_WARP_SIZE = 32; - constexpr int GATHER_BLOCK_WARPS = 128 / GATHER_WARP_SIZE; - constexpr int GATHER_TILE_SIZE = GATHER_BLOCK_WARPS * 16; - const dim3 gather_block(GATHER_WARP_SIZE, GATHER_BLOCK_WARPS); - const dim3 gather_grid((bs + GATHER_TILE_SIZE - 1) / GATHER_TILE_SIZE); - - GatherEdge< - T, - GATHER_WARP_SIZE, - GATHER_BLOCK_WARPS, - GATHER_TILE_SIZE><<>>( + GatherEdge<<>>( sample_size, bs, thrust::raw_pointer_cast(input), row, col_ptr, + eids, thrust::raw_pointer_cast(output), + thrust::raw_pointer_cast(output_eids), thrust::raw_pointer_cast(output_ptr.data()), - perm_data); + perm_data, + return_eids); } template @@ -354,32 +382,78 @@ void GraphSampleNeighborsKernel( T* out_data = dev_ctx.template Alloc(out); thrust::device_ptr output(out_data); - if (!flag_perm_buffer) { - SampleNeighbors(dev_ctx, - row_data, - col_ptr_data, - input, - output, - output_count, - sample_size, - bs, - total_sample_size); + if (return_eids) { + auto* eids_data = eids.get_ptr()->data(); + out_eids->Resize({static_cast(total_sample_size)}); + T* out_eids_data = dev_ctx.template Alloc(out_eids); + thrust::device_ptr output_eids(out_eids_data); + if (!flag_perm_buffer) { + SampleNeighbors(dev_ctx, + row_data, + col_ptr_data, + eids_data, + input, + output, + output_count, + output_eids, + sample_size, + bs, + total_sample_size, + return_eids); + } else { + DenseTensor perm_buffer_out(perm_buffer->type()); + const auto* p_perm_buffer = perm_buffer.get_ptr(); + perm_buffer_out.ShareDataWith(*p_perm_buffer); + T* perm_buffer_out_data = perm_buffer_out.template data(); + FisherYatesSampleNeighbors(dev_ctx, + row_data, + col_ptr_data, + eids_data, + perm_buffer_out_data, + input, + output, + output_count, + output_eids, + sample_size, + bs, + total_sample_size, + return_eids); + } } else { - DenseTensor perm_buffer_out(perm_buffer->type()); - const auto* p_perm_buffer = perm_buffer.get_ptr(); - perm_buffer_out.ShareDataWith(*p_perm_buffer); - T* perm_buffer_out_data = - perm_buffer_out.mutable_data(dev_ctx.GetPlace()); - FisherYatesSampleNeighbors(dev_ctx, - row_data, - col_ptr_data, - perm_buffer_out_data, - input, - output, - output_count, - sample_size, - bs, - total_sample_size); + // How to set null value for output_eids(thrust::device_ptr)? + // We use `output` to fill the position of unused output_eids. 
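  // A further note, not part of the original patch: reusing `output` as a
  // stand-in for output_eids on this branch is safe because every write to
  // output_eids inside SampleKernel and GatherEdge is guarded by
  // `if (return_eids)`, and return_eids is false here, so the dummy pointer
  // is never written through.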
+ if (!flag_perm_buffer) { + SampleNeighbors(dev_ctx, + row_data, + col_ptr_data, + nullptr, + input, + output, + output_count, + output, + sample_size, + bs, + total_sample_size, + return_eids); + } else { + DenseTensor perm_buffer_out(perm_buffer->type()); + const auto* p_perm_buffer = perm_buffer.get_ptr(); + perm_buffer_out.ShareDataWith(*p_perm_buffer); + T* perm_buffer_out_data = perm_buffer_out.template data(); + FisherYatesSampleNeighbors(dev_ctx, + row_data, + col_ptr_data, + nullptr, + perm_buffer_out_data, + input, + output, + output_count, + output, + sample_size, + bs, + total_sample_size, + return_eids); + } } } diff --git a/paddle/phi/kernels/gpu/logspace_kernel.cu b/paddle/phi/kernels/gpu/logspace_kernel.cu new file mode 100644 index 0000000000000..f47b7d35cdcda --- /dev/null +++ b/paddle/phi/kernels/gpu/logspace_kernel.cu @@ -0,0 +1,107 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/logspace_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/data_type_transform.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +__global__ void LogspaceKernelInner( + T start, T stop, double step, T base, int64_t size, T* out) { + int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + + for (; index < size; index += blockDim.x * gridDim.x) { + if (index < size / 2) { + out[index] = + static_cast(pow(static_cast(base), + static_cast(start + step * index))); + } else { + out[index] = static_cast( + pow(static_cast(base), + static_cast(stop - step * (size - index - 1)))); + } + } +} + +template +__global__ void LogspaceSpecialKernel(T start, T base, T* out) { + out[0] = static_cast( + pow(static_cast(base), static_cast(start))); +} + +template +void LogspaceKernel(const Context& ctx, + const DenseTensor& start, + const DenseTensor& stop, + const DenseTensor& number, + const DenseTensor& base, + DataType dtype, + DenseTensor* out) { + auto start_t = phi::funcs::TransDataType(ctx, start, dtype); + auto stop_t = phi::funcs::TransDataType(ctx, stop, dtype); + auto base_t = phi::funcs::TransDataType(ctx, base, dtype); + + DenseTensor n_start; + DenseTensor n_stop; + DenseTensor n_num; + DenseTensor n_base; + phi::Copy(ctx, start_t, phi::CPUPlace(), false, &n_start); + T start_data = n_start.data()[0]; + phi::Copy(ctx, stop_t, phi::CPUPlace(), false, &n_stop); + T stop_data = n_stop.data()[0]; + phi::Copy(ctx, number, phi::CPUPlace(), false, &n_num); + int64_t num = static_cast(n_num.data()[0]); + phi::Copy(ctx, base_t, phi::CPUPlace(), false, &n_base); + T base_data = n_base.data()[0]; + + PADDLE_ENFORCE_GT( + num, + 0, + phi::errors::InvalidArgument("The num of logspace op should be larger " + "than 0, but received num is %d", + num)); + + out->Resize(phi::make_ddim({num})); + T* out_data = ctx.template Alloc(out); + 
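  // The launch below evaluates base^(start + step * i) for the first half of
  // the buffer and base^(stop - step * (size - 1 - i)) for the second half,
  // so both ends are computed from their exact endpoints. Worked example with
  // illustrative values (not from the patch): start = 0, stop = 4, num = 5,
  // base = 2 gives step = (4 - 0) / (5 - 1) = 1 and out = [1, 2, 4, 8, 16].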
+ double step = 0; + auto stream = ctx.stream(); + int block = 512; + int grid = (num + block - 1) / block; + if (num != 1) { + step = (static_cast(stop_data - start_data)) / (num - 1); + LogspaceKernelInner<<>>( + start_data, stop_data, step, base_data, num, out_data); + } else { + LogspaceSpecialKernel<<>>( + start_data, base_data, out_data); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(logspace, + GPU, + ALL_LAYOUT, + phi::LogspaceKernel, + float, + int32_t, + int64_t, + double) {} diff --git a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu index be6cdc7825575..84768866cc9e7 100644 --- a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu +++ b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu @@ -23,7 +23,7 @@ #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/abs_kernel.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/compare_functors.h" diff --git a/paddle/phi/kernels/gpu/reduce.h b/paddle/phi/kernels/gpu/reduce.h index 6fb81edd6bf47..7f6ecef80879f 100644 --- a/paddle/phi/kernels/gpu/reduce.h +++ b/paddle/phi/kernels/gpu/reduce.h @@ -41,7 +41,7 @@ void Reduce(const KPDevice& dev_ctx, for (auto i : reduce_dims) { reduce_num *= (x.dims())[i]; } - +#ifndef PADDLE_WITH_XPU_KP if (out_dtype != phi::DataType::UNDEFINED && out_dtype != x.dtype()) { auto tmp_tensor = phi::Cast(dev_ctx, x, out_dtype); PD_VISIT_BOOL_AND_FLOATING_AND_COMPLEX_AND_3_TYPES( @@ -73,6 +73,16 @@ void Reduce(const KPDevice& dev_ctx, reduce_dims, is_mean); } +#else + using MPType = typename kps::details::MPTypeTrait::Type; + phi::funcs::ReduceKernel>( + dev_ctx, + x, + out, + TransformOp(reduce_num), + reduce_dims, + is_mean); +#endif } } // namespace phi diff --git a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h index 8c37091ef1b54..371644e6434a4 100644 --- a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h @@ -19,7 +19,7 @@ #include "paddle/phi/kernels/cholesky_solve_kernel.h" #include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_add_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/expand_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h index e4356e9af3937..ab1c33d50a456 100644 --- a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h @@ -17,7 +17,7 @@ #include "paddle/phi/kernels/determinant_grad_kernel.h" #include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/for_range.h" diff --git a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h index 5e06435b28e27..f39786fff2665 100644 --- 
a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h @@ -16,7 +16,9 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/complex_kernel.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_divide_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" +#include "paddle/phi/kernels/elementwise_subtract_kernel.h" #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/impl/elementwise_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_kernel_impl.h index d5c2c559b2c06..b126ca9b84227 100644 --- a/paddle/phi/kernels/impl/elementwise_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_kernel_impl.h @@ -22,6 +22,48 @@ #endif namespace phi { + +#define DEFINE_CPU_ELEMENTWISE_OP(name) \ + template \ + void name##RawKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + dev_ctx.template Alloc(out); \ + if (x.dims() == y.dims()) { \ + SameDimsElementwiseCompute>()( \ + dev_ctx, x, y, out); \ + } else { \ + auto x_dims = x.dims(); \ + auto y_dims = y.dims(); \ + if (x_dims.size() >= y_dims.size()) { \ + funcs::ElementwiseCompute, T>( \ + dev_ctx, x, y, axis, funcs::name##Functor(), out); \ + } else { \ + funcs::ElementwiseCompute, T>( \ + dev_ctx, x, y, axis, funcs::Inverse##name##Functor(), out); \ + } \ + } \ + } + +#define DEFINE_CUDA_ELEMENTWISE_OP(name) \ + template \ + void name##RawKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + std::vector inputs; \ + std::vector outputs; \ + inputs.emplace_back(&x); \ + inputs.emplace_back(&y); \ + outputs.emplace_back(out); \ + dev_ctx.template Alloc(out); \ + funcs::BroadcastKernel( \ + dev_ctx, inputs, &outputs, axis, funcs::name##Functor()); \ + } + template void FMaxKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/kps/elementwise_add_kernel.cu b/paddle/phi/kernels/kps/elementwise_add_kernel.cu new file mode 100644 index 0000000000000..b5532c614314f --- /dev/null +++ b/paddle/phi/kernels/kps/elementwise_add_kernel.cu @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
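For readers scanning the new kps files, a hedged sketch of what DEFINE_CUDA_ELEMENTWISE_OP(Add) from elementwise_kernel_impl.h expands to. The template argument lists below are reconstructed and may not match the source verbatim:

    template <typename T, typename Context>
    void AddRawKernel(const Context& dev_ctx,
                      const DenseTensor& x,
                      const DenseTensor& y,
                      int axis,
                      DenseTensor* out) {
      std::vector<const DenseTensor*> inputs;
      std::vector<DenseTensor*> outputs;
      inputs.emplace_back(&x);
      inputs.emplace_back(&y);
      outputs.emplace_back(out);
      dev_ctx.template Alloc<T>(out);
      funcs::BroadcastKernel<ElementwiseType::kBinary, T, T>(
          dev_ctx, inputs, &outputs, axis, funcs::AddFunctor<T>());
    }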
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#ifndef PADDLE_WITH_XPU_KP +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#endif +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +DEFINE_CUDA_ELEMENTWISE_OP(Add) + +template +void AddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + AddRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(add_raw, KPS, ALL_LAYOUT, phi::AddRawKernel, float) {} +#else + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(add_raw, + KPS, + ALL_LAYOUT, + phi::AddRawKernel, + float, + double, + int16_t, + int, + int64_t, + float16, + bfloat16, + complex64, + complex128) {} +PD_REGISTER_KERNEL(add, + KPS, + ALL_LAYOUT, + phi::AddKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + complex64, + complex128) {} +#endif diff --git a/paddle/phi/kernels/kps/elementwise_divide_kernel.cu b/paddle/phi/kernels/kps/elementwise_divide_kernel.cu new file mode 100644 index 0000000000000..852babe29dbf7 --- /dev/null +++ b/paddle/phi/kernels/kps/elementwise_divide_kernel.cu @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#ifndef PADDLE_WITH_XPU_KP +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#endif +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +// Create the definition of Divide +DEFINE_CUDA_ELEMENTWISE_OP(Divide) + +template +void DivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + DivideRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(divide_raw, KPS, ALL_LAYOUT, phi::DivideRawKernel, float) {} +#else + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(divide_raw, + KPS, + ALL_LAYOUT, + phi::DivideRawKernel, + float, + double, + int, + int64_t, + float16, + bfloat16, + complex64, + complex128) {} + +PD_REGISTER_KERNEL(divide, + KPS, + ALL_LAYOUT, + phi::DivideKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + complex64, + complex128) {} +#endif diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu index b282cb6bf70ed..b2c16fa4356d1 100644 --- a/paddle/phi/kernels/kps/elementwise_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -22,34 +22,6 @@ namespace phi { -#define DEFINE_CUDA_ELEMENTWISE_OP(name) \ - template \ - void name##RawKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - std::vector inputs; \ - std::vector outputs; \ - inputs.emplace_back(&x); \ - inputs.emplace_back(&y); \ - outputs.emplace_back(out); \ - dev_ctx.template Alloc(out); \ - funcs::BroadcastKernel( \ - dev_ctx, inputs, &outputs, axis, funcs::name##Functor()); \ - } - -/** - * Kernels - */ -// Create the definition of Add -DEFINE_CUDA_ELEMENTWISE_OP(Add) -// Create the definition of Subtract -DEFINE_CUDA_ELEMENTWISE_OP(Subtract) -// Create the definition of Multiply -DEFINE_CUDA_ELEMENTWISE_OP(Multiply) -// Create the definition of Divide -DEFINE_CUDA_ELEMENTWISE_OP(Divide) // Create the definition of Maximum DEFINE_CUDA_ELEMENTWISE_OP(Maximum) // Create the definition of Minimum @@ -66,12 +38,6 @@ DEFINE_CUDA_ELEMENTWISE_OP(ElementwisePow) } // namespace phi #ifdef PADDLE_WITH_XPU_KP -PD_REGISTER_KERNEL(add_raw, KPS, ALL_LAYOUT, phi::AddRawKernel, float) {} -PD_REGISTER_KERNEL( - subtract_raw, KPS, ALL_LAYOUT, phi::SubtractRawKernel, float) {} -PD_REGISTER_KERNEL(divide_raw, KPS, ALL_LAYOUT, phi::DivideRawKernel, float) {} -PD_REGISTER_KERNEL( - multiply_raw, KPS, ALL_LAYOUT, phi::MultiplyRawKernel, float) {} PD_REGISTER_KERNEL(maximum_raw, KPS, ALL_LAYOUT, phi::MaximumRawKernel, float) { } PD_REGISTER_KERNEL(minimum_raw, KPS, ALL_LAYOUT, phi::MinimumRawKernel, float) { @@ -91,57 +57,6 @@ PD_REGISTER_KERNEL( PD_REGISTER_KERNEL( fmin, KPS, ALL_LAYOUT, phi::FMinKernel, float, double, int, int64_t) {} -PD_REGISTER_KERNEL(add_raw, - KPS, - ALL_LAYOUT, - phi::AddRawKernel, - float, - double, - int16_t, - int, - int64_t, - float16, - bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(subtract_raw, - KPS, - ALL_LAYOUT, - phi::SubtractRawKernel, - float, - double, - int16_t, - int, - int64_t, - float16, - bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(divide_raw, - KPS, - ALL_LAYOUT, - 
phi::DivideRawKernel, - float, - double, - int, - int64_t, - float16, - bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(multiply_raw, - KPS, - ALL_LAYOUT, - phi::MultiplyRawKernel, - float, - double, - int, - int64_t, - bool, - float16, - complex64, - complex128, - bfloat16) {} PD_REGISTER_KERNEL(maximum_raw, KPS, ALL_LAYOUT, diff --git a/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu b/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu new file mode 100644 index 0000000000000..8bede0198c2fa --- /dev/null +++ b/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#ifndef PADDLE_WITH_XPU_KP +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#endif +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +// Create the definition of Multiply +DEFINE_CUDA_ELEMENTWISE_OP(Multiply) + +template +void MultiplyKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + MultiplyRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL( + multiply_raw, KPS, ALL_LAYOUT, phi::MultiplyRawKernel, float) {} +#else + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(multiply_raw, + KPS, + ALL_LAYOUT, + phi::MultiplyRawKernel, + float, + double, + int, + int64_t, + bool, + float16, + complex64, + complex128, + bfloat16) {} +PD_REGISTER_KERNEL(multiply, + KPS, + ALL_LAYOUT, + phi::MultiplyKernel, + float, + double, + int, + int64_t, + bool, + phi::dtype::float16, + phi::dtype::bfloat16, + complex64, + complex128) {} +#endif diff --git a/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu b/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu new file mode 100644 index 0000000000000..757dedb99c931 --- /dev/null +++ b/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#ifndef PADDLE_WITH_XPU_KP +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" +#endif +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +// Create the definition of Subtract +DEFINE_CUDA_ELEMENTWISE_OP(Subtract) + +template +void SubtractKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + SubtractRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL( + subtract_raw, KPS, ALL_LAYOUT, phi::SubtractRawKernel, float) {} +#else + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(subtract_raw, + KPS, + ALL_LAYOUT, + phi::SubtractRawKernel, + float, + double, + int16_t, + int, + int64_t, + float16, + bfloat16, + complex64, + complex128) {} +PD_REGISTER_KERNEL(subtract, + KPS, + ALL_LAYOUT, + phi::SubtractKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::float16, + complex64, + complex128, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/gpu/reduce_all_kernel.cu b/paddle/phi/kernels/kps/reduce_all_kernel.cu similarity index 87% rename from paddle/phi/kernels/gpu/reduce_all_kernel.cu rename to paddle/phi/kernels/kps/reduce_all_kernel.cu index 2963d3f206c2d..dc6355a213ffb 100644 --- a/paddle/phi/kernels/gpu/reduce_all_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_all_kernel.cu @@ -33,4 +33,8 @@ void AllRawKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL(all_raw, GPU, ALL_LAYOUT, phi::AllRawKernel, bool) {} +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(all_raw, KPS, ALL_LAYOUT, phi::AllRawKernel, bool) {} +#else +PD_REGISTER_KERNEL(all_raw, KPS, ALL_LAYOUT, phi::AllRawKernel, bool) {} +#endif diff --git a/paddle/phi/kernels/gpu/reduce_max_kernel.cu b/paddle/phi/kernels/kps/reduce_max_kernel.cu similarity index 87% rename from paddle/phi/kernels/gpu/reduce_max_kernel.cu rename to paddle/phi/kernels/kps/reduce_max_kernel.cu index 98c3986c51dd6..dd63b05bda1fb 100644 --- a/paddle/phi/kernels/gpu/reduce_max_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_max_kernel.cu @@ -33,5 +33,10 @@ void MaxRawKernel(const Context& dev_ctx, } // namespace phi +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(max_raw, KPS, ALL_LAYOUT, phi::MaxRawKernel, float) {} +#else PD_REGISTER_KERNEL( - max_raw, GPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} + max_raw, KPS, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} + +#endif diff --git a/paddle/phi/kernels/gpu/reduce_mean_kernel.cu b/paddle/phi/kernels/kps/reduce_mean_kernel.cu similarity index 91% rename from paddle/phi/kernels/gpu/reduce_mean_kernel.cu rename to paddle/phi/kernels/kps/reduce_mean_kernel.cu index 5a2cc8036a158..8e4a65df12263 100644 --- a/paddle/phi/kernels/gpu/reduce_mean_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_mean_kernel.cu @@ -33,10 +33,13 @@ void MeanRawKernel(const Context& dev_ctx, } // namespace phi +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(mean_raw, KPS, ALL_LAYOUT, phi::MeanRawKernel, float) {} +#else using float16 = phi::dtype::float16; PD_REGISTER_KERNEL(mean_raw, - GPU, + KPS, ALL_LAYOUT, phi::MeanRawKernel, float, @@ -45,3 +48,4 @@ PD_REGISTER_KERNEL(mean_raw, float16, int, int64_t) {} +#endif diff --git 
a/paddle/phi/kernels/gpu/reduce_min_kernel.cu b/paddle/phi/kernels/kps/reduce_min_kernel.cu similarity index 87% rename from paddle/phi/kernels/gpu/reduce_min_kernel.cu rename to paddle/phi/kernels/kps/reduce_min_kernel.cu index ba37d54895d0d..59d69c29decdf 100644 --- a/paddle/phi/kernels/gpu/reduce_min_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_min_kernel.cu @@ -33,5 +33,9 @@ void MinRawKernel(const Context& dev_ctx, } // namespace phi +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(min_raw, KPS, ALL_LAYOUT, phi::MinRawKernel, float) {} +#else PD_REGISTER_KERNEL( - min_raw, GPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} + min_raw, KPS, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} +#endif diff --git a/paddle/phi/kernels/gpu/reduce_sum_kernel.cu b/paddle/phi/kernels/kps/reduce_sum_kernel.cu similarity index 89% rename from paddle/phi/kernels/gpu/reduce_sum_kernel.cu rename to paddle/phi/kernels/kps/reduce_sum_kernel.cu index 28bdbd009bdae..6c039897ddd30 100644 --- a/paddle/phi/kernels/gpu/reduce_sum_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_sum_kernel.cu @@ -33,13 +33,18 @@ void SumRawKernel(const Context& dev_ctx, } // namespace phi +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(sum_raw, KPS, ALL_LAYOUT, phi::SumRawKernel, float) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} +#else using float16 = phi::dtype::float16; using bfloat16 = phi::dtype::bfloat16; using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; PD_REGISTER_KERNEL(sum_raw, - GPU, + KPS, ALL_LAYOUT, phi::SumRawKernel, bool, @@ -54,3 +59,4 @@ PD_REGISTER_KERNEL(sum_raw, complex128) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } +#endif diff --git a/paddle/phi/kernels/logspace_kernel.h b/paddle/phi/kernels/logspace_kernel.h new file mode 100644 index 0000000000000..59862514e78ae --- /dev/null +++ b/paddle/phi/kernels/logspace_kernel.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LogspaceKernel(const Context& ctx, + const DenseTensor& start, + const DenseTensor& stop, + const DenseTensor& number, + const DenseTensor& base, + DataType dtype, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h index 1f4ef2ed932e9..4d65dd6dd5d87 100644 --- a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h @@ -336,7 +336,7 @@ __device__ __forceinline__ void Reduce(T* out, out[i] = reducer(out[i], in[i * NX + j]); } } - BlockXReduce(out, reducer); + details::BlockXReduce(out, reducer); } else { // else kLocalMode #pragma unroll for (int i = 0; i < NY; ++i) { diff --git a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h index d2cfdbdec3064..a18fc7cbb3119 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h @@ -77,7 +77,7 @@ struct BroadcastConfig { #pragma pack() template -__device__ __forceinline__ void WriteData(T* _global_ptr_ dst, +__device__ __forceinline__ void WriteData(T _global_ptr_* dst, T* src, int num) { if (num > 0) { @@ -403,16 +403,17 @@ template -__device__ __forceinline__ void ReadDataReduce(Ty* dst, - const Tx* __restrict__ src, - int block_offset, - const IndexCal& index_cal, - int size_nx, - int size_ny, - int stride_nx, - int stride_ny, - Functor func, - bool reduce_last_dim) { +__device__ __forceinline__ void ReadDataReduce( + Ty* dst, + const Tx _global_ptr_* __restrict__ src, + int block_offset, + const IndexCal& index_cal, + int size_nx, + int size_ny, + int stride_nx, + int stride_ny, + Functor func, + bool reduce_last_dim) { __local__ Tx in_temp[1]; int thread_offset = 0; int left_idx = 0; diff --git a/paddle/phi/kernels/primitive/functor_primitives_xpu2.h b/paddle/phi/kernels/primitive/functor_primitives_xpu2.h index 8a21e61eaa7d0..b01e0474f2d02 100755 --- a/paddle/phi/kernels/primitive/functor_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/functor_primitives_xpu2.h @@ -25,6 +25,12 @@ namespace kps { */ template struct IdentityFunctor { +#ifdef PADDLE_WITH_XPU_KP + HOSTDEVICE inline IdentityFunctor() {} + HOSTDEVICE explicit inline IdentityFunctor(int n) {} + HOSTDEVICE Ty operator()(const Tx x) const { return static_cast(x); } + HOSTDEVICE inline void SetDiv(int n) {} +#else inline IdentityFunctor() {} explicit inline IdentityFunctor(int n) {} @@ -38,6 +44,7 @@ struct IdentityFunctor { return static_cast(x); } __device__ inline void SetDiv(int n) {} +#endif }; /** diff --git a/paddle/phi/kernels/selected_rows/elementwise_kernel.cc b/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc similarity index 96% rename from paddle/phi/kernels/selected_rows/elementwise_kernel.cc rename to paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc index 7fba3244a60ee..9fe8eef7ec82a 100644 --- a/paddle/phi/kernels/selected_rows/elementwise_kernel.cc +++ b/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
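The new LogspaceKernel declaration above takes start, stop, number and base tensors plus a dtype. Its expected behaviour, assumed here to mirror numpy.logspace since the patch only adds the header at this point, is easy to state in Python:

import numpy as np

def logspace(start, stop, num, base=10.0, dtype=np.float32):
    # Evenly space `num` exponents between start and stop, then raise `base`
    # to them and cast to the requested dtype.
    exponents = np.linspace(start, stop, num)
    return np.power(base, exponents).astype(dtype)

print(logspace(0.0, 3.0, 4))   # [   1.   10.  100. 1000.]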
*/ -#include "paddle/phi/kernels/selected_rows/elementwise_kernel.h" +#include "paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" namespace phi { namespace sr { diff --git a/paddle/phi/kernels/selected_rows/elementwise_kernel.h b/paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.h similarity index 100% rename from paddle/phi/kernels/selected_rows/elementwise_kernel.h rename to paddle/phi/kernels/selected_rows/elementwise_multiply_kernel.h diff --git a/paddle/phi/kernels/sparse/coalesced_kernel.h b/paddle/phi/kernels/sparse/coalesced_kernel.h new file mode 100644 index 0000000000000..0755579a57ade --- /dev/null +++ b/paddle/phi/kernels/sparse/coalesced_kernel.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void CoalescedKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out); + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc b/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc new file mode 100644 index 0000000000000..0ebddf9b683f0 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc @@ -0,0 +1,121 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/sparse/coalesced_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/funcs/sparse/flatten_indices.h" + +namespace phi { +namespace sparse { + +template +void CoalescedCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { + const DenseTensor& x_indices = x.non_zero_indices(); + const DenseTensor& x_values = x.non_zero_elements(); + DenseTensor out_indices = phi::EmptyLike(dev_ctx, x_indices); + DenseTensor out_values = phi::EmptyLike(dev_ctx, x_values); + + const int64_t sparse_dim = x.non_zero_indices().dims()[0]; + std::vector sparse_offsets(sparse_dim), x_indexs(x.nnz()); + phi::funcs::sparse::CalcOffsetsPerDim( + x.dims(), sparse_dim, sparse_offsets.data()); + + phi::funcs::sparse::FlattenIndices(x.non_zero_indices().data(), + sparse_offsets.data(), + x.nnz(), + sparse_dim, + 0, + 1, + x_indexs.data()); + + const T* x_values_ptr = x_values.data(); + const int64_t stride = + x.dims().size() == sparse_dim ? 1 : x.dims().size() - sparse_dim; + + std::map> indices_to_index; + for (uint64_t i = 0; i < x_indexs.size(); i++) { + IntT index = x_indexs[i]; + if (indices_to_index.find(index) == indices_to_index.end()) { + std::vector indexs; + indexs.push_back(i); + indices_to_index[index] = indexs; + } else { + indices_to_index[index].push_back(i); + } + } + + const int64_t out_nnz = indices_to_index.size(); + + out_indices.Resize({x_indices.dims()[0], out_nnz}); + if (out_values.dims().size() == 1) { + out_values.Resize(phi::make_ddim({out_nnz})); + } else { + out_values.Resize(phi::make_ddim({out_nnz, x_values.dims()[1]})); + } + + IntT* out_indices_ptr = out_indices.data(); + T* out_values_ptr = out_values.data(); + auto iter = indices_to_index.begin(); + + Dim const_dims; + for (int i = 0; i < x.dims().size(); i++) { + const_dims[i] = x.dims()[i]; + } + + for (int i = 0; iter != indices_to_index.end(); iter++, i++) { + phi::funcs::sparse::IndexToCoordinate( + iter->first, const_dims, out_nnz, sparse_dim, i, out_indices_ptr); + memcpy(out_values_ptr + i * stride, + x_values_ptr + iter->second[0] * stride, + stride * sizeof(T)); + for (uint64_t j = 1; j < iter->second.size(); j++) { + for (int k = 0; k < stride; k++) { + out_values_ptr[i * stride + k] += + x_values_ptr[iter->second[j] * stride + k]; + } + } + } + + out->SetMember(out_indices, out_values, x.dims(), true); +} + +template +void CoalescedKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "CoalescedCPUKernel", ([&] { + CoalescedCPUKernel(dev_ctx, x, out); + })); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sort, + CPU, + ALL_LAYOUT, + phi::sparse::CoalescedKernel, + float, + double, + phi::dtype::float16, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc index c10a240c68430..1508de407caa7 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc @@ -20,7 +20,9 @@ limitations under the License. 
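The CPU kernel above coalesces a COO tensor by flattening each coordinate to a single integer, grouping duplicates in a std::map, and summing their values. A compact numpy sketch of the same transformation (shapes and dtypes illustrative):

import numpy as np

def coalesce_coo(indices, values, shape):
    flat = np.ravel_multi_index(indices, shape)        # flatten the coordinates
    order = np.argsort(flat)                           # the std::map keeps keys sorted
    flat, indices, values = flat[order], indices[:, order], values[order]
    uniq, first = np.unique(flat, return_index=True)
    out_values = np.add.reduceat(values, first)        # sum duplicate entries
    return indices[:, first], out_values

idx = np.array([[0, 0, 1], [1, 1, 2]])                 # coordinate (0, 1) repeats
vals = np.array([1.0, 2.0, 3.0])
print(coalesce_coo(idx, vals, (2, 3)))                 # (0, 1) -> 3.0, (1, 2) -> 3.0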
*/ #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/sparse/common_shape.h" +#include "paddle/phi/kernels/funcs/sparse/flatten_indices.h" + +#include "paddle/phi/api/ext/dispatch.h" namespace phi { namespace sparse { @@ -56,10 +58,10 @@ void SparseMaskCPUKernel(const CPUContext& dev_ctx, std::vector out_indexs(non_zero_num), sparse_offsets(sparse_dim); phi::funcs::sparse::CalcOffsetsPerDim( - dims, sparse_dim, &sparse_offsets); + dims, sparse_dim, sparse_offsets.data()); for (int64_t i = 0; i < non_zero_num; i++) { - int64_t index = phi::funcs::sparse::IndicesToIndex( + int64_t index = phi::funcs::sparse::CoordinateToIndex( indices_ptr, sparse_offsets.data(), non_zero_num, sparse_dim, i); memcpy(out_values_ptr + i * cols, x_ptr + index * cols, cols * sizeof(T)); } @@ -98,7 +100,7 @@ void SparseMaskHelperCPUKernel(const CPUContext& dev_ctx, std::vector sparse_offsets(sparse_dim), x_indexs(x.nnz()), mask_indexs(mask_indices.dims()[1]); phi::funcs::sparse::CalcOffsetsPerDim( - x.dims(), sparse_dim, &sparse_offsets); + x.dims(), sparse_dim, sparse_offsets.data()); phi::funcs::sparse::FlattenIndices(x.non_zero_indices().data(), sparse_offsets.data(), diff --git a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu new file mode 100644 index 0000000000000..3ffcd28955a53 --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu @@ -0,0 +1,189 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
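The sparse_mask changes above switch CalcOffsetsPerDim to a raw-pointer output and rename IndicesToIndex to CoordinateToIndex. What those helpers compute is assumed here to be ordinary row-major strides over the sparse dimensions and their dot product with a coordinate:

def calc_offsets_per_dim(dims, sparse_dim):
    # offsets[i] is the row-major stride of sparse dimension i.
    offsets = [1] * sparse_dim
    for i in range(sparse_dim - 2, -1, -1):
        offsets[i] = offsets[i + 1] * dims[i + 1]
    return offsets

def coordinate_to_index(coord, offsets):
    return sum(c * o for c, o in zip(coord, offsets))

print(calc_offsets_per_dim([2, 3, 4], 2))    # [3, 1]
print(coordinate_to_index([1, 2], [3, 1]))   # 5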
*/ + +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/funcs/index_impl.cu.h" +#include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h" +#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" +#include "paddle/phi/kernels/funcs/sparse/utils.cu.h" +#include "paddle/phi/kernels/sparse/coalesced_kernel.h" + +namespace phi { +namespace sparse { + +template +void CoalescedGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { + const DenseTensor& x_indices = x.non_zero_indices(); + const DenseTensor& x_values = x.non_zero_elements(); + DenseTensor out_indices = phi::EmptyLike(dev_ctx, x_indices); + DenseTensor out_values = phi::EmptyLike(dev_ctx, x_values); + + const int64_t nnz = x.nnz(); + const int64_t sparse_dim = x.non_zero_indices().dims()[0]; + std::vector sparse_offsets(sparse_dim); + + phi::funcs::sparse::CalcOffsetsPerDim( + x.dims(), sparse_dim, sparse_offsets.data()); + + DenseTensorMeta sparse_offset_meta( + paddle::experimental::CppTypeToDataType::Type(), + {sparse_dim}, + DataLayout::NCHW); + DenseTensor d_sparse_offsets = + phi::Empty(dev_ctx, std::move(sparse_offset_meta)); + DenseTensor indexs = phi::Empty( + dev_ctx, DenseTensorMeta(x_indices.dtype(), {nnz}, x_indices.layout())); + IntT* indexs_ptr = indexs.data(); + + phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), + sparse_offsets.data(), + sizeof(IntT) * sparse_dim, +#ifdef PADDLE_WITH_HIP + hipMemcpyHostToDevice, +#else + cudaMemcpyHostToDevice, +#endif + dev_ctx.stream()); + + // 1. flatten indices + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz, 1); + phi::funcs::sparse::FlattenIndicesKernel<<>>( + x.non_zero_indices().data(), + d_sparse_offsets.data(), + indexs.numel(), + sparse_dim, + indexs_ptr); + + // 2. get the address of each non-zero values + const T* x_values_ptr = x_values.data(); + const int64_t stride = + x.dims().size() == sparse_dim ? 1 : x.dims().size() - sparse_dim; + DenseTensor values_indexs = phi::Empty( + dev_ctx, DenseTensorMeta(DataType::INT32, {nnz}, DataLayout::NCHW)); + int* values_indexs_ptr = values_indexs.data(); + DenseTensor public_indexs = phi::EmptyLike(dev_ctx, values_indexs); + + // values_indexs = [0,1,2,,,nnz-1] + phi::IndexKernel>( + dev_ctx, &values_indexs, kps::IdentityFunctor()); + phi::IndexKernel>( + dev_ctx, &public_indexs, kps::IdentityFunctor()); + +// 3. sort (indices, values index) +#ifdef PADDLE_WITH_HIP + thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()), +#endif + indexs_ptr, + indexs_ptr + nnz, + values_indexs_ptr); + + // 4. 
unique index + thrust::pair new_end = +#ifdef PADDLE_WITH_HIP + thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()), +#endif + indexs_ptr, + indexs_ptr + nnz, + public_indexs.data()); + + phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + indexs_ptr, new_end.first, out_indices.data()); + + IntT out_nnz = 0; + phi::backends::gpu::GpuMemcpyAsync(&out_nnz, + out_indices.data(), + sizeof(IntT), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + dev_ctx.Wait(); + + out_indices.Resize({x_indices.dims()[0], out_nnz}); + if (out_values.dims().size() == 1) { + out_values.Resize(phi::make_ddim({out_nnz})); + } else { + out_values.Resize(phi::make_ddim({out_nnz, x_values.dims()[1]})); + } + + // 5. scatter the values + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1); + phi::funcs::sparse::ScatterKernel<<>>( + x_values_ptr, + public_indexs.data(), + values_indexs_ptr, + out_nnz, + nnz, + stride, + out_values.data()); + + // 6. convert index to coordinate + Dim const_dims; + for (int i = 0; i < x.dims().size(); i++) { + const_dims[i] = x.dims()[i]; + } + + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); + phi::funcs::sparse::IndexToCoordinateKernel<<>>( + indexs_ptr, const_dims, out_nnz, sparse_dim, out_indices.data()); + + out->SetMember(out_indices, out_values, x.dims(), true); +} + +template +void CoalescedKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "CoalescedGPUKernel", ([&] { + CoalescedGPUKernel(dev_ctx, x, out); + })); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sort, + GPU, + ALL_LAYOUT, + phi::sparse::CoalescedKernel, + float, + double, + phi::dtype::float16, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 2396a5975de4e..fcbb3c60183eb 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/sparse/utils.cu.h" #include "paddle/phi/kernels/primitive/compute_primitives.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" @@ -60,46 +61,6 @@ __global__ void GatherKernel(const T* params, } } -/** - * brief: scatter add - * input: the inputs - * unique_value: refer to UpdateIndexKernel notes - * out_index: the output feature index - * non_zero_num: the number of output features - * rulebook_len: the length of rulebook - * channels: the output channel size - * out: the outputs -**/ -template -__global__ void ScatterKernel(const T* input, - const int* unique_value, - const int* out_index, - const int non_zero_num, - const int rulebook_len, - const int channels, - T* out, - const bool subm = false) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) { - int indices_i = i / channels; - int channels_i = i - indices_i * channels; - - int start = unique_value[indices_i]; - int end = indices_i == non_zero_num - 1 ? 
rulebook_len - : unique_value[indices_i + 1]; - // max(end-start) = kernel_size - T sum = static_cast(0); - if (subm) { - sum = out[indices_i * channels + channels_i]; - } - for (int j = start; j < end; j++) { - const int out_feature_i = out_index[j]; - sum += input[out_feature_i * channels + channels_i]; - } - out[indices_i * channels + channels_i] = sum; - } -} - template inline IntT* SortedAndUniqueIndex(const Context& dev_ctx, const IntT* rulebook_ptr, @@ -186,14 +147,6 @@ __global__ void UpdateIndexKernel(const T* unique_keys, } } -// brief: calculation the distance between start and end -template -__global__ void DistanceKernel(const T* start, const T* end, T* distance) { - if (threadIdx.x == 0) { - *distance = end - start; - } -} - template __global__ void UpdateOutIndexAndCounterAfterLowerBound( const IntT* x_indexs, @@ -402,7 +355,7 @@ int ProductRuleBook(const Context& dev_ctx, rulebook_ptr + rulebook_rows * rulebook_cols, -1); - DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1); IntT rulebook_len = 0; phi::backends::gpu::GpuMemcpyAsync( @@ -468,7 +421,7 @@ int ProductRuleBook(const Context& dev_ctx, rulebook_ptr, rulebook_ptr + 3 * rulebook_len, -1); - DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( rulebook_ptr, last, bound_ptr); phi::backends::gpu::GpuMemcpyAsync(&rulebook_len, bound_ptr, @@ -536,7 +489,7 @@ int ProductRuleBook(const Context& dev_ctx, // thrust::distance doesn't support stream parameters // const int out_non_zero_num = thrust::distance(unique_key_ptr, // new_end.first); - DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( unique_key_ptr, new_end, rulebook_ptr + rulebook_rows * rulebook_cols - 1); diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index ed9579fcd5b67..e54e39f5541d5 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" @@ -222,17 +223,18 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, rulebook_len * in_channels, 1); - ScatterKernel<<>>(d_x_features_ptr, - unique_value.data(), - out_index.data(), - x.nnz(), - rulebook_len, - in_channels, - x_grad_values_ptr, - subm); + phi::funcs::sparse::ScatterKernel<<>>( + d_x_features_ptr, + unique_value.data(), + out_index.data(), + x.nnz(), + rulebook_len, + in_channels, + x_grad_values_ptr, + subm); } template diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 93da65dc0f7d8..30f0482a0cc36 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -18,6 +18,7 @@ limitations under the License. 
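On the GPU the same coalescing avoids a map: flatten the coordinates, sort the (index, value-position) pairs with sort_by_key, find the distinct coordinates with unique_by_key, then scatter-add every duplicate into its output slot, with the ScatterKernel and DistanceKernel helpers now shared from funcs::sparse. A numpy sketch of that pipeline in spirit, not of the device code:

import numpy as np

def coalesce_gpu_style(flat_indices, values):
    order = np.argsort(flat_indices, kind="stable")            # sort_by_key
    uniq, inverse = np.unique(flat_indices[order],
                              return_inverse=True)             # unique_by_key
    out = np.zeros(uniq.shape[0], dtype=values.dtype)
    np.add.at(out, inverse, values[order])                     # scatter-add
    return uniq, out

print(coalesce_gpu_style(np.array([5, 1, 5]), np.array([1.0, 2.0, 3.0])))
# (array([1, 5]), array([2., 4.]))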
*/ #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" +#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" @@ -169,16 +170,17 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, } else { config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, out->nnz() * out_channels, 1); - ScatterKernel<<>>(out_features_ptr, - unique_value.data(), - out_index.data(), - out->nnz(), - n, - out_channels, - out_values_ptr); + phi::funcs::sparse::ScatterKernel<<>>( + out_features_ptr, + unique_value.data(), + out_index.data(), + out->nnz(), + n, + out_channels, + out_values_ptr); } } /** diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index dff1cc2318f13..4e2d12f33955e 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -23,7 +23,7 @@ limitations under the License. */ #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/sparse/common_shape.h" +#include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h" #include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" namespace phi { @@ -123,23 +123,6 @@ void SparseMaskKernel(const Context& dev_ctx, })); } -// TODO(zhangkaihuo): Use an op to realize the function of FlattenIndices -template -__global__ void FlattenIndicesKernel(const IntT* indices, - const IntT* sparse_offsets, - const int64_t non_zero_num, - const int64_t sparse_dim, - IntT* out) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - phi::funcs::sparse::FlattenIndices(indices, - sparse_offsets, - non_zero_num, - sparse_dim, - tid, - gridDim.x * blockDim.x, - out); -} - template __global__ void SparseMaskCopyKernel(const IntT* x_indexs, const IntT* mask_indexs, @@ -192,7 +175,8 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, IntT* bound_out_ptr = bound_out.data(); // 1. calc the offsets of per dim - phi::funcs::sparse::CalcOffsetsPerDim(x.dims(), sparse_dim, &sparse_offsets); + phi::funcs::sparse::CalcOffsetsPerDim( + x.dims(), sparse_dim, sparse_offsets.data()); // 2. copy sparse_offsets to device phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), sparse_offsets.data(), @@ -207,25 +191,27 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, // 3. flatten x indices and mask indices auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indexs.numel(), 1); - FlattenIndicesKernel<<>>(x.non_zero_indices().data(), - d_sparse_offsets.data(), - x_indexs.numel(), - sparse_dim, - x_indexs_ptr); + phi::funcs::sparse::FlattenIndicesKernel<<>>( + x.non_zero_indices().data(), + d_sparse_offsets.data(), + x_indexs.numel(), + sparse_dim, + x_indexs_ptr); config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1); - FlattenIndicesKernel<<>>(mask_indices.data(), - d_sparse_offsets.data(), - mask_indexs.numel(), - sparse_dim, - mask_indexs_ptr); + phi::funcs::sparse::FlattenIndicesKernel<<>>( + mask_indices.data(), + d_sparse_offsets.data(), + mask_indexs.numel(), + sparse_dim, + mask_indexs_ptr); // 4. 
call thrust::lower_bound #ifdef PADDLE_WITH_HIP thrust::lower_bound(thrust::hip::par.on(dev_ctx.stream()), diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index 8cf9c0a28648a..072e6f141f8f1 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/sparse/coalesced_kernel.h" namespace phi { namespace sparse { @@ -154,9 +155,9 @@ void SparseCooTensorKernel(const Context& dev_ctx, const DenseTensor& indices, const IntArray& dense_shape, SparseCooTensor* out) { - *out = - SparseCooTensor(indices, values, phi::make_ddim(dense_shape.GetData())); - // TODO(zhangkaihuo): sort and merge the dumplicate indices + SparseCooTensor before_coalesced( + indices, values, phi::make_ddim(dense_shape.GetData())); + CoalescedKernel(dev_ctx, before_coalesced, out); } } // namespace sparse diff --git a/paddle/phi/ops/compat/expand_sig.cc b/paddle/phi/ops/compat/expand_sig.cc index c3df1595a2108..b0f4ff79b4c5c 100644 --- a/paddle/phi/ops/compat/expand_sig.cc +++ b/paddle/phi/ops/compat/expand_sig.cc @@ -17,6 +17,11 @@ namespace phi { KernelSignature ExpandOpArgumentMapping(const ArgumentMappingContext& ctx) { + const auto& shape = paddle::any_cast>(ctx.Attr("shape")); + // Infer output shape by Attr("shape") in CompileTime if it is specified. + if (!ctx.IsRuntime() && !shape.empty()) { + return KernelSignature("expand", {"X"}, {"shape"}, {"Out"}); + } if (ctx.HasInput("Shape")) { return KernelSignature("expand", {"X"}, {"Shape"}, {"Out"}); } else if (ctx.InputSize("expand_shapes_tensor") > 0) { @@ -27,6 +32,12 @@ KernelSignature ExpandOpArgumentMapping(const ArgumentMappingContext& ctx) { } KernelSignature ExpandGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + const auto& shape = paddle::any_cast>(ctx.Attr("shape")); + // Infer output shape by Attr("shape") in CompileTime if it is specified. + if (!ctx.IsRuntime() && !shape.empty()) { + return KernelSignature( + "expand_grad", {"X", "Out@GRAD"}, {"shape"}, {"X@GRAD"}); + } if (ctx.HasInput("Shape")) { return KernelSignature( "expand_grad", {"X", "Out@GRAD"}, {"Shape"}, {"X@GRAD"}); diff --git a/paddle/phi/tests/api/test_data_transform.cc b/paddle/phi/tests/api/test_data_transform.cc index a2bd1f2cad9fc..21d5eef4098c0 100644 --- a/paddle/phi/tests/api/test_data_transform.cc +++ b/paddle/phi/tests/api/test_data_transform.cc @@ -37,13 +37,11 @@ namespace tests { // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, data_transform_same_place) { // 1. create tensor - auto x = paddle::experimental::full({3, 3}, - 1.0, - experimental::DataType::COMPLEX128, - experimental::CPUPlace()); + auto x = + paddle::experimental::full({3, 3}, 1.0, DataType::COMPLEX128, CPUPlace()); - auto y = paddle::experimental::full( - {3, 3}, 2.0, experimental::DataType::FLOAT32, experimental::CPUPlace()); + auto y = + paddle::experimental::full({3, 3}, 2.0, DataType::FLOAT32, CPUPlace()); std::vector> sum(9, 6.0); @@ -75,10 +73,10 @@ TEST(API, data_transform_same_place) { TEST(Tensor, data_transform_diff_place) { // 1. 
create tensor auto x = paddle::experimental::full( - {3, 3}, 1.0, experimental::DataType::FLOAT64, experimental::CPUPlace()); + {3, 3}, 1.0, experimental::DataType::FLOAT64, CPUPlace()); auto y = paddle::experimental::full( - {3, 3}, 2.0, experimental::DataType::FLOAT64, experimental::GPUPlace()); + {3, 3}, 2.0, experimental::DataType::FLOAT64, GPUPlace()); std::vector sum(9, 6.0); @@ -93,10 +91,9 @@ TEST(Tensor, data_transform_diff_place) { ASSERT_EQ(out.dtype(), phi::DataType::FLOAT64); ASSERT_EQ(out.layout(), phi::DataLayout::NCHW); ASSERT_EQ(out.initialized(), true); - ASSERT_EQ(out.impl()->place(), - phi::TransToPhiPlace(experimental::Backend::GPU)); + ASSERT_EQ(out.impl()->place(), phi::TransToPhiPlace(phi::Backend::GPU)); - auto ref_out = experimental::copy_to(out, experimental::CPUPlace(), true); + auto ref_out = experimental::copy_to(out, CPUPlace(), true); auto dense_out = std::dynamic_pointer_cast(ref_out.impl()); for (size_t i = 0; i < 9; i++) { diff --git a/paddle/phi/tests/api/test_elementwise_api.cc b/paddle/phi/tests/api/test_elementwise_api.cc index d4013a788c76c..fb4c68a87cb25 100644 --- a/paddle/phi/tests/api/test_elementwise_api.cc +++ b/paddle/phi/tests/api/test_elementwise_api.cc @@ -22,6 +22,9 @@ limitations under the License. */ #include "paddle/phi/core/kernel_registry.h" PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(subtract, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(multiply, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(divide, CPU, ALL_LAYOUT); namespace paddle { namespace tests { diff --git a/paddle/phi/tests/api/test_scale_benchmark.cc b/paddle/phi/tests/api/test_scale_benchmark.cc index ca4a264e511bd..e2870a780aeae 100644 --- a/paddle/phi/tests/api/test_scale_benchmark.cc +++ b/paddle/phi/tests/api/test_scale_benchmark.cc @@ -30,7 +30,7 @@ namespace tests { TEST(API, scale) { auto x = experimental::full( - {3, 4}, 1.0, experimental::DataType::FLOAT32, experimental::CPUPlace()); + {3, 4}, 1.0, experimental::DataType::FLOAT32, CPUPlace()); const size_t cycles = 300; phi::tests::Timer timer; diff --git a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc index 9552c02976f30..36b200d4d4494 100644 --- a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc @@ -16,7 +16,10 @@ limitations under the License. 
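The ExpandOpArgumentMapping change above adds one more branch to the selection order. Roughly, in Python (a sketch of the decision only; the real signatures come from KernelSignature):

def expand_arg_mapping(is_runtime, shape_attr, has_shape_input, num_shape_tensors):
    # A non-empty compile-time Attr("shape") now wins, then the Shape input,
    # then expand_shapes_tensor, and finally the attribute again at runtime.
    if not is_runtime and shape_attr:
        return ("expand", ["X"], ["shape"], ["Out"])
    if has_shape_input:
        return ("expand", ["X"], ["Shape"], ["Out"])
    if num_shape_tensors > 0:
        return ("expand", ["X"], ["expand_shapes_tensor"], ["Out"])
    return ("expand", ["X"], ["shape"], ["Out"])

print(expand_arg_mapping(False, [2, 3], True, 1))   # compile time: the attribute wins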
*/ #include #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/elementwise_add_kernel.h" +#include "paddle/phi/kernels/elementwise_divide_kernel.h" +#include "paddle/phi/kernels/elementwise_multiply_kernel.h" +#include "paddle/phi/kernels/elementwise_subtract_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/api/lib/utils/allocator.h" diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index 6634f5396ac74..2756e3b321150 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -44,11 +44,6 @@ function update_pd_ops() { cd ${PADDLE_ROOT}/tools/infrt/ python3 generate_pd_op_dialect_from_paddle_op_maker.py python3 generate_phi_kernel_dialect.py - # generate test model - cd ${PADDLE_ROOT} - mkdir -p ${PADDLE_ROOT}/build/models - python3 paddle/infrt/tests/models/abs_model.py ${PADDLE_ROOT}/build/paddle/infrt/tests/abs - python3 paddle/infrt/tests/models/resnet50_model.py ${PADDLE_ROOT}/build/models/resnet50/model } function init() { @@ -114,6 +109,14 @@ function create_fake_models() { # create multi_fc model, this will generate "multi_fc_model" python3 -m pip uninstall -y paddlepaddle python3 -m pip install *whl + + # generate test model + cd ${PADDLE_ROOT} + mkdir -p ${PADDLE_ROOT}/build/models + python3 paddle/infrt/tests/models/abs_model.py ${PADDLE_ROOT}/build/paddle/infrt/tests/abs + python3 paddle/infrt/tests/models/resnet50_model.py ${PADDLE_ROOT}/build/models/resnet50/model + python3 paddle/infrt/tests/models/efficientnet-b4/model.py ${PADDLE_ROOT}/build/models/efficientnet-b4/model + cd ${PADDLE_ROOT}/build python3 ${PADDLE_ROOT}/tools/infrt/fake_models/multi_fc.py python3 ${PADDLE_ROOT}/paddle/infrt/tests/models/linear.py diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 8b9bfcf46042f..a7a2592f971c5 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -436,7 +436,7 @@ echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -D -DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT="%TENSORRT_ROOT%" -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^ -DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -DCUB_PATH=%THIRD_PARTY_HOME%/cub ^ -DCUDA_TOOLKIT_ROOT_DIR="%CUDA_TOOLKIT_ROOT_DIR%" -DNEW_RELEASE_ALL=%NEW_RELEASE_ALL% -DNEW_RELEASE_PYPI=%NEW_RELEASE_PYPI% ^ --DNEW_RELEASE_JIT=%NEW_RELEASE_JIT% +-DNEW_RELEASE_JIT=%NEW_RELEASE_JIT% -DWITH_ONNXRUNTIME=%WITH_ONNXRUNTIME% cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^ @@ -445,7 +445,7 @@ cmake .. 
-G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_ -DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT="%TENSORRT_ROOT%" -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^ -DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -DCUB_PATH=%THIRD_PARTY_HOME%/cub ^ -DCUDA_TOOLKIT_ROOT_DIR="%CUDA_TOOLKIT_ROOT_DIR%" -DNEW_RELEASE_ALL=%NEW_RELEASE_ALL% -DNEW_RELEASE_PYPI=%NEW_RELEASE_PYPI% ^ --DNEW_RELEASE_JIT=%NEW_RELEASE_JIT% +-DNEW_RELEASE_JIT=%NEW_RELEASE_JIT% -DWITH_ONNXRUNTIME=%WITH_ONNXRUNTIME% goto:eof :cmake_error diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index be94fd162bc01..8c2ec1acf072a 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -89,6 +89,7 @@ from .tensor.creation import diagflat # noqa: F401 from .tensor.creation import eye # noqa: F401 from .tensor.creation import linspace # noqa: F401 +from .tensor.creation import logspace # noqa: F401 from .tensor.creation import ones # noqa: F401 from .tensor.creation import ones_like # noqa: F401 from .tensor.creation import zeros # noqa: F401 @@ -592,6 +593,7 @@ 'sqrt', 'randperm', 'linspace', + 'logspace', 'reshape', 'reshape_', 'reverse', diff --git a/python/paddle/distributed/auto_parallel/dist_loader.py b/python/paddle/distributed/auto_parallel/dist_loader.py index 187c7cc02855f..9449b52952cd8 100644 --- a/python/paddle/distributed/auto_parallel/dist_loader.py +++ b/python/paddle/distributed/auto_parallel/dist_loader.py @@ -97,15 +97,19 @@ def data_generator(): if not isinstance(data, list): data = to_list(data) - if batch_data is None: - batch_data = [[] for i in range(len(data))] + if self.batch_size == 1: + yield data + batch_data = None + else: + if batch_data is None: + batch_data = [[] for i in range(len(data))] - for idx in range(len(data)): - batch_data[idx].append(data[idx]) + for idx in range(len(data)): + batch_data[idx].append(data[idx]) - if (step + 1) % self.batch_size == 0: - yield batch_data - batch_data = None + if (step + 1) % self.batch_size == 0: + yield batch_data + batch_data = None dataloader = paddle.fluid.io.DataLoader.from_generator( feed_list=self.feed_list, capacity=70, iterable=False) diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index c71ca9b7c6af9..a5fec789dfb37 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -194,6 +194,9 @@ def _parallel_program(self, mode, rank): self._apply_post_optimization(dist_main_prog, dist_startup_prog, rank, dist_params_grads) else: + # Apply pre optimization passes + self._apply_pre_optimization(serial_main_program, + serial_startup_program, None, None) # Do logical partition partitioner = Partitioner(dist_context, rank) dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( @@ -231,15 +234,24 @@ def _generate_optimizer(self, main_program, startup_program, params_grads): def _apply_pre_optimization(self, main_program, startup_program, loss, params_grads): + # apply amp pass if self.strategy.amp: config = copy.deepcopy(self.strategy.amp_configs) config["dist_context"] = self._dist_contexts[self.mode] config["params_grads"] = params_grads config["loss"] = loss - auto_parallel_amp_pass = new_pass("auto_parallel_amp", config) - auto_parallel_amp_pass.apply([main_program], [startup_program], - self._pass_contexts[self.mode]) + config["input_data"] = self._feed_vars[self.mode][ + "inputs"] + self._feed_vars[self.mode]["labels"] + if 
config["use_pure_fp16"]: + config["base_opt"] = self._optimizer + auto_parallel_fp16_pass = new_pass("auto_parallel_fp16", config) + auto_parallel_fp16_pass.apply( + [main_program], [startup_program], self._pass_context) + else: + auto_parallel_amp_pass = new_pass("auto_parallel_amp", config) + auto_parallel_amp_pass.apply([main_program], [startup_program], + self._pass_context) # apply recompute pass if self.strategy.recompute: diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index db6f909f8ca7d..3f06b34b53ed9 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -28,3 +28,7 @@ from . import dist_update_loss_scaling from . import dist_split from . import dist_fill_constant_batch_size_like +from . import dist_pnorm +from . import dist_slice +from . import dist_fused_feedforward +from . import dist_fused_attention diff --git a/python/paddle/distributed/auto_parallel/operators/dist_fused_attention.py b/python/paddle/distributed/auto_parallel/operators/dist_fused_attention.py new file mode 100644 index 0000000000000..bc3992ec03d4b --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_fused_attention.py @@ -0,0 +1,211 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl +from ..utils import is_dim_shard, is_dim_replicate +from ..utils import is_valid_list_index +from ..utils import compute_compatible_dim_mapping +from ..utils import compute_compatible_dims_mapping +from ..utils import compute_compatible_and_update_dim_mapping +from .dist_default import DistributedDefaultImpl0 +from ..utils import _get_comm_group, _get_corresponding_rank +from ..process_group import new_process_group + + +class DistributedFusedAttention(DistributedOperatorImplContainer): + def __init__(self, op_type): + super(DistributedFusedAttention, self).__init__(op_type) + + +register_distributed_operator_impl_container( + DistributedFusedAttention("fused_attention")) + + +class DistributedFusedAttentionImpl(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedFusedAttentionImpl, self).__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + qkv_w = op_desc.input('QKVW')[0] + qkv_bias = op_desc.input('QKVBias')[0] + out_w = op_desc.input('OutLinearW')[0] + out_bias = op_desc.input('OutLinearBias')[0] + + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + qkv_w_dims_mapping = op_dist_attr.get_input_dims_mapping(qkv_w) + qkv_bias_dims_mapping = op_dist_attr.get_input_dims_mapping(qkv_bias) + out_w_dims_mapping = op_dist_attr.get_input_dims_mapping(out_w) + out_bias_dims_mapping = op_dist_attr.get_input_dims_mapping(out_bias) + + head_axis = 1 + for mapping in x_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + if len(qkv_w_dims_mapping) != 4 or is_dim_replicate(qkv_w_dims_mapping[ + head_axis]): + return False + if len(qkv_bias_dims_mapping) != 3 or is_dim_replicate( + qkv_bias_dims_mapping[head_axis]): + return False + if is_dim_replicate(out_w_dims_mapping[0]): + return False + if is_dim_shard(out_bias_dims_mapping[-1]): + return False + + replicated_dims = [ + qkv_w_dims_mapping[0], qkv_w_dims_mapping[-2], + qkv_w_dims_mapping[-1], qkv_bias_dims_mapping[0], + qkv_bias_dims_mapping[-1], out_w_dims_mapping[-1], + out_bias_dims_mapping[-1] + ] + for mapping in replicated_dims: + if is_dim_shard(mapping): + return False + if qkv_bias_dims_mapping[head_axis] != qkv_w_dims_mapping[head_axis]: + return False + if qkv_bias_dims_mapping[head_axis] != out_w_dims_mapping[0]: + return False + + return True + + def is_output_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + + # none of output should be sharded + for out_name in op_desc.output_names(): + out = op_desc.output(out_name)[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out) + for mapping in out_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + return True + + def is_auto_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)): + return False + + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_names = op_desc.output('Y') + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + for out_name in out_names: + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + if x_dims_mapping != 
out_dims_mapping: + return False + + return True + + def update_dims_mapping(self, dist_op): + changed = False + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_names = op_desc.output('Y') + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + + for out_name in out_names: + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + for i in range(len(x_dims_mapping)): + dim_changed = compute_compatible_and_update_dim_mapping( + [x_dims_mapping, out_dims_mapping], [i, i]) + if dim_changed: + changed = True + + return changed + + @staticmethod + def forward(ctx, *args, **kwargs): + + dist_op_context = ctx.dist_op_context + main_block = dist_op_context.work_block + startup_block = dist_op_context.startup_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id + op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) + + if rank_id not in op_dist_attr.process_mesh.processes: + rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh, + rank_id) + + # infer logic comm presentation + head_axis = 1 + qkv_w = src_op.input('QKVW')[0] + qkv_w_col_dim_mapping = op_dist_attr.get_input_dims_mapping(qkv_w)[ + head_axis] + assert qkv_w_col_dim_mapping >= 0, "col_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format( + qkv_w_col_dim_mapping) + process_mesh_shape = op_dist_attr.process_mesh.topology + process_mesh_group = op_dist_attr.process_mesh.processes + + parallel_axis = qkv_w_col_dim_mapping + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + parallel_axis, rank_id) + group = new_process_group(group_ranks) + + # insert op + DistributedDefaultImpl0.forward(ctx, *args, **kwargs) + + # setting comm id + new_op = main_block.ops[-1] + assert new_op.type == "fused_attention" + new_op._set_attr("ring_id", int(group.id)) + + @staticmethod + def backward(ctx, *args, **kwargs): + dist_op_context = ctx.dist_op_context + main_block = dist_op_context.work_block + startup_block = dist_op_context.startup_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id + op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) + + if rank_id not in op_dist_attr.process_mesh.processes: + rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh, + rank_id) + + # infer logic comm presentation + out_w = src_op.input('OutLinearW')[0] + out_w_col_dim_mapping = op_dist_attr.get_input_dims_mapping(out_w)[-1] + assert out_w_col_dim_mapping >= 0, "col_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format( + out_w_col_dim_mapping) + process_mesh_shape = op_dist_attr.process_mesh.topology + process_mesh_group = op_dist_attr.process_mesh.processes + + parallel_axis = out_w_col_dim_mapping + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + parallel_axis, rank_id) + group = new_process_group(group_ranks) + + # insert op + DistributedDefaultImpl0.backward(ctx, *args, **kwargs) + + # setting comm id + new_op = main_block.ops[-1] + assert new_op.type == "fused_attention_grad" + new_op._set_attr("ring_id", int(group.id)) + + +register_distributed_operator_impl( + "fused_attention", DistributedFusedAttentionImpl("tensor_parallel")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_fused_feedforward.py b/python/paddle/distributed/auto_parallel/operators/dist_fused_feedforward.py new file mode 100644 index 0000000000000..76f526adbbfaa --- /dev/null +++ 
b/python/paddle/distributed/auto_parallel/operators/dist_fused_feedforward.py @@ -0,0 +1,203 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl +from ..utils import is_dim_shard, is_dim_replicate +from ..utils import is_valid_list_index +from ..utils import compute_compatible_dim_mapping +from ..utils import compute_compatible_dims_mapping +from ..utils import compute_compatible_and_update_dim_mapping +from .dist_default import DistributedDefaultImpl0 +from ..utils import _get_comm_group, _get_corresponding_rank +from ..process_group import new_process_group + + +class DistributedFusedFeedForward(DistributedOperatorImplContainer): + def __init__(self, op_type): + super(DistributedFusedFeedForward, self).__init__(op_type) + + +register_distributed_operator_impl_container( + DistributedFusedFeedForward("fused_feedforward")) + + +class DistributedFusedFeedForwardImpl(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedFusedFeedForwardImpl, self).__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + linear1_weight = op_desc.input('Linear1Weight')[0] + linear1_bias = op_desc.input('Linear1Bias')[0] + linear2_weight = op_desc.input('Linear2Weight')[0] + linear2_bias = op_desc.input('Linear2Bias')[0] + + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + linear1_weight_dims_mapping = op_dist_attr.get_input_dims_mapping( + linear1_weight) + linear1_bias_dims_mapping = op_dist_attr.get_input_dims_mapping( + linear1_bias) + linear2_weight_dims_mapping = op_dist_attr.get_input_dims_mapping( + linear2_weight) + linear2_bias_dims_mapping = op_dist_attr.get_input_dims_mapping( + linear2_bias) + + for mapping in x_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return False + if is_dim_shard(linear1_weight_dims_mapping[-2]) or is_dim_replicate( + linear1_weight_dims_mapping[-1]): + return False + if is_dim_replicate(linear1_bias_dims_mapping[-1]): + return False + if is_dim_replicate(linear2_weight_dims_mapping[-2]) or is_dim_shard( + linear2_weight_dims_mapping[-1]): + return False + if is_dim_shard(linear2_bias_dims_mapping[-1]): + return False + if linear1_weight_dims_mapping[-1] != linear2_weight_dims_mapping[-2]: + return False + + return True + + def is_output_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + + # none of output should be sharded + for out_name in op_desc.output_names(): + out = op_desc.output(out_name)[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out) + for mapping in out_dims_mapping[1:-1]: + if is_dim_shard(mapping): + return 
False + return True + + def is_auto_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)): + return False + + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_names = op_desc.output('Out') + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + for out_name in out_names: + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + if x_dims_mapping != out_dims_mapping: + return False + + return True + + def update_dims_mapping(self, dist_op): + changed = False + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_names = op_desc.output('Out') + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + + for out_name in out_names: + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + for i in range(len(x_dims_mapping)): + dim_changed = compute_compatible_and_update_dim_mapping( + [x_dims_mapping, out_dims_mapping], [i, i]) + if dim_changed: + changed = True + + return changed + + @staticmethod + def forward(ctx, *args, **kwargs): + + dist_op_context = ctx.dist_op_context + main_block = dist_op_context.work_block + startup_block = dist_op_context.startup_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id + op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) + + if rank_id not in op_dist_attr.process_mesh.processes: + rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh, + rank_id) + + # infer logic comm presentation + linear1_weight = src_op.input('Linear1Weight')[0] + linear1_weight_col_dim_mapping = op_dist_attr.get_input_dims_mapping( + linear1_weight)[-1] + assert linear1_weight_col_dim_mapping >= 0, "col_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format( + linear1_weight_col_dim_mapping) + process_mesh_shape = op_dist_attr.process_mesh.topology + process_mesh_group = op_dist_attr.process_mesh.processes + + parallel_axis = linear1_weight_col_dim_mapping + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + parallel_axis, rank_id) + group = new_process_group(group_ranks) + + # insert op + DistributedDefaultImpl0.forward(ctx, *args, **kwargs) + + # setting comm id + new_op = main_block.ops[-1] + assert new_op.type == "fused_feedforward" + new_op._set_attr("ring_id", int(group.id)) + + @staticmethod + def backward(ctx, *args, **kwargs): + + dist_op_context = ctx.dist_op_context + main_block = dist_op_context.work_block + startup_block = dist_op_context.startup_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id + op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) + + if rank_id not in op_dist_attr.process_mesh.processes: + rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh, + rank_id) + + # infer logic comm presentation + linear2_weight = src_op.input('Linear2Weight')[0] + linear2_weight_col_dim_mapping = op_dist_attr.get_input_dims_mapping( + linear2_weight)[-1] + assert linear2_weight_col_dim_mapping >= 0, "col_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format( + linear2_weight_col_dim_mapping) + process_mesh_shape = op_dist_attr.process_mesh.topology + process_mesh_group = op_dist_attr.process_mesh.processes + + parallel_axis = linear2_weight_col_dim_mapping + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + parallel_axis, rank_id) + group = 
new_process_group(group_ranks) + + # insert op + DistributedDefaultImpl0.backward(ctx, *args, **kwargs) + + # setting comm id + new_op = main_block.ops[-1] + assert new_op.type == "fused_feedforward_grad" + new_op._set_attr("ring_id", int(group.id)) + + +register_distributed_operator_impl( + "fused_feedforward", DistributedFusedFeedForwardImpl("tensor_parallel")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py b/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py new file mode 100644 index 0000000000000..ce68e2060218d --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py @@ -0,0 +1,363 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import paddle +import paddle.fluid.layers.utils as utils + +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl +from .common import set_comm_op_dist_attr_for_program +from .dist_default import DistributedDefaultImpl0 +from ..reshard import Resharder +from ..process_group import new_process_group +from ..utils import is_dim_shard, is_dim_replicate, _get_corresponding_rank +from ..utils import compute_compatible_dim_mapping, set_dist_op_desc_original_id, _get_comm_group +from ..dist_attribute import TensorDistributedAttribute, OperatorDistributedAttribute + +from paddle.fluid import core, unique_name +from paddle.fluid.framework import Operator +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype + + +class DistributedPNorm(DistributedOperatorImplContainer): + def __init__(self, op_type): + super(DistributedPNorm, self).__init__(op_type) + + +register_distributed_operator_impl_container(DistributedPNorm("p_norm")) + + +def _insert_fill_constant_op(block, op_role): + """Insert fill constant op into block at the given index.""" + helper = LayerHelper("fill_constant", **locals()) + with paddle.static.program_guard(block.program): + out = helper.create_variable_for_type_inference(dtype="int32") + inputs = {} + attrs = {'force_cpu': False} + attrs['str_value'] = str(int("1")) + attrs['value'] = int("1") + attrs['dtype'] = out.dtype + attrs['op_role'] = op_role + utils.get_shape_tensor_inputs( + inputs=inputs, attrs=attrs, shape=[0], op_type='fill_constant') + fill_constant_op = block.append_op( + type='fill_constant', + inputs=inputs, + outputs={'Out': [out]}, + attrs=attrs) + out.stop_gradient = True + return out, fill_constant_op + + +# Row Parallel +class DistributedPNormImpl(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedPNormImpl, self).__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = 
op_desc.input('X')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + if is_dim_replicate(x_dims_mapping[0]): + return False + # Other dimensions must be replicate except the batch dimension + for mapping in x_dims_mapping[1:]: + if is_dim_shard(mapping): + return False + return True + + def is_output_compatible(self, dist_op): + return True + + def is_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)): + return False + return True + + def is_auto_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)) or \ + (not self.is_compatible(dist_op)): + return False + return True + + def update_dims_mapping(self, dist_op): + changed = False + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + + batch_dim_mappings = [] + for arg_name in op_desc.input_arg_names(): + dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) + for arg_name in op_desc.output_arg_names(): + dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) + + compatible_dim_mapping = compute_compatible_dim_mapping( + batch_dim_mappings) + assert compatible_dim_mapping is not None, "There is no compatible dim mapping." + + for arg_name in op_desc.input_arg_names(): + dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) + if len(dims_mapping + ) >= 1 and compatible_dim_mapping != dims_mapping[0]: + dims_mapping[0] = compatible_dim_mapping + changed = True + for arg_name in op_desc.output_arg_names(): + dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) + if len(dims_mapping + ) >= 1 and compatible_dim_mapping != dims_mapping[0]: + dims_mapping[0] = compatible_dim_mapping + changed = True + + return changed + + @staticmethod + def forward(ctx, *args, **kwargs): + + dist_op_context = ctx.dist_op_context + main_block = dist_op_context.work_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id + op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) + assert op_dist_attr is not None + + # check validation of inputs / outputs + for input_name in src_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + src_op.desc.input(input_name) + ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in src_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + output_name) + assert len(kwargs[output_name]) == len( + src_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + if rank_id not in op_dist_attr.process_mesh.processes: + rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh, + rank_id) + + X_var = main_block.var(kwargs['X'][0]) + in_dims_mapping = op_dist_attr.get_input_dims_mapping(X_var.name) + for axis in range(len(in_dims_mapping)): + if in_dims_mapping[axis] != -1: + break + process_mesh_shape = op_dist_attr.process_mesh.topology + process_mesh_group = op_dist_attr.process_mesh.processes + group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape, + axis, rank_id) + group = new_process_group(group_ranks) + + check_variable_and_dtype(X_var, 'x', ['float16', 'float32', 'float64'], + 'norm') + check_dtype(X_var.dtype, 'dtype', ['float16', 
'float32', 'float64'], + 'norm') + + # 1. insert barrier op + ref_process_mesh = op_dist_attr.process_mesh + constant_out_dims_mapping = [-1] + fill_constant_out, fill_constant_op = _insert_fill_constant_op( + main_block, src_op.attr('op_role')) + # set fill_constant_out tensor dist_attr + constant_out_dist_attr = TensorDistributedAttribute() + constant_out_dist_attr.process_mesh = ref_process_mesh + constant_out_dist_attr.dims_mapping = constant_out_dims_mapping + ctx.set_tensor_dist_attr_for_program(fill_constant_out, + constant_out_dist_attr) + # set fill_constant op dist_attr + constant_op_dist_attr = OperatorDistributedAttribute() + constant_op_dist_attr.process_mesh = ref_process_mesh + constant_op_dist_attr.set_output_dims_mapping(fill_constant_out.name, + constant_out_dims_mapping) + ctx.set_op_dist_attr_for_program(fill_constant_op, + constant_op_dist_attr) + barrier_op = main_block.append_op( + type='barrier', + inputs={'X': [fill_constant_out]}, + outputs={'Out': [fill_constant_out]}, + attrs={'ring_id': group.id}) + # set barrier op dist attr + set_comm_op_dist_attr_for_program(barrier_op, ref_process_mesh, + constant_out_dist_attr, ctx) + + # 2. insert c_allgather op + # create c_allgather output var + allgather_out = main_block.create_var( + name=".".join(["c_allgather", X_var.name]), + dtype=X_var.dtype, + shape=X_var.shape, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=X_var.stop_gradient) + # set allgather_out tensor dist_attr + allgather_out_dist_attr = TensorDistributedAttribute() + allgather_out_dist_attr.process_mesh = op_dist_attr.process_mesh + allgather_out_dist_attr.dims_mapping = [ + -1 for i in range(len(allgather_out.shape)) + ] + ctx.set_tensor_dist_attr_for_program(allgather_out, + allgather_out_dist_attr) + c_allgather_op = main_block.append_op( + type='c_allgather', + inputs={'X': [X_var]}, + outputs={'Out': [allgather_out]}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + 'nranks': group.nranks, + 'op_role': src_op.attr('op_role') + }) + # set c_allgather op dist_attr + allgather_op_dist_attr = OperatorDistributedAttribute() + allgather_op_dist_attr.process_mesh = op_dist_attr.process_mesh + allgather_op_dist_attr.set_input_dims_mapping(X_var.name, + in_dims_mapping) + allgather_op_dist_attr.set_output_dims_mapping( + allgather_out.name, allgather_out_dist_attr.dims_mapping) + ctx.set_op_dist_attr_for_program(c_allgather_op, allgather_op_dist_attr) + + # 3. 
copy p_norm op desc and reset input name + # rename input + kwargs['X'] = [allgather_out.name] + # replicate op in dist program + dist_op_desc = main_block.desc.append_op() + dist_op_desc.copy_from(src_op.desc) + set_dist_op_desc_original_id(dist_op_desc, src_op.desc, ctx) + for input_name in src_op.desc.input_names(): + dist_op_desc.set_input(input_name, kwargs[input_name]) + for output_name in src_op.desc.output_names(): + dist_op_desc.set_output(output_name, kwargs[output_name]) + pnorm_op = Operator(main_block, dist_op_desc) + op_dist_attr.set_input_dims_mapping( + allgather_out.name, allgather_out_dist_attr.dims_mapping) + ctx.set_op_dist_attr_for_program(pnorm_op, op_dist_attr) + + main_block._sync_with_cpp() + + @staticmethod + def backward(ctx, *args, **kwargs): + + dist_op_context = ctx.dist_op_context + main_block = dist_op_context.work_block + backward_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id + op_dist_attr = ctx.get_op_dist_attr_for_program(backward_op) + assert op_dist_attr is not None + + # check validation of inputs / outputs + for input_name in backward_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + backward_op.desc.input(input_name) + ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in backward_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + output_name) + assert len(kwargs[output_name]) == len( + backward_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + X_var = main_block.var(kwargs['X'][0]) + X_grad_var = main_block.var(kwargs['X@GRAD'][0]) + + # 1. copy p_norm_grad op and reset input name and output name + new_kwargs = copy.deepcopy(kwargs) + new_kwargs['X'] = [".".join(["c_allgather", X_var.name])] + new_X_var = main_block.var(new_kwargs['X'][0]) + new_X_grad = main_block.create_var( + name=".".join(["c_allgather", X_grad_var.name]), + dtype=X_grad_var.dtype, + shape=new_X_var.shape, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=X_grad_var.stop_gradient) + new_kwargs['X@GRAD'] = [new_X_grad.name] + new_X_var_dist_attr = ctx.get_tensor_dist_attr_for_program(new_X_var) + ctx.set_tensor_dist_attr_for_program(new_X_grad, new_X_var_dist_attr) + # replicate op in dist program with new kwargs + dist_op_desc = main_block.desc.append_op() + dist_op_desc.copy_from(backward_op.desc) + # Refer to the related dist op + set_dist_op_desc_original_id(dist_op_desc, backward_op.desc, ctx) + for input_name in backward_op.desc.input_names(): + dist_op_desc.set_input(input_name, new_kwargs[input_name]) + for output_name in backward_op.desc.output_names(): + dist_op_desc.set_output(output_name, new_kwargs[output_name]) + p_norm_grad_op = Operator(main_block, dist_op_desc) + op_dist_attr.set_input_dims_mapping(new_X_var.name, + new_X_var_dist_attr.dims_mapping) + op_dist_attr.set_output_dims_mapping(new_X_grad.name, + new_X_var_dist_attr.dims_mapping) + ctx.set_op_dist_attr_for_program(p_norm_grad_op, op_dist_attr) + main_block._sync_with_cpp() + + # 2. 
insert slice op + process_mesh_shape = op_dist_attr.process_mesh.topology + process_mesh_group = op_dist_attr.process_mesh.processes + dims_mapping = [0] + [-1 for _ in range(len(new_X_grad.shape) - 1)] + partition_idx = Resharder.compute_partition_index( + rank_id, new_X_grad.shape, dims_mapping, process_mesh_shape, + process_mesh_group) + slice_starts = [] + slice_ends = [] + slices_axes = [] + for idx, item in enumerate(partition_idx): + slice_starts.append(item[0]) + slice_ends.append(item[1]) + slices_axes.append(idx) + + infer_flags = list(1 for i in range(len(slices_axes))) + attrs = { + "axes": slices_axes, + "starts": slice_starts, + "ends": slice_ends, + "infer_flags": infer_flags, + "op_role": backward_op.attr('op_role') + } + slice_op = main_block.append_op( + type='slice', + inputs={'Input': [new_X_grad]}, + outputs={'Out': [X_grad_var]}, + attrs=attrs) + X_grad_var_dims_mapping = op_dist_attr.get_output_dims_mapping( + X_grad_var.name) + slice_op_dist_attr = OperatorDistributedAttribute() + slice_op_dist_attr.process_mesh = op_dist_attr.process_mesh + slice_op_dist_attr.set_input_dims_mapping( + new_X_grad.name, new_X_var_dist_attr.dims_mapping) + slice_op_dist_attr.set_output_dims_mapping(X_grad_var.name, + X_grad_var_dims_mapping) + ctx.set_op_dist_attr_for_program(slice_op, slice_op_dist_attr) + main_block._sync_with_cpp() + + +register_distributed_operator_impl("p_norm", + DistributedPNormImpl("row_parallel")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py index baae2711dcf48..da6ad933fd514 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py @@ -349,7 +349,159 @@ def backward(ctx, *args, **kwargs): DistributedDefaultImpl0.backward(ctx, *args, **kwargs) +class DistributedReshapeImpl2(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedReshapeImpl2, self).__init__(name) + self._forward_implemented = True + self._backward_implemented = False + + def is_input_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_name = op_desc.output('Out')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + + if len(x_dims_mapping) != len(out_dims_mapping): + return False + + return True + + def is_output_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + out_name = op_desc.output('Out')[0] + x_name = op_desc.input('X')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + + if len(x_dims_mapping) != len(out_dims_mapping): + return False + + return True + + def is_auto_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)): + return False + + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_name = op_desc.output('Out')[0] + x_shape_name = op_desc.output('XShape')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + x_shape_dims_mapping = op_dist_attr.get_output_dims_mapping( + x_shape_name) + + for idx, item in enumerate(x_dims_mapping[:-1]): + if out_dims_mapping[idx] != item: + 
return False + + if x_shape_dims_mapping[0] != -1: + return False + + if x_shape_dims_mapping[1:] != out_dims_mapping[:]: + return False + + return True + + def update_dims_mapping(self, dist_op): + changed = False + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_name = op_desc.output('Out')[0] + x_shape_name = op_desc.output('XShape')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + x_shape_dims_mapping = op_dist_attr.get_output_dims_mapping( + x_shape_name) + + for i in range(len(out_dims_mapping) - 1): + dim_changed = compute_compatible_and_update_dim_mapping( + [x_dims_mapping, out_dims_mapping], [i, i]) + if dim_changed: + changed = True + + for i in range(len(out_dims_mapping)): + x_shape_dims_mapping[i + 1] = out_dims_mapping[i] + + return changed + + @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + + dist_op_context = ctx.dist_op_context + main_block = dist_op_context.work_block + src_op = dist_op_context.cur_src_op + op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) + assert op_dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(src_op)) + + # check validation of inputs / outputs + for input_name in src_op.desc.input_names(): + assert input_name in kwargs, "input [{}] is not given".format( + input_name) + assert len(kwargs[input_name]) == len( + src_op.desc.input(input_name) + ), "number of tensor for input [{}] is not match".format(input_name) + for output_name in src_op.desc.output_names(): + assert output_name in kwargs, "input [{}] is not given".format( + output_name) + assert len(kwargs[output_name]) == len( + src_op.desc.output(output_name) + ), "number of tensor for input [{}] is not match".format( + output_name) + + X_var = main_block.var(kwargs['X'][0]) + Out_var = main_block.var(kwargs['Out'][0]) + XShape_var = main_block.var(kwargs['XShape'][0]) + shape_list = src_op.desc.attr("shape") + ShapeTensor_var_list = [] + for name in kwargs['ShapeTensor']: + ShapeTensor_var_list.append(name) + Shape_var_list = [] + for name in kwargs['Shape']: + Shape_var_list.append(name) + + # got dist attribute info + out_dim_mapping = op_dist_attr.get_output_dims_mapping(Out_var.name) + process_mesh_shape = op_dist_attr.process_mesh.topology + + # modify target shape + for idx, axis in enumerate(out_dim_mapping): + if axis >= 0: + if len(shape_list) > idx: + shape_list[idx] = shape_list[idx] // process_mesh_shape[ + axis] + + # create op + new_op_desc = main_block.desc.append_op() + new_op_desc.copy_from(src_op.desc) + set_dist_op_desc_original_id(new_op_desc, src_op.desc, ctx) + new_op_desc.set_input('ShapeTensor', ShapeTensor_var_list) + new_op_desc.set_input('Shape', Shape_var_list) + new_op_desc.set_input('X', [X_var.name]) + new_op_desc.set_output('XShape', [XShape_var.name]) + new_op_desc.set_output('Out', [Out_var.name]) + new_op_desc._set_attr('shape', shape_list) + + main_block._sync_with_cpp() + + @staticmethod + def backward(ctx, *args, **kwargs): + DistributedDefaultImpl0.backward(ctx, *args, **kwargs) + + register_distributed_operator_impl("reshape2", DistributedReshapeImpl0("add_one_dim_back")) register_distributed_operator_impl( "reshape2", DistributedReshapeImpl1("remove_one_dim_back")) +register_distributed_operator_impl("reshape2", + DistributedReshapeImpl2("same_dim_shape")) diff --git 
a/python/paddle/distributed/auto_parallel/operators/dist_slice.py b/python/paddle/distributed/auto_parallel/operators/dist_slice.py new file mode 100644 index 0000000000000..4bc0a471dcf1c --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_slice.py @@ -0,0 +1,122 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl +from ..utils import is_dim_shard +from ..utils import compute_compatible_and_update_dim_mapping +from .dist_default import DistributedDefaultImpl0 + + +class DistributedSlice(DistributedOperatorImplContainer): + def __init__(self, op_type): + super(DistributedSlice, self).__init__(op_type) + + +register_distributed_operator_impl_container(DistributedSlice("slice")) + + +class DistributedSliceImpl(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedSliceImpl, self).__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + in_name = op_desc.input('Input')[0] + axes = op_desc.attr('axes') + in_dims_mapping = op_dist_attr.get_input_dims_mapping(in_name) + for axis in axes: + if is_dim_shard(in_dims_mapping[axis]): + return False + return True + + def is_output_compatible(self, dist_op): + return True + + def is_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)): + return False + + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + in_name = op_desc.input('Input')[0] + out_name = op_desc.output('Out')[0] + decrease_axis = op_desc.attr('decrease_axis') + in_dims_mapping = op_dist_attr.get_input_dims_mapping(in_name) + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + if len(in_dims_mapping) - len(decrease_axis) != 0 and len( + out_dims_mapping) != len(in_dims_mapping) - len(decrease_axis): + return False + + new_out_dims_mapping = [] + for i in range(len(in_dims_mapping)): + if i not in decrease_axis: + new_out_dims_mapping.append(in_dims_mapping[i]) + if new_out_dims_mapping == []: + new_out_dims_mapping = [-1] + if new_out_dims_mapping != out_dims_mapping: + return False + + return True + + def is_auto_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)) or \ + (not self.is_compatible(dist_op)): + return False + + return True + + def update_dims_mapping(self, dist_op): + changed = False + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + in_name = op_desc.input('Input')[0] + out_name = op_desc.output('Out')[0] + decrease_axis = op_desc.attr('decrease_axis') + in_dims_mapping = op_dist_attr.get_input_dims_mapping(in_name) + 
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + + ref_dims_mapping = [] + for i in range(len(in_dims_mapping)): + if i not in decrease_axis: + ref_dims_mapping.append(in_dims_mapping[i]) + if ref_dims_mapping == []: + ref_dims_mapping = [-1] + + assert len(ref_dims_mapping) == len(out_dims_mapping) + for i in range(len(out_dims_mapping)): + if out_dims_mapping[i] != ref_dims_mapping[i]: + out_dims_mapping[i] = ref_dims_mapping[i] + changed = True + + return changed + + @staticmethod + def forward(ctx, *args, **kwargs): + DistributedDefaultImpl0.forward(ctx, *args, **kwargs) + + @staticmethod + def backward(ctx, *args, **kwargs): + DistributedDefaultImpl0.backward(ctx, *args, **kwargs) + + +register_distributed_operator_impl("slice", + DistributedSliceImpl("decrease_in_axis")) diff --git a/python/paddle/distributed/auto_parallel/tuner/recorder.py b/python/paddle/distributed/auto_parallel/tuner/recorder.py index 140336566a146..d0f181a635413 100644 --- a/python/paddle/distributed/auto_parallel/tuner/recorder.py +++ b/python/paddle/distributed/auto_parallel/tuner/recorder.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Notice that the following codes are modified from KerasTuner for a different purpose. +# Please refer to https://github.com/keras-team/keras-tuner/blob/master/keras_tuner/engine/metrics_tracking.py. + import numpy as np diff --git a/python/paddle/distributed/auto_parallel/tuner/storable.py b/python/paddle/distributed/auto_parallel/tuner/storable.py index d61e53a027240..63e5eba77f15c 100644 --- a/python/paddle/distributed/auto_parallel/tuner/storable.py +++ b/python/paddle/distributed/auto_parallel/tuner/storable.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Notice that the following codes are modified from KerasTuner for a different purpose. +# Please refer to https://github.com/keras-team/keras-tuner/blob/master/keras_tuner/engine/metrics_tracking.py. + import json diff --git a/python/paddle/distributed/auto_parallel/tuner/trial.py b/python/paddle/distributed/auto_parallel/tuner/trial.py index 22a6638c5ca63..1cda82f1edec9 100644 --- a/python/paddle/distributed/auto_parallel/tuner/trial.py +++ b/python/paddle/distributed/auto_parallel/tuner/trial.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Notice that the following codes are modified from KerasTuner to implement our own tuner. +# Please refer to https://github.com/keras-team/keras-tuner/blob/master/keras_tuner/engine/trial.py. + import hashlib import random import time diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_space.py b/python/paddle/distributed/auto_parallel/tuner/tunable_space.py index f63364c5b75ef..2838a01958433 100644 --- a/python/paddle/distributed/auto_parallel/tuner/tunable_space.py +++ b/python/paddle/distributed/auto_parallel/tuner/tunable_space.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Notice that the following codes are modified from KerasTuner to implement our own tuner. +# Please refer to https://github.com/keras-team/keras-tuner/blob/master/keras_tuner/engine/hyperparameters.py. 
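# Editor's note: a minimal, self-contained sketch of the "tunable space" idea that the
# KerasTuner-derived tuner modules above (recorder, storable, trial, tunable_space,
# tunable_variable) implement: named variables with a default value and a way to sample
# candidates. The class and method names below are illustrative stand-ins only and are
# not the actual auto_parallel tuner API.
import random


class _ChoiceSketch:
    def __init__(self, name, values, default=None):
        self.name = name
        self.values = list(values)
        self.default = default if default is not None else self.values[0]

    def random_sample(self, seed=None):
        # draw one candidate value, optionally reproducibly
        return random.Random(seed).choice(self.values)


class _TunableSpaceSketch:
    def __init__(self):
        self.variables = {}  # name -> tunable variable
        self.values = {}     # name -> currently chosen value

    def choice(self, name, values, default=None):
        # register the variable on first use, then return its current value
        if name not in self.variables:
            self.variables[name] = _ChoiceSketch(name, values, default)
            self.values[name] = self.variables[name].default
        return self.values[name]


# Usage sketch: a tuner would repeatedly sample such a space and record each trial.
_space = _TunableSpaceSketch()
_micro_batch_size = _space.choice("micro_batch_size", [1, 2, 4, 8], default=4)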
+ import collections import contextlib import copy diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py b/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py index 9549b44c48ecb..19f118fdde77a 100644 --- a/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py +++ b/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Notice that the following codes are modified from KerasTuner to implement our own tuner. +# Please refer to https://github.com/keras-team/keras-tuner/blob/master/keras_tuner/engine/hyperparameters.py. + import numpy as np diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 993b45b4eecf9..b92b2a3c15dec 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -228,14 +228,23 @@ def _new_process_group_impl(backend, pg_options, group_id=0): pg = None + genv = _get_global_env() assert backend in _valid_backend_list, "Unsupported backend: %s." % backend if backend == "gloo": - pg = core.ProcessGroupGloo(store, rank, world_size, group_id) + place = core.CPUPlace() + pg = core.ProcessGroupGloo(store, rank, world_size, place, group_id) elif backend == "nccl": - pg = core.ProcessGroupNCCL(store, rank, world_size, group_id) + place = core.CUDAPlace(genv.device_id) + pg = core.ProcessGroupNCCL(store, rank, world_size, place, group_id) elif backend == "hccl": - pg = core.ProcessGroupHCCL(store, rank, world_size, group_id) + place = core.NPUPlace(genv.device_id) + pg = core.ProcessGroupHCCL(store, rank, world_size, place, group_id) elif backend == "heter": + place = None + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(genv.device_id) + elif core.is_compiled_with_npu(): + place = core.NPUPlace(genv.device_id) cluster_id = int(os.getenv("CLUSTER_ID", "-1")) assert cluster_id >= 0, "please set the CLUSTER_ID variable." 
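# Editor's note: a hedged sketch of the backend -> device place selection that the
# collective.py change above introduces in _new_process_group_impl: each communication
# backend now binds its process group to an explicit local place. This mirrors the logic
# in the patch for illustration only; NPU/CUDA places are only available in builds
# compiled with the corresponding device support.
import paddle.fluid.core as core


def _pick_place_for_backend(backend, device_id):
    if backend == "gloo":
        return core.CPUPlace()
    if backend == "nccl":
        return core.CUDAPlace(device_id)
    if backend == "hccl":
        return core.NPUPlace(device_id)
    # the "heter" backend falls back to whatever device the build supports
    if core.is_compiled_with_cuda():
        return core.CUDAPlace(device_id)
    if core.is_compiled_with_npu():
        return core.NPUPlace(device_id)
    return None


# e.g. the gloo backend always runs on CPU
_place = _pick_place_for_backend("gloo", device_id=0)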
cluster_size = os.getenv("CLUSTER_SIZE", None) @@ -253,6 +262,7 @@ def _new_process_group_impl(backend, store, rank=global_rank, world_size=global_world_size, + place=place, gid=0, local_rank=rank, local_size=world_size, @@ -850,9 +860,12 @@ def all_gather(tensor_list, tensor, group=None, use_calc_stream=True): if in_dygraph_mode(): group = _get_default_group() if group is None else group - tensor_shape = list(tensor.shape) - tensor_shape[0] *= group.nranks - out = paddle.empty(tensor_shape, tensor.dtype) + if len(tensor_list) == 0: + tensor_shape = list(tensor.shape) + tensor_shape[0] *= group.nranks + out = paddle.empty(tensor_shape, tensor.dtype) + else: + out = paddle.concat(tensor_list, axis=0) task = group.process_group.all_gather(tensor, out) task.wait() tensor_list.clear() @@ -1773,7 +1786,12 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, use_calc_stream=True): temp = paddle.concat(in_tensor_list, axis=0) nranks = len(in_tensor_list) if in_dygraph_mode(): - out = paddle.concat(out_tensor_list, axis=0) + if len(out_tensor_list) == 0: + tensor_shape = list(in_tensor_list[0].shape) + tensor_shape[0] *= nranks + out = paddle.empty(tensor_shape, in_tensor_list[0].dtype) + else: + out = paddle.concat(out_tensor_list, axis=0) task = group.process_group.alltoall(temp, out) task.wait() out_tensor_list.clear() diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index c46b6eeb048a0..9d20e432d8961 100644 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -611,12 +611,15 @@ def set_sparse_table_config(table_data, config): "DownpourCtrAccessor") if accessor_class not in support_sparse_accessor_class: raise ValueError( - "support sparse_accessor_class: [''DownpourSparseValueAccessor', 'DownpourCtrAccessor', 'DownpourCtrDoubleAccessor', 'DownpourUnitAccessor', 'DownpourDoubleUnitAccessor'], but actual %s" + "support sparse_accessor_class: ['DownpourSparseValueAccessor', 'DownpourCtrAccessor', 'DownpourCtrDoubleAccessor', 'DownpourUnitAccessor', 'DownpourDoubleUnitAccessor'], but actual %s" % (accessor_class)) - if configs.get("use_cvm", True): - table_data.accessor.accessor_class = 'CtrCommonAccessor' + if accessor_class.find("Double") >= 0: + table_data.accessor.accessor_class = 'CtrDoubleAccessor' else: + table_data.accessor.accessor_class = 'CtrCommonAccessor' + + if not configs.get("use_cvm", True): table_data.accessor.accessor_class = 'SparseAccessor' table_data.accessor.embedx_dim = config.get('sparse_embedx_dim', 8) @@ -624,6 +627,11 @@ def set_sparse_table_config(table_data, config): table_data.accessor.embedx_threshold = config.get( 'sparse_embedx_threshold', 10) + if accessor_class == 'DownpourUnitAccessor': + table_data.accessor.ctr_accessor_param.show_scale = False + else: + table_data.accessor.ctr_accessor_param.show_scale = True + table_data.accessor.ctr_accessor_param.nonclk_coeff = config.get( 'sparse_nonclk_coeff', 0.1) table_data.accessor.ctr_accessor_param.click_coeff = config.get( diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py index 9920bbd400c70..a39b77303757a 100755 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py @@ -12,6 +12,32 @@ # See the License for the specific 
language governing permissions and # limitations under the License. +# The file has been adapted from the file: +# https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/pipe/module.py +# Git commit hash: fafc827d643b3eed611e282d909025f16be36601 +# We retain the following license from the original files: +# MIT License + +# Copyright (c) Microsoft Corporation. + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE + import math import re import glob @@ -24,6 +50,7 @@ from paddle.fluid.dygraph.layers import Layer from ...utils.log_util import logger, layer_to_str from ..pp_utils.utils import _hp_recompute, _initialize_recompute_setting +from paddle.fluid.framework import in_dygraph_mode __all__ = [] @@ -269,15 +296,20 @@ def allreduce_shared_weight_gradients(self): for key, comm in self.shared_comm.items(): param = getattr(self.shared_layers[key], comm['weight_attr']) # need use trace_op to allreduce weight - with paddle.framework.no_grad(): - paddle.fluid.framework._dygraph_tracer().trace_op( - type="c_allreduce_sum", - inputs={'X': param._grad_ivar()}, - outputs={'Out': param._grad_ivar()}, - attrs={ - 'ring_id': comm['group'].id, - 'use_calc_stream': True - }) + if in_dygraph_mode(): + with paddle.framework.no_grad(): + paddle.distributed.all_reduce( + param.grad, group=comm['group']) + else: + with paddle.framework.no_grad(): + paddle.fluid.framework._dygraph_tracer().trace_op( + type="c_allreduce_sum", + inputs={'X': param._grad_ivar()}, + outputs={'Out': param._grad_ivar()}, + attrs={ + 'ring_id': comm['group'].id, + 'use_calc_stream': True + }) def _segment_network(self, seg_method): logger.info("start segment network..") diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 7c7637a90fec0..d2171920f2bb6 100755 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -23,6 +23,7 @@ from ..utils.log_util import logger from ..meta_optimizers.dygraph_optimizer import HybridParallelOptimizer, HybridParallelGradScaler from .pp_utils import p2p_communication as p2p +import paddle.fluid.core as core __all__ = [] @@ -238,9 +239,9 @@ def _forward_step(self, input_tensor): assert self._layers._loss_fn is not None, "loss function should exist to compute loss" labels = self._load_micro_batch(self.micro_batch_id) output_tensor = self._layers._loss_fn(output_tensor, labels) - assert isinstance( - 
output_tensor, paddle.Tensor - ), "Currently, loss_fn should obtain Paddle.Tensor dtype" + assert isinstance(output_tensor, ( + paddle.Tensor, core.eager.Tensor + )), "Currently, loss_fn should obtain Paddle.Tensor dtype" with paddle.amp.auto_cast(enable=False): if self.accumulate_steps > 1: @@ -254,31 +255,33 @@ def _forward_step(self, input_tensor): return output_tensor def _backward_step(self, input_tensor, output_tensor, output_tensor_grad): - if self.is_last_stage: - assert output_tensor_grad is None - if self.scaler: - paddle.autograd.backward(self.scaler.scale(output_tensor)) - else: - paddle.autograd.backward(output_tensor) - else: - if isinstance(output_tensor, tuple): - outputs = [t for t in output_tensor if not t.stop_gradient] - assert len(outputs) == len(output_tensor_grad) - paddle.autograd.backward( - tensors=outputs, - grad_tensors=[t for t in output_tensor_grad]) - else: - paddle.autograd.backward( - tensors=[output_tensor], grad_tensors=[output_tensor_grad]) - - input_tensor_grad = None - if input_tensor is not None: - if isinstance(input_tensor, tuple): - input_tensor_grad = tuple( - [t.grad for t in input_tensor if not t.stop_gradient]) + with paddle.amp.auto_cast(enable=False): + if self.is_last_stage: + assert output_tensor_grad is None + if self.scaler: + paddle.autograd.backward(self.scaler.scale(output_tensor)) + else: + paddle.autograd.backward(output_tensor) else: - input_tensor_grad = input_tensor.grad - return input_tensor_grad + if isinstance(output_tensor, tuple): + outputs = [t for t in output_tensor if not t.stop_gradient] + assert len(outputs) == len(output_tensor_grad) + paddle.autograd.backward( + tensors=outputs, + grad_tensors=[t for t in output_tensor_grad]) + else: + paddle.autograd.backward( + tensors=[output_tensor], + grad_tensors=[output_tensor_grad]) + + input_tensor_grad = None + if input_tensor is not None: + if isinstance(input_tensor, tuple): + input_tensor_grad = tuple( + [t.grad for t in input_tensor if not t.stop_gradient]) + else: + input_tensor_grad = input_tensor.grad + return input_tensor_grad def _load_micro_batch(self, cache_id): inputs = self.data diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py index e2c99edac1270..b6698a200e945 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py @@ -17,6 +17,7 @@ from ...utils.log_util import logger import numpy as np from paddle import _C_ops +import paddle.fluid.core as core _hcg = None _use_cache = False @@ -114,7 +115,7 @@ def _send_dims_shape_dtype(self, tensor, group): paddle.distributed.send(stop_grad, dst=1, group=group) def send_meta(self, tensor, group): - if isinstance(tensor, paddle.Tensor): + if isinstance(tensor, (paddle.Tensor, core.eager.Tensor)): tensor_type = paddle.to_tensor([0]) # send tensor type paddle.distributed.send(tensor_type, dst=1, group=group) @@ -129,11 +130,11 @@ def send_meta(self, tensor, group): paddle.distributed.send(nums, dst=1, group=group) for d in tensor: - assert isinstance(d, paddle.Tensor) + assert isinstance(d, (paddle.Tensor, core.eager.Tensor)) self._send_dims_shape_dtype(d, group=group) def set_send_message(self, tensor): - if isinstance(tensor, paddle.Tensor): + if isinstance(tensor, (paddle.Tensor, core.eager.Tensor)): self.send_shape_message = tensor.shape self.send_dtype_message = paddle_2_number(tensor.dtype) 
elif isinstance(tensor, tuple): diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py index 7224ba6dedda0..59bcf50ffb798 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py @@ -17,10 +17,11 @@ import paddle from paddle.fluid import core from paddle import _C_ops -from paddle.autograd import PyLayer +from paddle.autograd import PyLayer, EagerPyLayer from paddle.fluid import framework from ...utils.recompute import check_recompute_necessary, detach_variable from ..parallel_layers.random import get_rng_state_tracker +from paddle.fluid.framework import in_dygraph_mode __all__ = [] @@ -164,6 +165,138 @@ def _swith_rng_state_tracker(rng_state, tracker): get_rng_state_tracker().set_states_tracker(orig_cuda_rng_tracker) +class _HPEagerRecomputeFunction(EagerPyLayer): + """ + Compared with paddle.distributed.fleet.utils.recompute, there are the following differences: + 1. In order to support PipeLineParallel, the input of recompute is modified to ensure that the input can be tuple type. + 2. Offload support for activation + 3. Support MP segmentation of activation to further reduce cuda memory + 4. Adapt to the random state of MP + """ + + @staticmethod + def forward(ctx, run_function, all_outputs, *args): + check_recompute_necessary(args) + + # store for recomputing + ctx.run_function = run_function + + # store the rng states + ctx.fwd_cuda_rng_state = paddle.get_cuda_rng_state() + ctx.fwd_cuda_rng_state_tracker = get_rng_state_tracker( + ).get_states_tracker() + + # save input for backward + ctx.inputs = [] + ctx.tensor_indices = [] + ctx.tensor_shapes = [] + tensor_inputs = [] + + cur_device = paddle.get_device() + assert 'gpu:' in paddle.get_device( + ), "Recompute with RNG is not support current device: {}.".format( + cur_device) + + # TODO support AMP + tracer = framework._dygraph_tracer() + ctx.is_fw_autocast = False if tracer._amp_level == core.AmpLevel.O0 else True + if tracer._amp_level == core.AmpLevel.O2: + ctx.amp_level = 'O2' + elif tracer._amp_level in (core.AmpLevel.O1, core.AmpLevel.O0): + ctx.amp_level = 'O1' + else: + raise ValueError("unsupported amp level: {}".format( + tracer._amp_level)) + ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list() + + with paddle.no_grad(): + outputs = run_function(*args) + + for i, arg in enumerate(args): + if paddle.is_tensor(arg): + state = arg.stop_gradient + if _recompute_partition: + ctx.tensor_shapes.append(arg.shape) + partition = _split_activation(arg.detach()).clone() + # TODO(shenliang03) not use calculate stream to D2H to speed + arg = partition.cpu() if _recompute_offload else partition + else: + arg = arg.cpu() if _recompute_offload else arg + arg.stop_gradient = state + tensor_inputs.append(arg) + ctx.tensor_indices.append(i) + ctx.inputs.append(None) + else: + ctx.inputs.append(arg) + + ctx.save_for_backward(*tensor_inputs) + + if paddle.is_tensor(outputs): + all_outputs += [outputs] + return outputs + else: + all_outputs += outputs + return tuple(outputs) + + @staticmethod + def backward(ctx, *args): + with paddle.fluid.dygraph.guard(): + # Restore inputs + inputs = list(ctx.inputs) + tensor_indices = ctx.tensor_indices + tensor_shapes = ctx.tensor_shapes + tensors = list(ctx.saved_tensor()) + + device_id = paddle.distributed.ParallelEnv().device_id + for i, idx in enumerate(tensor_indices): + if _recompute_partition: + state = 
tensors[i].stop_gradient + tensors[i] = _merge_activation(tensors[i]).detach( + ).reshape_(tensor_shapes[i]) + tensors[i].stop_gradient = state + inputs[idx] = tensors[i].cuda( + device_id) if _recompute_offload else tensors[i] + + tracer = framework._dygraph_tracer() + tracer._has_grad = True + + # need restore auto_cast state as well as w/b list + with _swith_rng_state_tracker(ctx.fwd_cuda_rng_state, + ctx.fwd_cuda_rng_state_tracker): + with paddle.amp.auto_cast( + enable=ctx.is_fw_autocast, + custom_white_list=ctx.amp_white_list, + custom_black_list=ctx.amp_black_list, + level=ctx.amp_level): + detached_inputs = detach_variable(tuple(inputs)) + outputs = ctx.run_function(*detached_inputs) + + if isinstance(outputs, core.eager.Tensor): + outputs = (outputs, ) + assert len(outputs) == len(args) + + forward_outputs_with_grad = [] + backward_inputs = [] + + for i in range(len(outputs)): + if isinstance( + outputs[i], + core.eager.Tensor) and not outputs[i].stop_gradient: + forward_outputs_with_grad.append(outputs[i]) + backward_inputs.append(args[i]) + + if len(forward_outputs_with_grad) == 0: + raise RuntimeError( + "none of output has stop_gradient=False, this recompute() is not necessary" + ) + + # actually backward + paddle.autograd.backward(forward_outputs_with_grad, backward_inputs) + grads = tuple(inp._grad_ivar() for inp in detached_inputs + if isinstance(inp, core.eager.Tensor)) + return grads + + class _HPRecomputeFunction(PyLayer): """ Compared with paddle.distributed.fleet.utils.recompute, there are the following differences: @@ -290,8 +423,8 @@ def backward(ctx, *args): # actually backward paddle.autograd.backward(forward_outputs_with_grad, backward_inputs) - grads = list(inp._grad_ivar() for inp in detached_inputs - if isinstance(inp, core.VarBase)) + grads = tuple(inp._grad_ivar() for inp in detached_inputs + if isinstance(inp, core.VarBase)) return grads @@ -303,7 +436,10 @@ def _hp_recompute(function, *args): # 3. 
Here, we only use float dtype to distinguish whether a gradient is needed in output tensor all_outputs = [] - _HPRecomputeFunction.apply(function, all_outputs, *args) + if in_dygraph_mode(): + _HPEagerRecomputeFunction.apply(function, all_outputs, *args) + else: + _HPRecomputeFunction.apply(function, all_outputs, *args) if len(all_outputs) == 1: return all_outputs[0] diff --git a/python/paddle/distributed/launch/context/__init__.py b/python/paddle/distributed/launch/context/__init__.py index e03d832767e6f..08c8f0835c5e1 100644 --- a/python/paddle/distributed/launch/context/__init__.py +++ b/python/paddle/distributed/launch/context/__init__.py @@ -51,7 +51,6 @@ def is_legacy_mode(self): legacy_env_list = [ 'DISTRIBUTED_TRAINER_ENDPOINTS', 'PADDLE_ELASTIC_JOB_ID', - 'PADDLE_DISTRI_BACKEND', 'FLAGS_START_PORT', ] diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py index 9527ae35c4b6b..69b2237f0ba7d 100644 --- a/python/paddle/distributed/launch/controllers/controller.py +++ b/python/paddle/distributed/launch/controllers/controller.py @@ -128,14 +128,15 @@ def signal_handler(self, sigint, frame): self.ctx.logger.info("Terminating with signal {}".format(sigint)) if hasattr(self, 'sigint'): - time.sleep(5) + self.ctx.logger.info("Force quit in 10 seconds...") + time.sleep(11) sys.exit(sigint) self.sigint = sigint self.ctx.status.done() self.stop(sigint) time.sleep(1) - self.ctx.logger.debug("Exit with signal {}".format(sigint)) + self.ctx.logger.info("Exit with signal {}".format(sigint)) sys.exit(sigint) diff --git a/python/paddle/distributed/passes/auto_parallel_amp.py b/python/paddle/distributed/passes/auto_parallel_amp.py index 5fdd88ac1de8a..fe94c25e12d2d 100644 --- a/python/paddle/distributed/passes/auto_parallel_amp.py +++ b/python/paddle/distributed/passes/auto_parallel_amp.py @@ -487,6 +487,7 @@ def __init__(self): self.set_attr("incr_ratio", 2.0) self.set_attr("decr_ratio", 0.8) self.set_attr("use_dynamic_loss_scaling", False) + self.set_attr("input_data", []) self.set_attr("params_grads", []) self._loss_scaling = None self._num_good_steps = None diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py index 725b4459d7d21..69c3eef7e3771 100644 --- a/python/paddle/distributed/passes/auto_parallel_fp16.py +++ b/python/paddle/distributed/passes/auto_parallel_fp16.py @@ -95,12 +95,21 @@ def _keep_fp32_output(op, out_name): class FP16State(object): - def __init__(self, program, amp_list, dist_context, use_fp16_guard): + def __init__(self, + program, + amp_list, + dist_context, + use_fp16_guard, + input_data_var_names=None): self.program = program self.amp_list = amp_list self.use_fp16_guard = use_fp16_guard self.dist_context = dist_context self.grad_op_to_op_map = self.dist_context.dist_op_context.grad_op_id_to_op_id + if input_data_var_names: + self.input_data_var_names = input_data_var_names + else: + self.input_data_var_names = [] self._op_fp16_dict = { } # op_id --> True/False. 'True' means that the op is should run in fp16 mode. 
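# Editor's note: a hedged sketch of the dtype-resolution rule the FP16 pass change above
# applies: a forward input is cast to fp16 only if it is a leaf tensor that is not fed by
# the dataloader (the pass must not change input-data dtypes it does not control). The
# function and variable names below are illustrative; the real pass operates on program
# desc objects rather than plain strings.
def _should_cast_to_fp16(var_name, forward_non_leaf_tensors, input_data_var_names):
    if var_name in forward_non_leaf_tensors:
        # produced by another op in this program; its dtype follows its producer
        return False
    if var_name in input_data_var_names:
        # fed by the dataloader, whose dtype is outside the pass's control
        return False
    return True


# e.g. feed vars stay fp32 while parameter-like leaf tensors are fp16 candidates
assert _should_cast_to_fp16("linear_0.w_0", {}, ["input_ids", "labels"])
assert not _should_cast_to_fp16("input_ids", {}, ["input_ids", "labels"])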
# a trick to determine leaf tensor node in program {varname: generator_op_id} @@ -191,7 +200,7 @@ def resolute_tensor_dtype(self, block): if _keep_fp32_input(op, in_name): continue for in_var_name in op.input(in_name): - if in_var_name not in self.forward_non_leaf_tensors: + if in_var_name not in self.forward_non_leaf_tensors and in_var_name not in self.input_data_var_names: self.set_var_to_fp16(in_var_name, block) for out_name in op.output_names: if _keep_fp32_output(op, out_name): @@ -498,10 +507,14 @@ def _apply_single_impl(self, main_program, startup_program, context): set(self.get_attr("custom_white_list")), set(self.get_attr("custom_black_list")), None) - # TODO support multiple blocks + # NOTE don't not change input data dtype, since it is controled by dataloader + # and which is out of control of FP16 Pass + input_data_var_names = [var.name for var in self.get_attr("input_data")] + with paddle.static.program_guard(main_program, startup_program): fp16_state = FP16State(main_program, amp_list, self.dist_context, - self.get_attr("use_fp16_guard")) + self.get_attr("use_fp16_guard"), + input_data_var_names) is_train = fp16_state._build_state() if is_train: diff --git a/python/paddle/distribution/transform.py b/python/paddle/distribution/transform.py index dd0e63f048ea5..31b1dedbc5fb3 100644 --- a/python/paddle/distribution/transform.py +++ b/python/paddle/distribution/transform.py @@ -113,7 +113,7 @@ class Transform(object): * _forward_shape * _inverse_shape - + """ _type = Type.INJECTION @@ -669,7 +669,7 @@ class IndependentTransform(Transform): base (Transform): The base transformation. reinterpreted_batch_rank (int): The num of rightmost batch rank that will be reinterpreted as event rank. - + Examples: .. code-block:: python @@ -743,7 +743,7 @@ class PowerTransform(Transform): Args: power (Tensor): The power parameter. - + Examples: .. code-block:: python @@ -1017,7 +1017,7 @@ class StackTransform(Transform): Examples: .. 
code-block:: python - + import paddle @@ -1141,7 +1141,8 @@ def _forward(self, x): offset = x.shape[-1] + 1 - paddle.ones([x.shape[-1]]).cumsum(-1) z = F.sigmoid(x - offset.log()) z_cumprod = (1 - z).cumprod(-1) - return F.pad(z, [0, 1], value=1) * F.pad(z_cumprod, [1, 0], value=1) + return F.pad(z, [0]*2*(len(x.shape)-1) + [0, 1], value=1) * \ + F.pad(z_cumprod, [0]*2*(len(x.shape)-1) + [1, 0], value=1) def _inverse(self, y): y_crop = y[..., :-1] diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index ba7692b442f82..adce805195960 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -2054,7 +2054,7 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None): y = paddle.static.nn.conv2d(x, 4, 1, bias_attr=False) y = F.relu(y) z = paddle.static.gradients([y], x) - print(z) # [var x@GRAD : fluid.VarType.LOD_TENSOR.shape(-1L, 2L, 8L, 8L).astype(VarType.FP32)] + print(z) # [var x@GRAD : LOD_TENSOR.shape(-1, 2, 8, 8).dtype(float32).stop_gradient(False)] """ check_type(targets, 'targets', (framework.Variable, list, tuple), 'paddle.static.gradients') diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 826deae498cb5..0ba980c3e9233 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -543,7 +543,7 @@ def _dygraph_clip(self, params_grads): clip_input = (clip_var.astype('float16') if g.dtype == core.VarDesc.VarType.FP16 else clip_var) - new_grad = layers.elementwise_mul(x=g, y=clip_input) + new_grad = _C_ops.elementwise_mul(g, clip_input) params_and_grads.append((p, new_grad)) else: params_and_grads.append((p, g)) diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 193025b1864ab..41c1a0aa5808e 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -36,7 +36,7 @@ from paddle.fluid import framework from ..param_attr import ParamAttr from paddle.fluid.executor import Executor, global_scope -from paddle.fluid.framework import _non_static_mode, convert_np_dtype_to_dtype_ +from paddle.fluid.framework import _non_static_mode, convert_np_dtype_to_dtype_, in_dygraph_mode from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.core import VarDesc from paddle.fluid.dygraph import no_grad @@ -918,7 +918,12 @@ def _dygraph_call_func(self, *inputs, **kwargs): return outputs def __call__(self, *inputs, **kwargs): - return self._dygraph_call_func(*inputs, **kwargs) + if (not in_declarative_mode()) and (not self._forward_pre_hooks) \ + and (not self._forward_post_hooks) and (not self._built) and in_dygraph_mode(): + self._build_once(*inputs, **kwargs) + return self.forward(*inputs, **kwargs) + else: + return self._dygraph_call_func(*inputs, **kwargs) def forward(self, *inputs, **kwargs): """ diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index 5b305325f3d2d..8ce56d5a92686 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -62,15 +62,6 @@ _already_patch_varbase = False _already_patch_eager_tensor = False -# Dispatch to final state Python-C functions -_final_state_op_type_mapping = { - "elementwise_add": "final_state_add", - "elementwise_sub": "final_state_subtract", - "elementwise_div": "final_state_divide", - "elementwise_mul": "final_state_multiply", - "matmul_v2": "final_state_matmul", -} - def monkey_patch_math_varbase(): """ @@ -80,9 +71,13 @@ def 
monkey_patch_math_varbase(): @no_grad def create_tensor(value, dtype, shape): - out = _varbase_creator(dtype=dtype) - out = _C_ops.fill_constant(out, 'dtype', dtype, 'shape', shape, 'value', - value, 'force_cpu', False) + if framework._in_eager_mode_: + out = _C_ops.final_state_full(shape, value, dtype, + framework._current_expected_place()) + else: + out = _varbase_creator(dtype=dtype) + out = _C_ops.fill_constant(out, 'dtype', dtype, 'shape', shape, + 'value', value, 'force_cpu', False) out.stop_gradient = True return out @@ -120,9 +115,9 @@ def astype(self, dtype): return _C_ops.final_state_cast(self, dtype) def _scalar_elementwise_op_(var, scale, bias): - if _in_legacy_dygraph(): - return _C_ops.scale(var, 'scale', scale, 'bias', bias) - return _C_ops.final_state_scale(var, float(scale), bias, True) + if framework.in_dygraph_mode(): + return _C_ops.final_state_scale(var, float(scale), bias, True) + return _C_ops.scale(var, 'scale', scale, 'bias', bias) def _neg_(var): return _scalar_elementwise_op_(var, -1.0, 0.0) @@ -203,7 +198,8 @@ def _scalar_div_(var, value): def _binary_creator_(method_name, op_type, reverse=False, - scalar_method=None): + scalar_method=None, + call_final_api=False): def __impl__(self, other_var): # 1. scalar exists cases # we need combine the tensor.dtype and scalar.dtype, cast correct object @@ -287,15 +283,15 @@ def __impl__(self, other_var): # 4. calculation axis = -1 - if in_dygraph_mode( - ) and op_type in _final_state_op_type_mapping.keys(): - math_op = getattr(_C_ops, _final_state_op_type_mapping[op_type]) - return math_op(self, other_var) - else: - math_op = getattr(_C_ops, op_type) - return math_op(self, other_var, 'axis', axis) + math_op = getattr(_C_ops, op_type) + if call_final_api: + return math_op(self, other_var, -1) + return math_op(self, other_var, 'axis', axis) - comment = OpProtoHolder.instance().get_op_proto(op_type).comment + if call_final_api: + comment = "" + else: + comment = OpProtoHolder.instance().get_op_proto(op_type).comment __impl__.__doc__ = """ {0} @@ -321,28 +317,48 @@ def __impl__(self, other_var): ('ndim', _ndim_), ('size', _size_), ('T', _T_), - ('__add__', - _binary_creator_('__add__', 'elementwise_add', False, _scalar_add_)), + ('__add__', _binary_creator_('__add__', 'final_state_add', False, + _scalar_add_, True)) + if framework._in_eager_mode_ else ('__add__', _binary_creator_( + '__add__', 'elementwise_add', False, _scalar_add_)), ## a+b == b+a. 
Do not need to reverse explicitly - ('__radd__', - _binary_creator_('__radd__', 'elementwise_add', False, _scalar_add_)), - ('__sub__', _binary_creator_('__sub__', 'elementwise_sub', False, - _scalar_sub_)), - ('__rsub__', _binary_creator_('__rsub__', 'elementwise_sub', True, - _scalar_rsub_)), - ('__mul__', _binary_creator_('__mul__', 'elementwise_mul', False, - _scalar_mul_)), + ('__radd__', _binary_creator_('__radd__', 'final_state_add', False, + _scalar_add_, True)) + if framework._in_eager_mode_ else ('__radd__', _binary_creator_( + '__radd__', 'elementwise_add', False, _scalar_add_)), + ('__sub__', _binary_creator_('__sub__', 'final_state_subtract', False, + _scalar_sub_, True)) + if framework._in_eager_mode_ else ('__sub__', _binary_creator_( + '__sub__', 'elementwise_sub', False, _scalar_sub_)), + ('__rsub__', _binary_creator_('__rsub__', 'final_state_subtract', True, + _scalar_rsub_, True)) + if framework._in_eager_mode_ else ('__rsub__', _binary_creator_( + '__rsub__', 'elementwise_sub', True, _scalar_rsub_)), + ('__mul__', _binary_creator_('__mul__', 'final_state_multiply', False, + _scalar_mul_, True)) + if framework._in_eager_mode_ else ('__mul__', _binary_creator_( + '__mul__', 'elementwise_mul', False, _scalar_mul_)), ## a*b == b*a. Do not need to reverse explicitly - ('__rmul__', - _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)), - ('__div__', _binary_creator_('__div__', 'elementwise_div', False, - _scalar_div_)), - ('__truediv__', _binary_creator_('__truediv__', 'elementwise_div', - False, _scalar_div_)), - ('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True, - None)), - ('__rtruediv__', _binary_creator_('rtruediv__', 'elementwise_div', True, - None)), + ('__rmul__', _binary_creator_('__rmul__', 'final_state_multiply', False, + _scalar_mul_, True)) + if framework._in_eager_mode_ else ('__rmul__', _binary_creator_( + '__rmul__', 'elementwise_mul', False, _scalar_mul_)), + ('__div__', _binary_creator_('__div__', 'final_state_divide', False, + _scalar_div_, True)) + if framework._in_eager_mode_ else ('__div__', _binary_creator_( + '__div__', 'elementwise_div', False, _scalar_div_)), + ('__truediv__', _binary_creator_('__truediv__', 'final_state_divide', + False, _scalar_div_, True)) + if framework._in_eager_mode_ else ('__truediv__', _binary_creator_( + '__truediv__', 'elementwise_div', False, _scalar_div_)), + ('__rdiv__', _binary_creator_('__rdiv__', 'final_state_divide', True, + None, True)) if framework._in_eager_mode_ + else ('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True, + None)), + ('__rtruediv__', _binary_creator_('rtruediv__', 'final_state_divide', + True, None, True)) + if framework._in_eager_mode_ else ('__rtruediv__', _binary_creator_( + 'rtruediv__', 'elementwise_div', True, None)), ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False, None)), ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True, @@ -354,11 +370,26 @@ def __impl__(self, other_var): ('__matmul__', _binary_creator_('__matmul__', "matmul_v2", False, None)), ## for logical compare + ('__eq__', + _binary_creator_('__eq__', 'final_state_equal', False, None, True)) + if framework._in_eager_mode_ else ('__eq__', _binary_creator_('__eq__', 'equal', False, None)), - ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)), - ('__lt__', _binary_creator_('__lt__', 'less_than', False, None)), - ('__le__', _binary_creator_('__le__', 'less_equal', False, None)), + ('__ne__', _binary_creator_('__ne__', 
'final_state_not_equal', False, + None, True)) if framework._in_eager_mode_ + else ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)), + ('__lt__', _binary_creator_('__lt__', 'final_state_less_than', False, + None, True)) if framework._in_eager_mode_ + else ('__lt__', _binary_creator_('__lt__', 'less_than', False, None)), + ('__le__', _binary_creator_('__le__', 'final_state_less_equal', False, + None, True)) if framework._in_eager_mode_ + else ('__le__', _binary_creator_('__le__', 'less_equal', False, None)), + ('__gt__', _binary_creator_('__gt__', 'final_state_greater_than', False, + None, True)) + if framework._in_eager_mode_ else ('__gt__', _binary_creator_('__gt__', 'greater_than', False, None)), + ('__ge__', _binary_creator_('__ge__', 'final_state_greater_equal', + False, None, True)) + if framework._in_eager_mode_ else ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None)), ('__array_ufunc__', None) ] diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index 6e1ed6b0a1dec..44a49148ca044 100644 --- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -72,6 +72,17 @@ "axis2": "axis2", "out": "Out", }, + "roi_align": { + "final_op_name": "final_state_roi_align", + "x": "X", + "boxes": "ROIs", + "boxes_num": "RoisNum", + "pooled_height": "pooled_height", + "pooled_width": "pooled_width", + "spatial_scale": "spatial_scale", + "sampling_ratio": "sampling_ratio", + "aligned": "aligned", + }, # "one_hot": { # "final_op_name": "final_state_one_hot", # "x": "X", diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 03045579e7198..db6af87635ccb 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -271,7 +271,8 @@ def backward(self, grad_tensor=None, retain_graph=False): if _grad_scalar: # When using amp with Fleet DistributedStrategy, we do loss scaling implicitly. self = _grad_scalar.scale(self) - if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu(): + if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_npu( + ) or paddle.is_compiled_with_mlu(): # TODO(liuyuhui): Currently only for xpu. Will be removed in the future. scaled_loss = scale_loss(self) if framework._in_eager_mode_: diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index a329610eeae83..314a502a3cbef 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -166,6 +166,40 @@ def _in_eager_without_dygraph_check(): return _in_eager_mode_ +# FIXME(dev): We haven't fully verified eager mode on XPU/NPU et.al but +# only GPU/CPU. Remove this after we improve this feature. +_is_first_import_ = True + + +def _fallback_legacy_dygraph(): + global _in_eager_mode_ + global _is_first_import_ + need_fallback = False + # Only enable eager on CPU/GPU + is_not_support = core.is_compiled_with_xpu() or core.is_compiled_with_npu( + ) or core.is_compiled_with_ipu() or core.is_compiled_with_mlu( + ) or core.is_compiled_with_rocm() + + if _in_eager_mode_ and is_not_support: + # switch into legacy dygraph mode + warnings.warn( + "We will fallback into legacy dygraph on NPU/XPU/MLU/IPU/ROCM devices. Because we only support new eager dygraph mode on CPU/GPU currently. 
" + ) + _in_eager_mode_ = False + if not _is_first_import_: + _enable_legacy_dygraph() + need_fallback = True + + need_fallback = False + _is_first_import_ = False + + return need_fallback + + +# switch into legacy mode if need while import paddle +_fallback_legacy_dygraph() + + def in_dygraph_mode(): """ @@ -206,11 +240,16 @@ def _non_static_mode(): @signature_safe_contextmanager def _test_eager_guard(place=None): - _disable_legacy_dygraph() + # FIXME(dev): We haven't fully verified eager mode on XPU/NPU et.al but + # only GPU/CPU. Remove this after we improve this feature. + already_fallback = _fallback_legacy_dygraph() + if not already_fallback: + _disable_legacy_dygraph() try: yield finally: - _enable_legacy_dygraph() + if not already_fallback: + _enable_legacy_dygraph() global_ipu_index = None @@ -6046,8 +6085,8 @@ def list_vars(self): for var in prog.list_vars(): print(var) - # var img : paddle.VarType.LOD_TENSOR.shape(-1, 1, 28, 28).astype(VarType.FP32) - # var label : paddle.VarType.LOD_TENSOR.shape(-1, 1).astype(VarType.INT64) + # var img : LOD_TENSOR.shape(-1, 1, 28, 28).dtype(float32).stop_gradient(True) + # var label : LOD_TENSOR.shape(-1, 1).dtype(int64).stop_gradient(True) """ for each_block in self.blocks: for each_var in list(each_block.vars.values()): @@ -6080,8 +6119,8 @@ def all_parameters(self): # Here will print all parameters in current program, in this example, # the result is like: # - # persist trainable param fc_0.w_0 : paddle.VarType.LOD_TENSOR.shape(13, 10).astype(VarType.FP32) - # persist trainable param fc_0.b_0 : paddle.VarType.LOD_TENSOR.shape(10,).astype(VarType.FP32) + # persist trainable param fc_0.w_0 : LOD_TENSOR.shape(13, 10).dtype(float32).stop_gradient(False) + # persist trainable param fc_0.b_0 : LOD_TENSOR.shape(10,).dtype(float32).stop_gradient(False) # # Here print(param) will print out all the properties of a parameter, # including name, type and persistable, you can access to specific diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 184453a6fcb2b..d143a6637f821 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -1796,13 +1796,16 @@ def greater_than(x, y, cond=None, name=None): attrs = dict() - helper.append_op( - type='greater_than', - inputs={'X': [x], - 'Y': [y]}, - outputs={'Out': [cond]}, - attrs=attrs) - return cond + if in_dygraph_mode(): + return _C_ops.final_state_greater_than(x, y, -1) + else: + helper.append_op( + type='greater_than', + inputs={'X': [x], + 'Y': [y]}, + outputs={'Out': [cond]}, + attrs=attrs) + return cond @templatedoc() diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 47f40a2e6a5af..1fdf59948345b 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -196,6 +196,17 @@ 'unbind', ] +OP_NAMEMAPPING = { + 'elementwise_max': 'final_state_maximum', + 'elementwise_min': 'final_state_minimum', + 'elementwise_pow': 'final_state_elementwise_pow', + 'elementwise_floordiv': 'final_state_floor_divide', + 'elementwise_add': 'final_state_add', + 'elementwise_sub': 'final_state_subtract', + 'elementwise_mul': 'final_state_multiply', + 'elementwise_div': 'final_state_divide', +} + @dygraph_only def _elementwise_op_in_dygraph(x, @@ -204,8 +215,21 @@ def _elementwise_op_in_dygraph(x, act=None, use_mkldnn=False, op_name=None): - op = getattr(_C_ops, op_name) - out = op(x, y, 'axis', axis, 'use_mkldnn', use_mkldnn) + def is_inplace(op_name): + return 
op_name[-1] == "_" + + if op_name not in OP_NAMEMAPPING.keys() or axis != -1: + op = getattr(_C_ops, op_name) + out = op(x, y, 'axis', axis, 'use_mkldnn', use_mkldnn) + else: + if in_dygraph_mode(): + op = getattr(_C_ops, OP_NAMEMAPPING[op_name] + if not is_inplace(op_name) else op_name) + out = op(x, y) + + if _in_legacy_dygraph(): + op = getattr(_C_ops, op_name) + out = op(x, y, 'axis', axis, 'use_mkldnn', use_mkldnn) return dygraph_utils._append_activation_in_dygraph( out, act, use_mkldnn=use_mkldnn) @@ -5093,9 +5117,12 @@ def split(input, num_or_sections, dim=-1, name=None): raise TypeError( "The type of 'num_or_sections' in split must be int, list or tuple in imperative mode, but " "received %s." % (type(num_or_sections))) - out = [_varbase_creator() for n in range(num)] - _C_ops.split(input, out, *attrs) - return out + if in_dygraph_mode(): + return _C_ops.final_state_split(input, [num], dim) + elif _in_legacy_dygraph(): + out = [_varbase_creator() for n in range(num)] + _C_ops.split(input, out, *attrs) + return out check_variable_and_dtype( input, 'input', @@ -7284,7 +7311,12 @@ def roi_align(input, sampling_ratio=-1, rois_num=rois_num) """ - if _non_static_mode(): + if in_dygraph_mode(): + assert rois_num is not None, "rois_num should not be None in dygraph mode." + return _C_ops.final_state_roi_align( + input, rois, rois_num, pooled_height, pooled_width, spatial_scale, + sampling_ratio, False) + if _in_legacy_dygraph(): assert rois_num is not None, "rois_num should not be None in dygraph mode." align_out = _C_ops.roi_align( input, rois, rois_num, "pooled_height", pooled_height, diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 95db9d39c1ec4..bb14fb9a86f15 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -2848,7 +2848,11 @@ def _finish_update(self, block, parameters_and_grads): beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, param) if framework._non_static_mode(): - tmp = _C_ops.scale(beta1_pow_acc, "scale", self._beta1) + if framework.in_dygraph_mode(): + tmp = _C_ops.final_state_scale(beta1_pow_acc, + self._beta1, 0.0, True) + else: + tmp = _C_ops.scale(beta1_pow_acc, "scale", self._beta1) beta1_pow_acc.copy_(tmp, False) else: block.append_op( diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index d58ef6ddd52ca..ed28a2813e225 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -134,7 +134,11 @@ def __call__(self, param, grad, block): assert isinstance(block, framework.Block) if framework._non_static_mode(): - return _C_ops.scale(param, "scale", self._regularization_coeff) + if framework.in_dygraph_mode(): + return _C_ops.final_state_scale( + param, self._regularization_coeff, 0.0, True) + else: + return _C_ops.scale(param, "scale", self._regularization_coeff) else: decay = block.create_var( dtype=param.dtype, shape=param.shape, lod_level=param.lod_level) diff --git a/python/paddle/fluid/tests/custom_op/context_pool_test_op.cc b/python/paddle/fluid/tests/custom_op/context_pool_test_op.cc index 6b0edcc7ab148..9286ae7ca0091 100644 --- a/python/paddle/fluid/tests/custom_op/context_pool_test_op.cc +++ b/python/paddle/fluid/tests/custom_op/context_pool_test_op.cc @@ -22,8 +22,7 @@ std::vector ContextPoolTest(const paddle::Tensor& x) { // 1. 
test cpu context - paddle::experimental::Place cpu_place( - paddle::experimental::AllocationType::CPU); + paddle::Place cpu_place(paddle::experimental::AllocationType::CPU); auto* cpu_ctx = paddle::experimental::DeviceContextPool::Instance() .Get(cpu_place); @@ -34,8 +33,7 @@ std::vector ContextPoolTest(const paddle::Tensor& x) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // 2. test gpu context - paddle::experimental::Place gpu_place( - paddle::experimental::AllocationType::GPU); + paddle::Place gpu_place(paddle::experimental::AllocationType::GPU); auto* gpu_ctx = paddle::experimental::DeviceContextPool::Instance() .Get(gpu_place); diff --git a/python/paddle/fluid/tests/custom_op/custom_concat_op.cc b/python/paddle/fluid/tests/custom_op/custom_concat_op.cc index 66cc36c300e9d..80f76e2df54fe 100644 --- a/python/paddle/fluid/tests/custom_op/custom_concat_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_concat_op.cc @@ -75,7 +75,7 @@ std::vector ConcatForwardDynamicAxis( auto out_shape = ComputeOutShape(in_shapes, axis); // create output - auto out = paddle::Tensor(paddle::PlaceType::kCPU, out_shape); + auto out = paddle::empty(out_shape, inputs[0].type(), paddle::CPUPlace()); // calc PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES( @@ -106,7 +106,7 @@ std::vector ConcatBackwardDynamicAxis( // create outputs std::vector grad_inputs; for (auto& t : inputs) { - auto grad = paddle::Tensor(paddle::PlaceType::kCPU, t.shape()); + auto grad = paddle::empty(t.shape(), t.dtype(), t.place()); grad_inputs.emplace_back(grad); } @@ -161,7 +161,7 @@ std::vector ConcatForwardStaticAxis( auto out_shape = ComputeOutShape(in_shapes, final_axis); // create output - auto out = paddle::Tensor(paddle::PlaceType::kCPU, out_shape); + auto out = paddle::empty(out_shape, inputs[0].type(), paddle::CPUPlace()); // calc PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES( @@ -190,7 +190,7 @@ std::vector ConcatBackwardStaticAxis( // create outputs std::vector grad_inputs; for (auto& t : inputs) { - auto grad = paddle::Tensor(paddle::PlaceType::kCPU, t.shape()); + auto grad = paddle::empty(t.shape(), t.dtype(), t.place()); grad_inputs.emplace_back(grad); } diff --git a/python/paddle/fluid/tests/custom_op/custom_conj_op.cc b/python/paddle/fluid/tests/custom_op/custom_conj_op.cc index b9c10f479e0a3..56938552420e7 100644 --- a/python/paddle/fluid/tests/custom_op/custom_conj_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_conj_op.cc @@ -71,7 +71,7 @@ void ConjCPUKernel(const data_t* x_data, int64_t numel, data_t* out_data) { std::vector ConjFunction(const paddle::Tensor& x) { CHECK_INPUT(x); - paddle::Tensor out(x.place(), x.shape()); + paddle::Tensor out = paddle::empty(x.shape(), x.dtype(), x.place()); PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES( x.type(), "ConjCPUKernel", ([&] { diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc index 121a855a18f84..04399a9826cfe 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc @@ -54,7 +54,7 @@ void relu_cpu_double_backward_kernel(const data_t* out_data, } std::vector relu_cpu_forward(const paddle::Tensor& x) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty(x.shape(), x.dtype(), x.place()); PD_DISPATCH_FLOATING_TYPES( x.type(), "relu_cpu_forward", ([&] { @@ -68,7 +68,7 @@ std::vector relu_cpu_forward(const paddle::Tensor& x) { std::vector relu_cpu_backward(const paddle::Tensor& x, const 
paddle::Tensor& out, const paddle::Tensor& grad_out) { - auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto grad_x = paddle::empty(x.shape(), x.dtype(), x.place()); PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] { relu_cpu_backward_kernel( @@ -85,7 +85,7 @@ std::vector relu_cpu_double_backward( const paddle::Tensor& out, const paddle::Tensor& ddx) { CHECK_CPU_INPUT(out); CHECK_CPU_INPUT(ddx); - auto ddout = paddle::Tensor(paddle::PlaceType::kCPU, out.shape()); + auto ddout = paddle::empty(out.shape(), out.dtype(), out.place()); PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_double_backward", ([&] { relu_cpu_double_backward_kernel( @@ -165,7 +165,7 @@ PD_BUILD_DOUBLE_GRAD_OP(custom_relu) std::vector relu_cpu_backward_without_x( const paddle::Tensor& out, const paddle::Tensor& grad_out) { - auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, out.shape()); + auto grad_x = paddle::empty(out.shape(), out.dtype(), out.place()); PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] { relu_cpu_backward_kernel( diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu index 364a2216b9e8e..18f1a2b95c2ee 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu @@ -54,7 +54,7 @@ __global__ void relu_cuda_double_backward_kernel(const data_t* out_data, std::vector relu_cuda_forward(const paddle::Tensor& x) { CHECK_GPU_INPUT(x); - auto out = paddle::Tensor(paddle::PlaceType::kGPU, x.shape()); + auto out = paddle::empty(x.shape(), x.dtype(), x.place()); int numel = x.size(); int block = 512; @@ -74,7 +74,7 @@ std::vector relu_cuda_backward(const paddle::Tensor& x, CHECK_GPU_INPUT(x); CHECK_GPU_INPUT(out); CHECK_GPU_INPUT(grad_out); - auto grad_x = paddle::Tensor(paddle::PlaceType::kGPU, x.shape()); + auto grad_x = paddle::empty(x.shape(), x.dtype(), x.place()); int numel = out.size(); int block = 512; @@ -95,7 +95,7 @@ std::vector relu_cuda_double_backward( const paddle::Tensor& out, const paddle::Tensor& ddx) { CHECK_GPU_INPUT(out); CHECK_GPU_INPUT(ddx); - auto ddout = paddle::Tensor(paddle::PlaceType::kGPU, out.shape()); + auto ddout = paddle::empty(out.shape(), out.dtype(), out.place()); int64_t numel = out.size(); int64_t block = 512; @@ -117,7 +117,7 @@ std::vector relu_cuda_double_backward( std::vector relu_cuda_backward_without_x( const paddle::Tensor& out, const paddle::Tensor& grad_out) { - auto grad_x = paddle::Tensor(paddle::PlaceType::kGPU, out.shape()); + auto grad_x = paddle::empty(out.shape(), out.dtype(), out.place()); int numel = out.size(); int block = 512; diff --git a/python/paddle/fluid/tests/custom_op/custom_tanh_op.cc b/python/paddle/fluid/tests/custom_op/custom_tanh_op.cc index f96297d69bd5b..399eb5b6366d7 100644 --- a/python/paddle/fluid/tests/custom_op/custom_tanh_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_tanh_op.cc @@ -68,7 +68,7 @@ void tanh_cpu_double_backward_kernel(const data_t* out_data, std::vector TanhForward(const paddle::Tensor& x) { CHECK_CPU_INPUT(x); - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty(x.shape(), x.dtype(), x.place()); PD_DISPATCH_FLOATING_TYPES( x.dtype(), "tanh_cpu_forward", ([&] { @@ -82,7 +82,7 @@ std::vector TanhForward(const paddle::Tensor& x) { std::vector TanhBackward(const paddle::Tensor& out, const paddle::Tensor& grad_out) { CHECK_CPU_INPUT(out); - auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, 
out.shape()); + auto grad_x = paddle::empty(out.shape(), out.dtype(), out.place()); PD_DISPATCH_FLOATING_TYPES(out.dtype(), "tanh_cpu_backward", ([&] { tanh_cpu_backward_kernel( @@ -101,8 +101,8 @@ std::vector TanhDoubleBackward(const paddle::Tensor& out, CHECK_CPU_INPUT(out); CHECK_CPU_INPUT(ddx); CHECK_CPU_INPUT(dout); - auto dout_new = paddle::Tensor(paddle::PlaceType::kCPU, out.shape()); - auto ddout = paddle::Tensor(paddle::PlaceType::kCPU, out.shape()); + auto dout_new = paddle::empty(out.shape(), out.dtype(), out.place()); + auto ddout = paddle::empty(out.shape(), out.dtype(), out.place()); PD_DISPATCH_FLOATING_TYPES(out.dtype(), "tanh_cpu_double_backward", ([&] { tanh_cpu_double_backward_kernel( diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py index 582b14c82b52d..55c9571d44f11 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py @@ -20,6 +20,7 @@ import paddle.static as static import subprocess import numpy as np +from paddle.vision.transforms import Compose, Normalize from paddle.utils.cpp_extension.extension_utils import run_cmd from paddle.fluid.framework import _test_eager_guard @@ -329,6 +330,33 @@ def test_func_double_grad_dynamic(self): "custom op dx grad: {},\n paddle api dx grad: {}".format( dx_grad, pd_dx_grad)) + def test_with_dataloader(self): + for device in self.devices: + paddle.set_device(device) + # data loader + transform = Compose( + [Normalize( + mean=[127.5], std=[127.5], data_format='CHW')]) + train_dataset = paddle.vision.datasets.MNIST( + mode='train', transform=transform) + train_loader = paddle.io.DataLoader( + train_dataset, + batch_size=64, + shuffle=True, + drop_last=True, + num_workers=0) + + for batch_id, (image, _) in enumerate(train_loader()): + out = self.custom_ops[0](image) + pd_out = paddle.nn.functional.relu(image) + self.assertTrue( + np.array_equal(out, pd_out), + "custom op out: {},\n paddle api out: {}".format(out, + pd_out)) + + if batch_id == 5: + break + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index f7f88ab76f227..5235b7f1e88ab 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1137,7 +1137,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 350) set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 300) set_tests_properties(test_parallel_dygraph_no_sync_gradient_check PROPERTIES TIMEOUT 30) - set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 200) + set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 500) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 87031fe09e5a8..97a3092f11fd2 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -18,5 +18,8 @@ if(WITH_DISTRIBUTE 
AND WITH_GPU) py_test_modules(test_recorder MODULES test_recorder ENVS ${dist_ENVS}) py_test_modules(test_trial MODULES test_trial ENVS ${dist_ENVS}) py_test_modules(test_new_cost_model MODULES test_new_cost_model ENVS ${dist_ENVS}) + py_test_modules(test_dist_reshape MODULES test_dist_reshape ENVS ${dist_ENVS}) + py_test_modules(test_dist_pnorm MODULES test_dist_pnorm ENVS ${dist_ENVS}) + py_test_modules(test_dist_slice MODULES test_dist_slice ENVS ${dist_ENVS}) py_test_modules(test_cluster MODULES test_cluster ENVS ${dist_ENVS}) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_pnorm.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_pnorm.py new file mode 100644 index 0000000000000..946f33b7e4f31 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_pnorm.py @@ -0,0 +1,127 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import paddle.distributed.auto_parallel as auto + +from paddle.fluid import program_guard +from paddle.fluid.backward import append_backward +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr + +paddle.enable_static() + + +def make_program_dp2(): + main_program = paddle.fluid.Program() + start_program = paddle.fluid.Program() + with paddle.static.program_guard(main_program, start_program): + x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32') + x.stop_gradient = False + auto.shard_tensor( + x, + dist_attr={ + "process_mesh": auto.ProcessMesh([0, 1]), + "dims_mapping": [0, -1, -1] + }) + tmp_0 = paddle.norm(x, p=2) + return main_program, start_program, tmp_0 + + +def make_program_serial(): + main_program = paddle.fluid.Program() + start_program = paddle.fluid.Program() + with paddle.static.program_guard(main_program, start_program): + x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32') + x.stop_gradient = False + auto.shard_tensor( + x, + dist_attr={ + "process_mesh": auto.ProcessMesh([0]), + "dims_mapping": [-1, -1, -1] + }) + tmp_0 = paddle.norm(x, p=2) + return main_program, start_program, tmp_0 + + +def parallelizer(program_func, rank): + from paddle.distributed.auto_parallel.completion import Completer + from paddle.distributed.auto_parallel.partitioner import Partitioner + from paddle.distributed.auto_parallel.dist_context import DistributedContext + + main_program, start_program, loss = program_func() + + dist_context = DistributedContext() + completer = Completer(dist_context) + completer.complete_forward_annotation(main_program) + dist_context.block_state.parse_forward_blocks(main_program) + + with program_guard(main_program, start_program): + params_grads = append_backward( + loss, distop_context=dist_context.dist_op_context) + completer.complete_backward_annotation(main_program) + + dist_context.block_state.parse_backward_blocks(main_program) + partitioner = Partitioner(dist_context, rank) + dist_main_prog, _, _ = 
partitioner.partition(main_program, start_program, + []) + + return dist_main_prog, dist_context + + +class TestDistPNorm(unittest.TestCase): + def test_dist_pnorm_dp2(self): + + for rank in range(2): + dist_main_prog, dist_context = parallelizer(make_program_dp2, rank) + ops = dist_main_prog.global_block().ops + op_types = [] + for op in ops: + op_types.append(op.type) + op_dist_attr = dist_context.get_op_dist_attr_for_program(op) + if op.type == "p_norm": + assert op_dist_attr.impl_type == "p_norm" + if op.type in ["p_norm", "p_norm_grad"]: + for input_attr in op_dist_attr.inputs_dist_attrs.values(): + assert set(input_attr.dims_mapping) == set([-1]) + for output_attr in op_dist_attr.outputs_dist_attrs.values(): + assert set(output_attr.dims_mapping) == set([-1]) + if op.type == 'c_allgather': + for input_attr in op_dist_attr.inputs_dist_attrs.values(): + assert input_attr.dims_mapping[0] == 0 + assert set(input_attr.dims_mapping[1:]) == set([-1]) + for output_attr in op_dist_attr.outputs_dist_attrs.values(): + assert set(output_attr.dims_mapping) == set([-1]) + if op.type == 'slice': + for input_attr in op_dist_attr.inputs_dist_attrs.values(): + assert set(input_attr.dims_mapping) == set([-1]) + for output_attr in op_dist_attr.outputs_dist_attrs.values(): + assert output_attr.dims_mapping[0] == 0 + assert set(output_attr.dims_mapping[1:]) == set([-1]) + + assert op_types == [ + "fill_constant", "barrier", "c_allgather", "p_norm", + "fill_constant", "p_norm_grad", "slice" + ] + + def test_dist_pnorm_serial(self): + dist_main_prog, dist_context = parallelizer(make_program_serial, 0) + ops = dist_main_prog.global_block().ops + for op in ops: + op_dist_attr = dist_context.get_op_dist_attr_for_program(op) + assert op_dist_attr.impl_type == "default" + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_reshape.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_reshape.py new file mode 100644 index 0000000000000..f170dbc9095f2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_reshape.py @@ -0,0 +1,80 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
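In the auto-parallel tests above, `dims_mapping` ties tensor axes to process-mesh axes: an entry of `0` shards that axis across mesh axis 0, while `-1` keeps it replicated. A standalone sketch of just the annotation step, mirroring the test setup (a two-process mesh is assumed):

import paddle
import paddle.distributed.auto_parallel as auto

paddle.enable_static()
main_prog = paddle.static.Program()
start_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, start_prog):
    x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32')
    # shard axis 0 of x across the two ranks of the mesh, replicate the rest
    auto.shard_tensor(
        x,
        dist_attr={
            "process_mesh": auto.ProcessMesh([0, 1]),
            "dims_mapping": [0, -1, -1]
        })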
+ +import unittest +import paddle +import paddle.distributed.auto_parallel as auto + +from paddle.fluid import program_guard +from paddle.fluid.backward import append_backward +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr + +paddle.enable_static() + + +def make_program_dp2(): + main_program = paddle.fluid.Program() + start_program = paddle.fluid.Program() + with paddle.static.program_guard(main_program, start_program): + x = paddle.static.data(name='x', shape=[4, 4, 8], dtype='float32') + x.stop_gradient = False + auto.shard_tensor( + x, + dist_attr={ + "process_mesh": auto.ProcessMesh([0, 1]), + "dims_mapping": [0, -1, -1] + }) + tmp_0 = paddle.reshape(x, shape=[0, 0, 4, 2]) + tmp_1 = paddle.reshape(tmp_0, shape=[0, 0, 8]) + tmp_2 = tmp_1.reshape((tmp_1.shape[0], tmp_1.shape[1], -1)) + return main_program, start_program + + +def parallelizer(program_func, rank): + from paddle.distributed.auto_parallel.completion import Completer + from paddle.distributed.auto_parallel.partitioner import Partitioner + from paddle.distributed.auto_parallel.dist_context import DistributedContext + + main_program, start_program = program_func() + + dist_context = DistributedContext() + completer = Completer(dist_context) + completer.complete_forward_annotation(main_program) + dist_context.block_state.parse_forward_blocks(main_program) + + partitioner = Partitioner(dist_context, rank) + dist_main_prog, _, _ = partitioner.partition(main_program, start_program, + []) + + return dist_main_prog, dist_context + + +class TestDistReshape(unittest.TestCase): + def test_dist_reshape_mp2(self): + + for rank in range(2): + dist_main_prog, dist_context = parallelizer(make_program_dp2, rank) + ops = dist_main_prog.global_block().ops + print_program_with_dist_attr(dist_main_prog, dist_context) + for idx, op in enumerate(ops): + op_dist_attr = dist_context.get_op_dist_attr_for_program(op) + assert op_dist_attr.impl_type == "reshape2" + assert op_dist_attr.impl_idx == idx + + if op_dist_attr.impl_idx == 2: + assert op.desc.attr('shape')[0] == 2 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py new file mode 100644 index 0000000000000..6cf4621dbb0ce --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py @@ -0,0 +1,110 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
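`test_dist_reshape` above leans on `paddle.reshape`'s shape conventions: a `0` entry keeps the corresponding input dimension and `-1` lets one dimension be inferred. A quick dynamic-graph check of the shapes the test expects:

import paddle

paddle.disable_static()
x = paddle.rand([4, 4, 8])

# 0 keeps the matching input dim, so [0, 0, 4, 2] maps [4, 4, 8] -> [4, 4, 4, 2]
y = paddle.reshape(x, shape=[0, 0, 4, 2])
print(y.shape)   # [4, 4, 4, 2]

# flattening the trailing split back: [0, 0, 8] -> [4, 4, 8]
z = paddle.reshape(y, shape=[0, 0, 8])
print(z.shape)   # [4, 4, 8]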
+ +import unittest +import paddle +import paddle.distributed.auto_parallel as auto + +paddle.enable_static() + + +def make_program_dp2(): + main_program = paddle.fluid.Program() + start_program = paddle.fluid.Program() + with paddle.static.program_guard(main_program, start_program): + x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32') + auto.shard_tensor( + x, + dist_attr={ + "process_mesh": auto.ProcessMesh([0, 1]), + "dims_mapping": [0, -1, -1] + }) + tmp_0 = x[0] + tmp_1 = x[:, 0, :] + tmp_2 = x[:, :, 1] + tmp_3 = x[:2, :2, :2] + return main_program, start_program + + +def make_program_serial(): + main_program = paddle.fluid.Program() + start_program = paddle.fluid.Program() + with paddle.static.program_guard(main_program, start_program): + x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32') + auto.shard_tensor( + x, + dist_attr={ + "process_mesh": auto.ProcessMesh([0]), + "dims_mapping": [-1, -1, -1] + }) + tmp_0 = x[0] + tmp_1 = x[:, 0, :] + tmp_2 = x[:, :, 1] + tmp_3 = x[2, 2, :] + tmp_4 = x[:2, :2, :2] + tmp_5 = x[0, 0, 0] + return main_program, start_program + + +def parallelizer(program_func, rank): + from paddle.distributed.auto_parallel.completion import Completer + from paddle.distributed.auto_parallel.partitioner import Partitioner + from paddle.distributed.auto_parallel.dist_context import DistributedContext + + main_program, start_program = program_func() + + dist_context = DistributedContext() + completer = Completer(dist_context) + completer.complete_forward_annotation(main_program) + + dist_context.block_state.parse_forward_blocks(main_program) + partitioner = Partitioner(dist_context, rank) + dist_main_prog, _, _ = partitioner.partition(main_program, start_program, + []) + + return dist_main_prog, dist_context + + +class TestDistSlice(unittest.TestCase): + def test_dist_slice_dp2(self): + + for rank in range(2): + dist_main_prog, dist_context = parallelizer(make_program_dp2, rank) + ops = dist_main_prog.global_block().ops + for op in ops: + axes = op.desc.attr('axes') + op_dist_attr = dist_context.get_op_dist_attr_for_program(op) + if axes[0] == 0: + assert op_dist_attr.impl_type == "default" + else: + assert op_dist_attr.impl_type == "slice" + for out in op.output_arg_names: + var_dims_mapping = op_dist_attr.get_output_dims_mapping( + out) + assert var_dims_mapping[0] == 0 + + def test_dist_slice_serial(self): + dist_main_prog, dist_context = parallelizer(make_program_serial, 0) + ops = dist_main_prog.global_block().ops + for op in ops: + op_dist_attr = dist_context.get_op_dist_attr_for_program(op) + assert op_dist_attr.impl_type == "slice" + for out in op.output_arg_names: + var_dims_mapping = op_dist_attr.get_output_dims_mapping(out) + ref_dims_mapping = [-1 for i in range(len(var_dims_mapping))] + assert ref_dims_mapping == ref_dims_mapping + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_amp.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_amp.py index 84d11670027fe..608bdd7a35d3f 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_amp.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_amp.py @@ -112,10 +112,11 @@ def test_pp_model(self): with paddle.amp.auto_cast(): loss_a = model_a(img, label) - scaler_a.scale(loss_a).backward() - scaler_a.minimize(optimizer_a, loss_a) - optimizer_a.clear_grad() - scheduler_a.step() + + scaler_a.scale(loss_a).backward() + scaler_a.minimize(optimizer_a, loss_a) + optimizer_a.clear_grad() + 
scheduler_a.step() with paddle.amp.auto_cast(): loss_b = model_b.train_batch( diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py index 9042cdba97675..4893960345ea7 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_fp16.py @@ -124,12 +124,12 @@ def test_pp_model(self): with paddle.amp.auto_cast(enable=True, level='O2'): loss_a = model_a(img, label) - scaler_a.scale(loss_a).backward() - with paddle.amp.auto_cast(enable=False): - scaler_a.minimize(optimizer_a, loss_a) - optimizer_a.clear_grad() - scheduler_a.step() + scaler_a.scale(loss_a).backward() + scaler_a.minimize(optimizer_a, loss_a) + optimizer_a.clear_grad() + scheduler_a.step() + with paddle.amp.auto_cast(enable=True, level='O2'): loss_b = model_b.train_batch( [img, label], optimizer_b, scheduler_b, scaler=scaler_b) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shape_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shape_op.py index 5b23669b98daa..8f5d7823cdf0f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shape_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shape_op.py @@ -53,7 +53,7 @@ def sample_predictor_configs(self, program_config): @given( in_shape=st.lists( st.integers( - min_value=1, max_value=3), min_size=1, max_size=9), + min_value=1, max_value=3), min_size=1, max_size=6), in_dtype=st.sampled_from([np.float32, np.uint16, np.int8, np.uint8])) def test(self, *args, **kwargs): self.run_test(quant=False, *args, **kwargs) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_cumsum_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_cumsum_op_mlu.py new file mode 100644 index 0000000000000..5b7ce30728cbc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_cumsum_op_mlu.py @@ -0,0 +1,183 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
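The two hybrid-parallel AMP fixes above move loss scaling and the optimizer update out of the autocast region, so only the forward pass runs under `paddle.amp.auto_cast()`. A minimal single-card sketch of that training step (the tiny model and random data are placeholders):

import paddle

model = paddle.nn.Linear(8, 2)
optimizer = paddle.optimizer.SGD(learning_rate=0.1,
                                 parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

x = paddle.rand([4, 8])

with paddle.amp.auto_cast():          # forward pass only
    loss = model(x).mean()

scaler.scale(loss).backward()         # scale + backward outside auto_cast
scaler.minimize(optimizer, loss)      # unscale, check inf/nan, then step
optimizer.clear_grad()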
+ +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard + +paddle.enable_static() + + +class TestMLUCumSumOp(OpTest): + def setUp(self): + self.op_type = "cumsum" + self.set_mlu() + self.init_dtype() + self.init_testcase() + + def test_check_output(self): + self.check_output_with_place(self.place) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def init_testcase(self): + self.attrs = {'axis': 2} + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.outputs = {'Out': self.inputs['X'].cumsum(axis=2)} + + +class TestMLUCumSumOp2(TestMLUCumSumOp): + def init_testcase(self): + self.attrs = {'axis': -1, 'reverse': True} + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.outputs = { + 'Out': np.flip( + np.flip( + self.inputs['X'], axis=2).cumsum(axis=2), axis=2) + } + + +class TestMLUCumSumOp3(TestMLUCumSumOp): + def init_testcase(self): + self.attrs = {'axis': 1} + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.outputs = {'Out': self.inputs['X'].cumsum(axis=1)} + + +class TestMLUCumSumOp4(TestMLUCumSumOp): + def init_testcase(self): + self.attrs = {'axis': 0} + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.outputs = {'Out': self.inputs['X'].cumsum(axis=0)} + + +class TestMLUCumSumOp5(TestMLUCumSumOp): + def init_testcase(self): + self.inputs = {'X': np.random.random((5, 20)).astype(self.dtype)} + self.outputs = {'Out': self.inputs['X'].cumsum(axis=1)} + + +class TestMLUCumSumOp7(TestMLUCumSumOp): + def init_testcase(self): + self.inputs = {'X': np.random.random((100)).astype(self.dtype)} + self.outputs = {'Out': self.inputs['X'].cumsum(axis=0)} + + +class TestNPUCumSumExclusive1(TestMLUCumSumOp): + def init_testcase(self): + self.attrs = {'axis': 2, "exclusive": True} + a = np.random.random((4, 5, 65)).astype(self.dtype) + self.inputs = {'X': a} + self.outputs = { + 'Out': np.concatenate( + (np.zeros( + (4, 5, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)), + axis=2) + } + + +class TestNPUCumSumExclusive2(TestMLUCumSumOp): + def init_testcase(self): + self.attrs = {'axis': 2, "exclusive": True} + a = np.random.random((1, 1, 888)).astype(self.dtype) + self.inputs = {'X': a} + self.outputs = { + 'Out': np.concatenate( + (np.zeros( + (1, 1, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)), + axis=2) + } + + +class TestNPUCumSumExclusive3(TestMLUCumSumOp): + def init_testcase(self): + self.attrs = {'axis': 2, "exclusive": True} + a = np.random.random((4, 5, 888)).astype(self.dtype) + self.inputs = {'X': a} + self.outputs = { + 'Out': np.concatenate( + (np.zeros( + (4, 5, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)), + axis=2) + } + + +class TestNPUCumSumExclusive4(TestMLUCumSumOp): + def init_testcase(self): + self.attrs = {'axis': 2, "exclusive": True} + a = np.random.random((1, 1, 3049)).astype(self.dtype) + self.inputs = {'X': a} + self.outputs = { + 'Out': np.concatenate( + (np.zeros( + (1, 1, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)), + axis=2) + } + + +class TestNPUCumSumExclusive5(TestMLUCumSumOp): + def init_testcase(self): + self.attrs = {'axis': 2, "exclusive": True} + a = np.random.random((4, 5, 3096)).astype(self.dtype) + self.inputs = 
{'X': a} + self.outputs = { + 'Out': np.concatenate( + (np.zeros( + (4, 5, 1), dtype=self.dtype), a[:, :, :-1].cumsum(axis=2)), + axis=2) + } + + +class TestNPUCumSumReverseExclusive(TestMLUCumSumOp): + def init_testcase(self): + self.attrs = {'axis': 2, 'reverse': True, "exclusive": True} + a = np.random.random((4, 5, 6)).astype(self.dtype) + self.inputs = {'X': a} + a = np.flip(a, axis=2) + self.outputs = { + 'Out': np.concatenate( + (np.flip( + a[:, :, :-1].cumsum(axis=2), axis=2), np.zeros( + (4, 5, 1), dtype=self.dtype)), + axis=2) + } + + +class TestNPUCumSumWithFlatten1(TestMLUCumSumOp): + def init_testcase(self): + self.attrs = {'flatten': True} + self.inputs = {'X': np.random.random((5, 6)).astype(self.dtype)} + self.outputs = {'Out': self.inputs['X'].cumsum()} + + +class TestNPUCumSumWithFlatten2(TestMLUCumSumOp): + def init_testcase(self): + self.attrs = {'flatten': True} + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.outputs = {'Out': self.inputs['X'].cumsum()} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_sub_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_sub_op_mlu.py new file mode 100644 index 0000000000000..9ca5359e05ff7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_sub_op_mlu.py @@ -0,0 +1,208 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
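The `exclusive` cumsum cases above construct their references by prepending a zero slice to the cumulative sum of everything but the last element along the axis. A small NumPy check of that equivalence on a toy array:

import numpy as np

a = np.arange(6, dtype=np.float32).reshape(1, 2, 3)

# inclusive cumsum along the last axis: [x0, x0 + x1, x0 + x1 + x2]
inclusive = a.cumsum(axis=2)

# exclusive cumsum along the last axis: [0, x0, x0 + x1]
exclusive = np.concatenate(
    (np.zeros((1, 2, 1), dtype=a.dtype), a[:, :, :-1].cumsum(axis=2)), axis=2)

print(inclusive)   # [[[0. 1. 3.] [3. 7. 12.]]]
print(exclusive)   # [[[0. 0. 1.] [0. 3. 7.]]]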
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest, skip_check_grad_ci +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + +SEED = 2022 + + +class TestElementwiseSubOp(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.init_dtype() + self.init_input_output() + self.init_axis() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.attrs = {'axis': self.axis} + self.outputs = {'Out': self.out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + def init_dtype(self): + self.dtype = np.float32 + + def init_axis(self): + self.axis = 0 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad_with_place( + self.place, ['Y'], + 'Out', + max_relative_error=0.005, + no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad_with_place( + self.place, ['X'], + 'Out', + max_relative_error=0.005, + no_grad_set=set('Y')) + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestElementwiseSubOp_scalar(TestElementwiseSubOp): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(10, 3, 4).astype(np.float32), + 'Y': np.random.rand(1).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + + +class TestElementwiseSubOp_Vector(TestElementwiseSubOp): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.random((100, )).astype("float32"), + 'Y': np.random.random((100, )).astype("float32") + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + + +class TestElementwiseSubOp_broadcast_0(TestElementwiseSubOp): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(100, 3, 2).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + self.attrs = {'axis': 0} + self.outputs = { + 'Out': self.inputs['X'] - self.inputs['Y'].reshape(100, 1, 1) + } + + +class TestElementwiseSubOp_broadcast_1(TestElementwiseSubOp): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 100, 3).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 100, 1) + } + + +class TestElementwiseSubOp_broadcast_2(TestElementwiseSubOp): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 3, 100).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + self.outputs = { + 'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 1, 100) + } + + +class TestElementwiseSubOp_broadcast_3(TestElementwiseSubOp): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 10, 12, 3).astype(np.float32), + 'Y': np.random.rand(10, 
12).astype(np.float32) + } + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 10, 12, 1) + } + + +class TestElementwiseSubOp_broadcast_4(TestElementwiseSubOp): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 5, 3, 12).astype(np.float32), + 'Y': np.random.rand(2, 5, 1, 12).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + + +class TestElementwiseSubOp_commonuse_1(TestElementwiseSubOp): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(2, 3, 100).astype(np.float32), + 'Y': np.random.rand(1, 1, 100).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + + +class TestElementwiseSubOp_commonuse_2(TestElementwiseSubOp): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(10, 3, 1, 4).astype(np.float32), + 'Y': np.random.rand(10, 1, 12, 1).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + + +class TestElementwiseSubOp_xsize_lessthan_ysize(TestElementwiseSubOp): + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_sub" + self.inputs = { + 'X': np.random.rand(10, 12).astype(np.float32), + 'Y': np.random.rand(2, 3, 10, 12).astype(np.float32) + } + self.attrs = {'axis': 2} + self.outputs = { + 'Out': self.inputs['X'].reshape(1, 1, 10, 12) - self.inputs['Y'] + } + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_fill_any_like_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_fill_any_like_op_mlu.py new file mode 100644 index 0000000000000..065c8072d4ce8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_fill_any_like_op_mlu.py @@ -0,0 +1,104 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
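The broadcast cases in the elementwise_sub tests above rely on the legacy `axis` attribute: `axis=1` aligns a rank-1 `Y` with dimension 1 of `X`, which is equivalent to reshaping `Y` before ordinary broadcasting, exactly how the tests build their references. A NumPy sketch of that equivalence:

import numpy as np

x = np.random.rand(2, 100, 3).astype(np.float32)
y = np.random.rand(100).astype(np.float32)

# elementwise_sub(X, Y, axis=1) aligns y with x's dim 1,
# which matches broadcasting y reshaped to (1, 100, 1)
out = x - y.reshape(1, 100, 1)
print(out.shape)   # (2, 100, 3)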
+ +from __future__ import print_function + +import sys +sys.path.append("..") + +import paddle +import unittest +import numpy as np +from op_test import OpTest + +paddle.enable_static() + + +class TestFillAnyLikeOp(OpTest): + def setUp(self): + self.init_dtype() + self.set_mlu() + self.op_type = "fill_any_like" + self.set_value() + self.set_input() + self.attrs = {'value': self.value} + self.outputs = {'Out': self.value * np.ones_like(self.inputs["X"])} + + def init_dtype(self): + self.dtype = np.float32 + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + self.__class__.no_need_check_grad = True + + def set_input(self): + self.inputs = {'X': np.random.random((219, 232)).astype(self.dtype)} + + def set_value(self): + self.value = 0.0 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestFillAnyLikeOp2(TestFillAnyLikeOp): + def set_value(self): + self.value = -0.0 + + +class TestFillAnyLikeOp3(TestFillAnyLikeOp): + def set_value(self): + self.value = 1.0 + + +class TestFillAnyLikeOp4(TestFillAnyLikeOp): + def set_value(self): + self.value = 1e-9 + + +class TestFillAnyLikeOp5(TestFillAnyLikeOp): + def set_value(self): + if self.dtype == "float16": + self.value = 0.05 + else: + self.value = 5.0 + + +class TestFillAnyLikeOpInt32(TestFillAnyLikeOp): + def init_dtype(self): + self.dtype = np.int32 + + def set_value(self): + self.value = -1 + + +class TestFillAnyLikeOpInt64(TestFillAnyLikeOp): + def init_dtype(self): + self.dtype = np.int64 + + def set_value(self): + self.value = -1 + + +class TestFillAnyLikeOpFloat32(TestFillAnyLikeOp): + def init_dtype(self): + self.dtype = np.float32 + + def set_value(self): + self.value = 0.09 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_gather_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_gather_op_mlu.py new file mode 100644 index 0000000000000..f0aff986fa1ff --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_gather_op_mlu.py @@ -0,0 +1,179 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append('..') +from op_test import OpTest, convert_float_to_uint16 +import paddle +import paddle.fluid as fluid +from paddle.framework import core +from paddle.fluid.dygraph.base import switch_to_static_graph + +paddle.enable_static() + + +def gather_numpy(x, index, axis): + x_transpose = np.swapaxes(x, 0, axis) + tmp_gather = x_transpose[index, ...] 
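+    # Swap the gathered axis back to its original position, undoing the
+    # transpose applied above.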
+ gather = np.swapaxes(tmp_gather, 0, axis) + return gather + + +class TestGatherOp(OpTest): + def setUp(self): + self.op_type = "gather" + self.place = paddle.MLUPlace(0) + self.__class__.use_mlu = True + self.python_api = paddle.gather + self.config() + xnp = np.random.random(self.x_shape).astype(self.x_type) + self.inputs = { + 'X': xnp, + 'Index': np.array(self.index).astype(self.index_type) + } + self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (10, 20) + self.x_type = "float32" + self.index = [1, 3, 5] + self.index_type = "int32" + + +class TestCase1(TestGatherOp): + def config(self): + """ + For one dimension input + """ + self.x_shape = (100) + self.x_type = "float32" + self.index = [1, 3, 5] + self.index_type = "int32" + + +class TestCase2(TestGatherOp): + def config(self): + """ + For int64_t index type + """ + self.x_shape = (100) + self.x_type = "float32" + self.index = [1, 3, 5] + self.index_type = "int64" + + +class API_TestDygraphGather(unittest.TestCase): + def test_out1(self): + paddle.disable_static() + input_1 = np.array([[1, 2], [3, 4], [5, 6]]).astype('int32') + index_1 = np.array([1, 2]) + input = paddle.to_tensor(input_1) + index = paddle.to_tensor(index_1) + output = paddle.fluid.layers.gather(input, index) + output_np = output.numpy() + expected_output = np.array([[3, 4], [5, 6]]).astype('int32') + self.assertTrue(np.allclose(output_np, expected_output)) + paddle.enable_static() + + def test_out12(self): + paddle.disable_static() + input_1 = np.array([[1, 2], [3, 4], [5, 6]]).astype('int32') + index_1 = np.array([1, 2]) + x = paddle.to_tensor(input_1) + index = paddle.to_tensor(index_1) + output = paddle.gather(x, index, axis=0) + output_np = output.numpy() + expected_output = gather_numpy(input_1, index_1, axis=0) + self.assertTrue(np.allclose(output_np, expected_output)) + paddle.enable_static() + + def test_zero_index(self): + paddle.disable_static() + x = paddle.to_tensor([[1, 2], [3, 4]]).astype('int32') + index = paddle.to_tensor(np.array([]).astype('int64')) + for axis in range(len(x.shape)): + out = paddle.gather(x, index, axis) + expected_shape = list(x.shape) + expected_shape[axis] = 0 + self.assertEqual(list(out.shape), expected_shape) + paddle.enable_static() + + +class TestGathertError(unittest.TestCase): + def test_error1(self): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + + shape = [8, 9, 6] + x = paddle.fluid.data(shape=shape, dtype='int8', name='x') + axis = paddle.fluid.data(shape=[1], dtype='float32', name='axis') + index = paddle.fluid.data(shape=shape, dtype='int32', name='index') + index_float = paddle.fluid.data( + shape=shape, dtype='float32', name='index_float') + + def test_x_type(): + paddle.gather(x, index) + + self.assertRaises(TypeError, test_x_type) + + def test_index_type(): + paddle.gather(x, index_float) + + self.assertRaises(TypeError, test_index_type) + + def test_axis_dtype(): + paddle.gather(x, index, axis=1.11) + + self.assertRaises(TypeError, test_axis_dtype) + + def test_axis_dtype1(): + paddle.gather(x, index, axis=axis) + + self.assertRaises(TypeError, test_axis_dtype1) + + def test_error2(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + + shape = [8, 9, 6] + x = fluid.data(shape=shape, dtype='int8', 
name='x') + index = fluid.data(shape=shape, dtype='int32', name='mask') + index_float = fluid.data( + shape=shape, dtype='float32', name='index_float') + + def test_x_type(): + paddle.fluid.layers.gather(x, index) + + self.assertRaises(TypeError, test_x_type) + + def test_index_type(): + paddle.fluid.layers.gather(x, index_float) + + self.assertRaises(TypeError, test_index_type) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_reduce_sum_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_reduce_sum_op_mlu.py new file mode 100644 index 0000000000000..d2729d77abaa7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_reduce_sum_op_mlu.py @@ -0,0 +1,149 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle + +paddle.enable_static() + + +class TestMLUReduceSumOp(OpTest): + def setUp(self): + self.init_op_type() + self.initTestCase() + self.set_mlu() + self.attrs = { + 'dim': self.axis, + 'keep_dim': self.keep_dim, + 'reduce_all': self.reduce_all + } + self.inputs = {'X': np.random.random(self.shape).astype("float32")} + if self.attrs['reduce_all']: + self.outputs = {'Out': self.inputs['X'].sum()} + else: + self.outputs = { + 'Out': self.inputs['X'].sum(axis=self.axis, + keepdims=self.attrs['keep_dim']) + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def init_op_type(self): + self.op_type = "reduce_sum" + self.use_mkldnn = False + self.keep_dim = False + self.reduce_all = False + + def initTestCase(self): + self.shape = (5, 6, 10) + self.axis = (0, ) + + +class TestSumOp5D(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (1, 2, 5, 6, 10) + self.axis = (0, ) + + +class TestSumOp6D(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (1, 1, 2, 5, 6, 10) + self.axis = (0, ) + + +class TestSumOp8D(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (1, 3, 1, 2, 1, 4, 3, 10) + self.axis = (0, 3) + + +class Test1DReduce(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = 120 + self.axis = (0, ) + + +class Test2DReduce0(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (20, 10) + self.axis = (0, ) + + +class Test2DReduce1(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (20, 10) + self.axis = (1, ) + + +class Test3DReduce0(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (5, 6, 7) + self.axis = (1, ) + + +class Test3DReduce1(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (5, 6, 7) + self.axis = (2, ) + + +class Test3DReduce2(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (5, 6, 7) + self.axis 
= (-2, ) + + +class Test3DReduce3(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (5, 6, 7) + self.axis = (1, 2) + + +class TestKeepDimReduce(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (5, 6, 10) + self.axis = (1, ) + self.keep_dim = True + + +class TestKeepDim8DReduce(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (2, 5, 3, 2, 2, 3, 4, 2) + self.axis = (3, 4, 5) + self.keep_dim = True + + +class TestReduceAll(TestMLUReduceSumOp): + def initTestCase(self): + self.shape = (5, 6, 2, 10) + self.axis = (0, ) + self.reduce_all = True + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_unsqueeze2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_unsqueeze2_op_mlu.py new file mode 100644 index 0000000000000..0ed5eb7e8a9bc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_unsqueeze2_op_mlu.py @@ -0,0 +1,226 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import sys +sys.path.append("..") + +import numpy as np + +import paddle +from op_test import OpTest + +paddle.enable_static() + + +# Correct: General. +class TestUnsqueezeOp(OpTest): + def setUp(self): + self.init_test_case() + self.set_mlu() + self.op_type = "unsqueeze2" + self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")} + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.ori_shape).astype("float32") + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place, no_check_set=['XShape']) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def init_test_case(self): + self.ori_shape = (3, 40) + self.axes = (1, 2) + self.new_shape = (3, 1, 1, 40) + + def init_attrs(self): + self.attrs = {"axes": self.axes} + + +# Correct: Single input index. +class TestUnsqueezeOp1(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (-1, ) + self.new_shape = (20, 5, 1) + + +# Correct: Mixed input axis. +class TestUnsqueezeOp2(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (0, -1) + self.new_shape = (1, 20, 5, 1) + + +# Correct: There is duplicated axis. +class TestUnsqueezeOp3(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (0, 3, 3) + self.new_shape = (1, 10, 2, 1, 1, 5) + + +# Correct: Reversed axes. 
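+# Axes are applied one at a time, so the repeated entries in (3, 1, 1)
+# insert several size-1 dimensions: (10, 2, 5) becomes (10, 1, 1, 2, 5, 1).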
+class TestUnsqueezeOp4(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (3, 1, 1) + self.new_shape = (10, 1, 1, 2, 5, 1) + + +# axes is a list(with tensor) +class TestUnsqueezeOp_AxesTensorList(OpTest): + def setUp(self): + self.init_test_case() + self.set_mlu() + self.op_type = "unsqueeze2" + + axes_tensor_list = [] + for index, ele in enumerate(self.axes): + axes_tensor_list.append(("axes" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = { + "X": np.random.random(self.ori_shape).astype("float32"), + "AxesTensorList": axes_tensor_list + } + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.ori_shape).astype("float32") + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place, no_check_set=['XShape']) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (1, 2) + self.new_shape = (20, 1, 1, 5) + + def init_attrs(self): + self.attrs = {} + + +class TestUnsqueezeOp1_AxesTensorList(TestUnsqueezeOp_AxesTensorList): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (-1, ) + self.new_shape = (20, 5, 1) + + +class TestUnsqueezeOp2_AxesTensorList(TestUnsqueezeOp_AxesTensorList): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (0, -1) + self.new_shape = (1, 20, 5, 1) + + +class TestUnsqueezeOp3_AxesTensorList(TestUnsqueezeOp_AxesTensorList): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (0, 3, 3) + self.new_shape = (1, 10, 2, 1, 1, 5) + + +class TestUnsqueezeOp4_AxesTensorList(TestUnsqueezeOp_AxesTensorList): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (3, 1, 1) + self.new_shape = (10, 1, 1, 2, 5, 1) + + +# axes is a Tensor +class TestUnsqueezeOp_AxesTensor(OpTest): + def setUp(self): + self.init_test_case() + self.set_mlu() + self.op_type = "unsqueeze2" + + self.inputs = { + "X": np.random.random(self.ori_shape).astype("float32"), + "AxesTensor": np.array(self.axes).astype("int32") + } + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.ori_shape).astype("float32") + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place, no_check_set=['XShape']) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (1, 2) + self.new_shape = (20, 1, 1, 5) + + def init_attrs(self): + self.attrs = {} + + +class TestUnsqueezeOp1_AxesTensor(TestUnsqueezeOp_AxesTensor): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (-1, ) + self.new_shape = (20, 5, 1) + + +class TestUnsqueezeOp2_AxesTensor(TestUnsqueezeOp_AxesTensor): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (0, -1) + self.new_shape = (1, 20, 5, 1) + + +class TestUnsqueezeOp3_AxesTensor(TestUnsqueezeOp_AxesTensor): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (0, 3, 3) + self.new_shape = (1, 10, 2, 1, 1, 5) + + +class TestUnsqueezeOp4_AxesTensor(TestUnsqueezeOp_AxesTensor): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (3, 1, 1) + 
self.new_shape = (10, 1, 1, 2, 5, 1) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_unsqueeze_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_unsqueeze_op_mlu.py new file mode 100644 index 0000000000000..d75a2f4d21a28 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_unsqueeze_op_mlu.py @@ -0,0 +1,90 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import sys +sys.path.append("..") + +import numpy as np + +import paddle +from op_test import OpTest + +paddle.enable_static() + + +# Correct: General. +class TestUnsqueezeOp(OpTest): + def setUp(self): + self.init_test_case() + self.set_mlu() + self.op_type = "unsqueeze" + self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")} + self.init_attrs() + self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def init_test_case(self): + self.ori_shape = (3, 40) + self.axes = (1, 2) + self.new_shape = (3, 1, 1, 40) + + def init_attrs(self): + self.attrs = {"axes": self.axes} + + +# Correct: Single input index. +class TestUnsqueezeOp1(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (-1, ) + self.new_shape = (20, 5, 1) + + +# Correct: Mixed input axis. +class TestUnsqueezeOp2(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (0, -1) + self.new_shape = (1, 20, 5, 1) + + +# Correct: There is duplicated axis. +class TestUnsqueezeOp3(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (0, 3, 3) + self.new_shape = (1, 10, 2, 1, 1, 5) + + +# Correct: Reversed axes. 
+class TestUnsqueezeOp4(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (3, 1, 1) + self.new_shape = (10, 1, 1, 2, 5, 1) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/process_group_gloo.py b/python/paddle/fluid/tests/unittests/process_group_gloo.py index 03886ab8a147f..9be8a35f1ae1b 100644 --- a/python/paddle/fluid/tests/unittests/process_group_gloo.py +++ b/python/paddle/fluid/tests/unittests/process_group_gloo.py @@ -47,7 +47,8 @@ def test_create_process_group_gloo(self): is_master = True if rank == 0 else False store = paddle.fluid.core.TCPStore("127.0.0.1", 6272, is_master, nranks, datetime.timedelta(0)) - pg = paddle.fluid.core.ProcessGroupGloo(store, rank, nranks) + place = paddle.fluid.core.CPUPlace() + pg = paddle.fluid.core.ProcessGroupGloo(store, rank, nranks, place) # test allreduce sum # rank 0 diff --git a/python/paddle/fluid/tests/unittests/process_group_nccl.py b/python/paddle/fluid/tests/unittests/process_group_nccl.py index 7ae38b3bbc4d2..7aa83ad907914 100644 --- a/python/paddle/fluid/tests/unittests/process_group_nccl.py +++ b/python/paddle/fluid/tests/unittests/process_group_nccl.py @@ -185,6 +185,24 @@ def test_create_process_group_nccl(self): assert np.array_equal(tensor_y, out_2) print("test allgather api ok\n") + if pg.rank() == 0: + task = pg.all_gather(tensor_x, tensor_out) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + tensor_out_list = [] + task = dist.all_gather( + tensor_out_list, tensor_y, use_calc_stream=False) + paddle.device.cuda.synchronize() + tensor_out = paddle.concat(tensor_out_list) + out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2]) + out_2 = paddle.slice(tensor_out, [0], [out_shape[0] // 2], + [out_shape[0]]) + assert np.array_equal(tensor_x, out_1) + assert np.array_equal(tensor_y, out_2) + print("test allgather api2 ok\n") + # test alltoall # rank 0 x = np.random.random(self.shape).astype(self.dtype) @@ -219,6 +237,38 @@ def test_create_process_group_nccl(self): assert np.array_equal(out2_1, raw_tensor_x_2) print("test alltoall api ok\n") + x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + out1 = np.random.random(self.shape).astype(self.dtype) + out2 = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + tensor_out1 = paddle.to_tensor(out1) + tensor_out2 = paddle.to_tensor(out2) + raw_tensor_x_2 = paddle.slice(tensor_x, [0], [self.shape[0] // 2], + [self.shape[0]]) + raw_tensor_y_1 = paddle.slice(tensor_y, [0], [0], + [self.shape[0] // 2]) + if pg.rank() == 0: + task = pg.alltoall(tensor_x, tensor_out1) + task.wait() + # rank 1 + else: + in_1, in_2 = paddle.split(tensor_y, 2) + out_1, out_2 = paddle.split(tensor_out2, 2) + out_tensor_list = [] + task = dist.alltoall([in_1, in_2], out_tensor_list) + paddle.device.cuda.synchronize() + tensor_out2 = paddle.concat(out_tensor_list) + out1_2 = paddle.slice(tensor_out1, [0], [self.shape[0] // 2], + [self.shape[0]]) + out2_1 = paddle.slice(tensor_out2, [0], [0], [self.shape[0] // 2]) + if pg.rank() == 0: + assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy()) + else: + assert np.array_equal(out2_1, raw_tensor_x_2) + print("test alltoall api2 ok\n") + # test Reduce # rank 0 x = np.random.random(self.shape).astype(self.dtype) diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py 
b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py index c11fb3d1e28aa..cbcb4af926951 100644 --- a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py +++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py @@ -68,6 +68,26 @@ def initTestCase(self): self.axis = 0 +@unittest.skipIf(not paddle.is_compiled_with_cuda(), + "FP16 test runs only on GPU") +class TestCase0FP16(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4, 5) + self.dtype = np.float16 + self.axis = 0 + + +@unittest.skipIf(not paddle.is_compiled_with_cuda(), + "FP16 test runs only on GPU") +class TestCase1FP16(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4) + self.dtype = np.float16 + self.axis = 1 + + class TestCase2_1(BaseTestCase): def initTestCase(self): self.op_type = 'arg_max' @@ -202,4 +222,5 @@ def setUp(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py index f4423ccd0294c..37b1cfd02faf7 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_op.py @@ -221,6 +221,8 @@ def test_clip_dygraph(self): paddle.cast(images * 10, 'int32'), min=2, max=8) out_5 = self._executed_api( paddle.cast(images * 10, 'int64'), min=2, max=8) + # test with numpy.generic + out_6 = self._executed_api(images, min=np.abs(0.2), max=np.abs(0.8)) self.assertTrue(np.allclose(out_1.numpy(), data.clip(0.2, 0.8))) self.assertTrue(np.allclose(out_2.numpy(), data.clip(0.2, 0.9))) @@ -229,6 +231,7 @@ def test_clip_dygraph(self): np.allclose(out_4.numpy(), (data * 10).astype(np.int32).clip(2, 8))) self.assertTrue( np.allclose(out_5.numpy(), (data * 10).astype(np.int64).clip(2, 8))) + self.assertTrue(np.allclose(out_6.numpy(), data.clip(0.2, 0.8))) def test_eager(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 7e78b223b3f6a..600a49b2332be 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -114,7 +114,7 @@ def constructor(self, place): egr_tensor = core.eager.Tensor() self.assertEqual(egr_tensor.persistable, False) self.assertTrue("generated" in egr_tensor.name) - self.assertEqual(egr_tensor.shape, []) + self.assertEqual(egr_tensor.shape, [0]) self.assertEqual(egr_tensor.dtype, core.VarDesc.VarType.FP32) self.assertEqual(egr_tensor.stop_gradient, True) diff --git a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py index fd46b41c5f07e..4932ea8a1b5c9 100644 --- a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py @@ -20,6 +20,7 @@ import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard import paddle +from paddle.fluid.framework import _test_eager_guard # Situation 1: shape is a list(without tensor) @@ -231,6 +232,42 @@ def test_api(self): assert np.array_equal(res_3, np.tile(input, (1, 1))) +class TestExpandInferShape(unittest.TestCase): + def test_shape_with_var(self): + with program_guard(Program(), Program()): + x = paddle.static.data(shape=[-1, 1, 3], name='x') + fake_var = paddle.randn([2, 3]) + target_shape = [ + -1, paddle.shape(fake_var)[0], paddle.shape(fake_var)[1] + ] + out = 
paddle.expand(x, shape=target_shape) + self.assertListEqual(list(out.shape), [-1, -1, -1]) + + +# Test python Dygraph API +class TestExpandV2DygraphAPI(unittest.TestCase): + def test_expand_times_is_tensor(self): + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + paddle.seed(1) + a = paddle.rand([2, 5]) + egr_expand_1 = paddle.expand(a, shape=[2, 5]) + np_array = np.array([2, 5]) + egr_expand_2 = paddle.expand(a, shape=np_array) + + paddle.seed(1) + a = paddle.rand([2, 5]) + expand_1 = paddle.expand(a, shape=[2, 5]) + np_array = np.array([2, 5]) + expand_2 = paddle.expand(a, shape=np_array) + + self.assertTrue( + np.array_equal(egr_expand_1.numpy(), egr_expand_2.numpy())) + self.assertTrue(np.array_equal(expand_1.numpy(), expand_2.numpy())) + self.assertTrue( + np.array_equal(expand_1.numpy(), egr_expand_1.numpy())) + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py index 728e178845c9b..07f3eaa04ad2e 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py @@ -18,6 +18,7 @@ import numpy as np import math from op_test import OpTest +import paddle.fluid.core as core def quantize_max_abs(x, max_range): @@ -76,22 +77,25 @@ def channel_wise_dequantize_max_abs(x, class TestFakeChannelWiseDequantizeMaxAbsOpTwoScales(OpTest): def set_args(self): self.quant_bits = [8, 8] - self.data_type = "float32" self.activation_scale = 0.7861 + def set_dtype(self): + self.dtype = np.float32 + def setUp(self): self.set_args() + self.set_dtype() self.op_type = "fake_channel_wise_dequantize_max_abs" - x = np.random.randn(4, 3, 64, 64).astype(self.data_type) + x = np.random.randn(4, 3, 64, 64).astype(self.dtype) yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0], 1) ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits, 1, self.activation_scale) self.inputs = { 'X': yq, - 'Scales': [("scales0", np.array(scales).astype(self.data_type)), - ("scales1", np.array( - [self.activation_scale]).astype(self.data_type))] + 'Scales': [("scales0", np.array(scales).astype(self.dtype)), + ("scales1", + np.array([self.activation_scale]).astype(self.dtype))] } self.attrs = {'quant_bits': self.quant_bits} self.outputs = {'Out': ydq} @@ -100,16 +104,28 @@ def test_check_output(self): self.check_output() +class TestFakeChannelWiseDequantizeMaxAbsOpTwoScalesFloat16( + TestFakeChannelWiseDequantizeMaxAbsOpTwoScales): + def set_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-2) + + class TestFakeChannelWiseDequantizeMaxAbsOpOneScale(OpTest): def set_args(self): self.quant_bits = [8] - self.data_type = "float32" self.quant_axis = 0 + def set_dtype(self): + self.dtype = np.float32 + def setUp(self): self.set_args() + self.set_dtype() self.op_type = "fake_channel_wise_dequantize_max_abs" - x = np.random.randn(4, 3, 64, 64).astype(self.data_type) + x = np.random.randn(4, 3, 64, 64).astype(self.dtype) yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0], self.quant_axis) ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits, @@ -117,7 +133,7 @@ def setUp(self): self.inputs = { 'X': yq, - 'Scales': [("scales0", np.array(scales).astype(self.data_type))] + 'Scales': [("scales0", np.array(scales).astype(self.dtype))] } self.attrs = { 'quant_bits': self.quant_bits, @@ -133,24 +149,44 @@ 
class TestFakeChannelWiseDequantizeMaxAbsOpOneScale1( TestFakeChannelWiseDequantizeMaxAbsOpOneScale): def set_args(self): self.quant_bits = [8] - self.data_type = "float32" self.quant_axis = 1 +class TestFakeChannelWiseDequantizeMaxAbsOpOneScaleFloat16( + TestFakeChannelWiseDequantizeMaxAbsOpOneScale): + def set_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-2) + + +class TestFakeChannelWiseDequantizeMaxAbsOpOneScale1Float16( + TestFakeChannelWiseDequantizeMaxAbsOpOneScale1): + def set_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-2) + + class TestFakeDequantizeMaxAbsOp(OpTest): def set_args(self): self.num_bits = 8 self.max_range = math.pow(2, self.num_bits - 1) - 1 - self.data_type = "float32" + + def set_dtype(self): + self.dtype = np.float32 def setUp(self): self.set_args() + self.set_dtype() self.op_type = "fake_dequantize_max_abs" - x = np.random.randn(31, 65).astype(self.data_type) + x = np.random.randn(31, 65).astype(self.dtype) yq, scale = quantize_max_abs(x, self.max_range) ydq = dequantize_max_abs(yq, scale, self.max_range) - self.inputs = {'X': yq, 'Scale': np.array(scale).astype(self.data_type)} + self.inputs = {'X': yq, 'Scale': np.array(scale).astype(self.dtype)} self.attrs = {'max_range': self.max_range} self.outputs = {'Out': ydq} @@ -159,17 +195,22 @@ def test_check_output(self): class TestFakeDequantizeMaxAbsOpDouble(TestFakeDequantizeMaxAbsOp): - def set_args(self): - self.num_bits = 8 - self.max_range = math.pow(2, self.num_bits - 1) - 1 - self.data_type = "float64" + def set_dtype(self): + self.dtype = np.float64 class TestFakeDequantizeMaxAbsOp5Bits(TestFakeDequantizeMaxAbsOp): def set_args(self): self.num_bits = 5 self.max_range = math.pow(2, self.num_bits - 1) - 1 - self.data_type = "float32" + + +class TestFakeDequantizeMaxAbsOpFloat16(TestFakeDequantizeMaxAbsOp): + def set_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-2) class TestChannelWiseDequantizeOp(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py index 2be61d1218560..230bc15e0f1ab 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py @@ -15,28 +15,51 @@ from __future__ import print_function import unittest +import math import numpy as np import math from op_test import OpTest import paddle.fluid.core as core +# numpy.round has different behavior in comparision to c++ round function +# so we use round_c instead of numpy.round to align the output data +def round_c_single_element(x): + dtype = type(x) + if x >= 0: + return dtype(np.floor(x + 0.5)) + else: + return dtype(np.ceil(x - 0.5)) + + +round_c = np.vectorize(round_c_single_element) + + class TestFakeQuantizeOp(OpTest): def setUp(self): + self.set_dtype() self.op_type = "fake_quantize_abs_max" self.attrs = {'bit_length': 8} - self.inputs = {'X': np.random.random((124, 240)).astype("float32"), } - scale = np.max(np.abs(self.inputs['X'])).astype("float32") + self.inputs = {'X': np.random.random((124, 240)).astype(self.dtype), } + scale = np.max(np.abs(self.inputs['X'])).astype(self.dtype) self.outputs = { - 'Out': np.round(self.inputs['X'] / scale * ( + 'Out': round_c(self.inputs['X'] / scale * ( (1 << (self.attrs['bit_length'] - 1)) - 1)), - 'OutScale': np.array(scale).astype("float32"), + 'OutScale': 
np.array(scale).astype(self.dtype), } + def set_dtype(self): + self.dtype = np.float32 + def test_check_output(self): self.check_output() +class TestFakeQuantizeOpFloat16(TestFakeQuantizeOp): + def set_dtype(self): + self.dtype = np.float16 + + class TestFakeQuantizeOp1(OpTest): def setUp(self): self.op_type = "fake_quantize_abs_max" @@ -73,6 +96,7 @@ def test_check_output(self): class TestFakeChannelWiseQuantizeOp(OpTest): def setUp(self): + self.set_dtype() self.set_arg() assert self.quant_axis in [0, 1], "quant_axis should be 0 or 1." @@ -84,53 +108,70 @@ def setUp(self): bnt = (1 << (self.attrs['bit_length'] - 1)) - 1 if self.quant_axis == 0: for i in range(self.inputs['X'].shape[0]): - scale_v = np.max(np.abs(self.inputs['X'][i])).astype("float32") + scale_v = np.max(np.abs(self.inputs['X'][i])).astype(self.dtype) scales.append(scale_v) - outputs[i] = np.round(outputs[i] / scale_v * bnt) + outputs[i] = round_c( + self.dtype(bnt) * (self.dtype(1.0) / scale_v) * outputs[i]) elif self.quant_axis == 1: for i in range(self.inputs['X'].shape[1]): scale_v = np.max(np.abs(self.inputs['X'][:, i])).astype( - "float32") + self.dtype) scales.append(scale_v) - outputs[:, i] = np.round(outputs[:, i] / scale_v * bnt) + outputs[:, i] = round_c( + self.dtype(bnt) * (self.dtype(1.0) / scale_v) * + outputs[:, i]) self.outputs = { 'Out': outputs, - 'OutScale': np.array(scales).astype("float32"), + 'OutScale': np.array(scales).astype(self.dtype), } def set_arg(self): self.quant_axis = 0 self.inputs = { - 'X': np.random.random((20, 15, 6, 6)).astype("float32"), + 'X': np.random.random((20, 15, 6, 6)).astype(self.dtype), } + def set_dtype(self): + self.dtype = np.float32 + def test_check_output(self): self.check_output() +class TestFakeChannelWiseQuantizeOpFloat16(TestFakeChannelWiseQuantizeOp): + def set_dtype(self): + self.dtype = np.float16 + + class TestFakeChannelWiseQuantizeOp1(TestFakeChannelWiseQuantizeOp): def set_quant_axis(self): self.quant_axis = 1 self.inputs = { - 'X': np.random.random((15, 20, 5, 5)).astype("float32"), + 'X': np.random.random((15, 20, 5, 5)).astype(self.dtype), } +class TestFakeChannelWiseQuantizeOp1Float16(TestFakeChannelWiseQuantizeOp1): + def set_dtype(self): + self.dtype = np.float16 + + class TestFakeChannelWiseQuantizeOp2(TestFakeChannelWiseQuantizeOp): def set_quant_axis(self): self.quant_axis = 0 - self.inputs = {'X': np.random.random((30, 15)).astype("float32"), } + self.inputs = {'X': np.random.random((30, 15)).astype(self.dtype), } class TestFakeChannelWiseQuantizeOp3(TestFakeChannelWiseQuantizeOp): def set_quant_axis(self): self.quant_axis = 1 - self.inputs = {'X': np.random.random((30, 15)).astype("float32"), } + self.inputs = {'X': np.random.random((30, 15)).astype(self.dtype), } class TestFakeQuantizeRangeAbsMaxOp(OpTest): def setUp(self): + self.set_dtype() self.op_type = "fake_quantize_range_abs_max" self.attrs = { 'bit_length': int(5), @@ -138,27 +179,36 @@ def setUp(self): 'is_test': False } x = (np.random.random((8, 16, 7, 7)) - 0.5) * 10 - x = x.astype("float32") + x = x.astype(self.dtype) self.inputs = { 'X': x, 'Iter': np.zeros(1).astype("int64"), - 'InScale': np.zeros(1).astype("float32") + 'InScale': np.zeros(1).astype(self.dtype) } - scale = np.max(np.abs(self.inputs['X'])).astype("float32") + scale = np.max(np.abs(self.inputs['X'])).astype(self.dtype) - out_scales = np.zeros(self.attrs['window_size']).astype("float32") + out_scales = np.zeros(self.attrs['window_size']).astype(self.dtype) out_scales[0] = scale self.outputs = { - 'Out': 
np.round(self.inputs['X'] / scale * ( - (1 << (self.attrs['bit_length'] - 1)) - 1)), + 'Out': round_c( + self.dtype((1 << (self.attrs['bit_length'] - 1)) - 1) * + (self.dtype(1.0) / scale) * self.inputs['X']), 'OutScale': scale, 'OutScales': out_scales, } + def set_dtype(self): + self.dtype = np.float32 + def test_check_output(self): self.check_output() +class TestFakeQuantizeRangeAbsMaxOpFloat16(TestFakeQuantizeRangeAbsMaxOp): + def set_dtype(self): + self.dtype = np.float16 + + class TestMovingAverageAbsMaxScaleOp(OpTest): def setUp(self): self.op_type = "moving_average_abs_max_scale" @@ -195,6 +245,7 @@ def test_check_output(self): class TestFakeQuantizeRangeAbsMaxOp2(OpTest): def setUp(self): + self.set_dtype() self.op_type = "fake_quantize_range_abs_max" self.attrs = { 'bit_length': int(8), @@ -202,55 +253,68 @@ def setUp(self): 'is_test': True } x = (np.random.random((8, 16, 7, 7)) - 0.5) * 10 - x = x.astype("float32") - scale = np.array([np.max(np.abs(x)).astype("float32") - 1.0]) - out_scales = np.zeros(self.attrs['window_size']).astype("float32") - out_scales[0] = scale + x = x.astype(self.dtype) + scale = np.array([np.max(np.abs(x)).astype(self.dtype) - 1.0]) + out_scales = np.zeros(self.attrs['window_size']).astype(self.dtype) + out_scales[0] = scale.astype(self.dtype) self.inputs = { 'X': x, 'Iter': np.zeros(1).astype("int64"), - 'InScale': scale.astype("float32") + 'InScale': scale.astype(self.dtype) } - xs = np.clip(x, -scale, scale) - qs = np.round(xs / scale * ((1 << (self.attrs['bit_length'] - 1)) - 1)) + xs = np.clip(x, -scale, scale).astype(self.dtype) + qs = round_c( + self.dtype( + self.dtype((1 << (self.attrs['bit_length'] - 1)) - 1) * ( + self.dtype(1.0) / scale) * xs)) self.outputs = { 'Out': qs, - 'OutScale': scale.astype("float32"), + 'OutScale': scale.astype(self.dtype), 'OutScales': out_scales, } + def set_dtype(self): + self.dtype = np.float32 + def test_check_output(self): self.check_output(no_check_set=set(['OutScale', 'OutScales'])) +class TestFakeQuantizeRangeAbsMaxOp2Float16(TestFakeQuantizeRangeAbsMaxOp2): + def set_dtype(self): + self.dtype = np.float16 + + class TestMovingOpBase(OpTest): def setUp(self): + self.set_dtype() self.init_type() self.attrs = { 'bit_length': int(5), 'moving_rate': float(0.9), 'is_test': False } - accum = np.zeros(1).astype("float32") + accum = np.zeros(1).astype(self.dtype) accum[0] = 1 - state = np.zeros(1).astype("float32") - state[0] = 1 - scale = np.zeros(1).astype("float32") + state = np.zeros(1).astype(self.dtype) + state[0] = self.dtype(1.0) + scale = np.zeros(1).astype(self.dtype) scale[0] = 0.001 self.inputs = { - 'X': np.random.random((8, 16, 7, 7)).astype("float32"), + 'X': np.random.random((8, 16, 7, 7)).astype(self.dtype), 'InScale': scale, 'InAccum': accum, 'InState': state, } - out_accum = np.zeros(1).astype("float32") - out_state = np.zeros(1).astype("float32") - out_scale = np.zeros(1).astype("float32") - out_accum[0] = self.attrs['moving_rate'] * accum[0] + np.max( - np.abs(self.inputs['X'])).astype("float32") - out_state[0] = self.attrs['moving_rate'] * state[0] + 1 - out_scale = out_accum / out_state + out_accum = np.zeros(1).astype(self.dtype) + out_state = np.zeros(1).astype(self.dtype) + out_scale = np.zeros(1).astype(self.dtype) + out_accum[0] = self.dtype(self.attrs['moving_rate']) * self.dtype(accum[ + 0]) + np.max(np.abs(self.inputs['X'])).astype(self.dtype) + out_state[0] = self.dtype(self.attrs['moving_rate']) * self.dtype(state[ + 0]) + self.dtype(1.0) + out_scale = 
self.dtype(self.dtype(out_accum) / self.dtype(out_state)) out_data = self.calc_output(out_scale) self.outputs = { 'Out': out_data, @@ -259,17 +323,28 @@ def setUp(self): 'OutScale': out_scale, } + def set_dtype(self): + self.dtype = np.float32 + def init_type(self): self.op_type = "fake_quantize_moving_average_abs_max" def calc_output(self, out_scale): - return np.round(self.inputs['X'] / out_scale * ( + return round_c(self.inputs['X'] / out_scale * ( (1 << (self.attrs['bit_length'] - 1)) - 1)) def test_check_output(self): self.check_output() +class TestMovingOpBaseFloat16(TestMovingOpBase): + def set_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-2) + + class TestFakeQuantDequantMovingOp(TestMovingOpBase): def init_type(self): self.op_type = "fake_quantize_dequantize_moving_average_abs_max" diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py index 31f77f17b19bf..ffc3f2b21a476 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py @@ -310,9 +310,22 @@ def test_fleet_desc_configs(self): "embed_sparse_optimizer": "std_adagrad" } strategy.fleet_desc_configs = configs + self.assertEqual(strategy.sparse_table_configs[0] + .accessor.ctr_accessor_param.show_scale, False) self.assertEqual(strategy.sparse_table_configs[0] .accessor.embed_sgd_param.adagrad.initial_range, 0) + strategy = paddle.distributed.fleet.DistributedStrategy() + configs = {} + configs['emb'] = { + "sparse_accessor_class": "DownpourCtrDoubleAccessor", + "embed_sparse_optimizer": "std_adagrad" + } + strategy.fleet_desc_configs = configs + self.assertEqual(strategy.sparse_table_configs[0] + .accessor.embed_sgd_param.adagrad.initial_range, + 0.0001) + def test_trainer_desc_configs(self): strategy = paddle.distributed.fleet.DistributedStrategy() configs = { diff --git a/python/paddle/fluid/tests/unittests/test_full_like_op.py b/python/paddle/fluid/tests/unittests/test_full_like_op.py index 05a310a9c5033..d3fea677a47d9 100644 --- a/python/paddle/fluid/tests/unittests/test_full_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_full_like_op.py @@ -22,6 +22,7 @@ import numpy as np from op_test import OpTest from paddle.fluid.framework import convert_np_dtype_to_dtype_ +from paddle.fluid.framework import _test_eager_guard class TestFullOp(unittest.TestCase): @@ -133,5 +134,19 @@ def init_data(self): self.dtype = np.int64 +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFullLikeOp4(unittest.TestCase): + def test_skip_data_transform(self): + paddle.disable_static() + with _test_eager_guard(): + x = paddle.to_tensor( + [1., 2., 3., 4.], place=paddle.CUDAPinnedPlace()) + out = paddle.full_like(x, 1.) 
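+            # full_like on a CUDAPinnedPlace tensor should skip the data
+            # transform and still return a float32 tensor of ones of shape [4].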
+ self.assertTrue( + (out.numpy() == np.ones([4]).astype(np.float32)).all(), True) + paddle.enable_static() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_full_op.py b/python/paddle/fluid/tests/unittests/test_full_op.py index 2442f2b681554..108469cf8a732 100644 --- a/python/paddle/fluid/tests/unittests/test_full_op.py +++ b/python/paddle/fluid/tests/unittests/test_full_op.py @@ -108,6 +108,9 @@ def test_api_eager(self): shape=[1], dtype=np.float32, value=1.1) out_7 = paddle.full( shape=[1, 2], dtype=np.float32, fill_value=val) + # test for numpy.float64 as fill_value + out_8 = paddle.full_like( + out_7, dtype=np.float32, fill_value=np.abs(1.1)) assert np.array_equal( out_1, np.full( @@ -130,6 +133,9 @@ def test_api_eager(self): assert np.array_equal( out_7, np.full( [1, 2], 1.1, dtype="float32")) + assert np.array_equal( + out_8, np.full( + [1, 2], 1.1, dtype="float32")) class TestFullOpError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_graph_sample_neighbors.py b/python/paddle/fluid/tests/unittests/test_graph_sample_neighbors.py index d2fbeab3fd42c..675a3429ab55f 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_sample_neighbors.py +++ b/python/paddle/fluid/tests/unittests/test_graph_sample_neighbors.py @@ -162,14 +162,14 @@ def check_perm_buffer_error(): self.assertRaises(ValueError, check_perm_buffer_error) def test_sample_result_with_eids(self): - # Note: Currently return eid results is not initialized. paddle.disable_static() row = paddle.to_tensor(self.row) colptr = paddle.to_tensor(self.colptr) nodes = paddle.to_tensor(self.nodes) eids = paddle.to_tensor(self.edges_id) + perm_buffer = paddle.to_tensor(self.edges_id) - out_neighbors, out_count, _ = paddle.incubate.graph_sample_neighbors( + out_neighbors, out_count, out_eids = paddle.incubate.graph_sample_neighbors( row, colptr, nodes, @@ -177,6 +177,16 @@ def test_sample_result_with_eids(self): sample_size=self.sample_size, return_eids=True) + out_neighbors, out_count, out_eids = paddle.incubate.graph_sample_neighbors( + row, + colptr, + nodes, + eids=eids, + perm_buffer=perm_buffer, + sample_size=self.sample_size, + return_eids=True, + flag_perm_buffer=True) + paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): row = paddle.static.data( @@ -188,7 +198,7 @@ def test_sample_result_with_eids(self): eids = paddle.static.data( name="eids", shape=self.edges_id.shape, dtype=self.nodes.dtype) - out_neighbors, out_count, _ = paddle.incubate.graph_sample_neighbors( + out_neighbors, out_count, out_eids = paddle.incubate.graph_sample_neighbors( row, colptr, nodes, @@ -202,7 +212,7 @@ def test_sample_result_with_eids(self): 'nodes': self.nodes, 'eids': self.edges_id }, - fetch_list=[out_neighbors, out_count]) + fetch_list=[out_neighbors, out_count, out_eids]) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 6c5864cfebc93..676f35838ad33 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -3661,9 +3661,7 @@ def make_scale_variable(self): shape=[1], dtype='float32', append_batch_size=False) - _scale = scale_var.numpy().item(0) if isinstance( - scale_var, core.eager.Tensor) else scale_var - out = layers.scale(input, scale=_scale) + out = layers.scale(input, scale=scale_var) return out def make_softshrink(self): diff --git 
a/python/paddle/fluid/tests/unittests/test_layout_autotune.py b/python/paddle/fluid/tests/unittests/test_layout_autotune.py new file mode 100644 index 0000000000000..c71ff4381028d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_layout_autotune.py @@ -0,0 +1,123 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import unittest +import numpy +import paddle.nn.functional as F + + +class SimpleNet(paddle.nn.Layer): + def __init__(self, data_format="NCHW", class_num=2): + super(SimpleNet, self).__init__() + self.conv = paddle.nn.Conv2D(3, 8, (3, 3)) + self.bn = paddle.nn.BatchNorm(num_channels=8) + self.relu = paddle.nn.ReLU() + self.pool = paddle.nn.AvgPool2D(kernel_size=2, stride=2) + self.flatten = paddle.nn.Flatten() + self.fc = paddle.nn.Linear(392, class_num) + + def forward(self, image): + conv_out = self.conv(image) + bn_out = self.bn(conv_out) + out = self.relu(bn_out) + out = self.pool(out) + out = self.flatten(out) + out = self.fc(out) + return conv_out, out + + +class LayoutAutoTune(unittest.TestCase): + def use_autoune(self): + if paddle.is_compiled_with_cuda(): + paddle.fluid.core.enable_layout_autotune() + return paddle.fluid.core.use_layout_autotune() + else: + paddle.fluid.core.disable_layout_autotune() + return paddle.fluid.core.use_layout_autotune() + + def train(self, data_format): + model = SimpleNet(data_format="NCHW", class_num=2) + data = paddle.rand([1, 3, 16, 16]) + if (data_format == "NHWC"): + data = paddle.rand([1, 16, 16, 3]) + label_data = paddle.randint(0, 1, shape=[1, 1], dtype="int64") + optimizer = paddle.optimizer.SGD(learning_rate=0.0001, + parameters=model.parameters()) + scaler = paddle.amp.GradScaler() + for i in range(2): + with paddle.amp.auto_cast(level="O2"): + conv_out, predict = model(data) + loss = F.cross_entropy(predict, label=label_data) + loss = loss.mean() + + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) + return conv_out, predict + + def test_enable_autotune(self): + if self.use_autoune(): + conv_out, predict = self.train(data_format="NCHW") + self.assertEqual(conv_out.shape, [1, 14, 14, 8]) + self.assertEqual(predict.shape, [1, 2]) + else: + conv_out, predict = self.train(data_format="NCHW") + self.assertEqual(conv_out.shape, [1, 8, 14, 14]) + self.assertEqual(predict.shape, [1, 2]) + + def test_transpose_op_transposer(self): + if not self.use_autoune(): + return + conv = paddle.nn.Conv2D(3, 8, (3, 3)) + data = paddle.rand([1, 3, 16, 14]) + label_data = paddle.randint(0, 1, shape=[1, 1], dtype="int64") + optimizer = paddle.optimizer.SGD(learning_rate=0.0001, + parameters=conv.parameters()) + scaler = paddle.amp.GradScaler() + with paddle.amp.auto_cast(level="O2"): + conv_out = conv(data) + # conv_out.shape = [1, 14, 12, 8] with NHWC + # layout tuner will transpose conv_out to + # [1, 8, 14, 12] with NCHW before the following transpose op. 
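+            # With that extra NHWC->NCHW transpose in place, perm=[0, 3, 1, 2]
+            # is applied to NCHW data, so `out` has shape [1, 12, 8, 14]
+            # while `conv_out` keeps its NHWC shape [1, 14, 12, 8].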
+ out = paddle.transpose(conv_out, perm=[0, 3, 1, 2]) + loss = out.mean() + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) + + self.assertEqual(conv_out.shape, [1, 14, 12, 8]) + self.assertEqual(out.shape, [1, 12, 8, 14]) + + def test_flatten_op_transposer(self): + if not self.use_autoune(): + return + paddle.fluid.core.enable_layout_autotune() + conv = paddle.nn.Conv2D(3, 8, (3, 3)) + flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2) + data = paddle.rand([1, 3, 16, 14]) + with paddle.amp.auto_cast(level="O2"): + conv_out = conv(data) + # conv_out.shape = [1, 14, 12, 8] with NHWC + # layout tuner will transpose conv_out to + # [1, 8, 14, 12] with NCHW before the following flatten op + # because it flatten the C and H dimensions. + out = flatten(conv_out) + + self.assertEqual(conv_out.shape, [1, 14, 12, 8]) + self.assertEqual(out.shape, [1, 112, 12]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_logspace.py b/python/paddle/fluid/tests/unittests/test_logspace.py new file mode 100644 index 0000000000000..ffa9885e7671e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_logspace.py @@ -0,0 +1,231 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
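+
+# The tests below check that logspace returns `Num` values of the form
+# Base ** x, with x evenly spaced between Start and Stop (matching
+# numpy.logspace), including reversed ranges, Num == 1, and negative or
+# zero bases, plus dtype/name handling and the expected TypeError /
+# ValueError cases.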
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle + + +class TestLogspaceOpCommonCase(OpTest): + def setUp(self): + self.op_type = "logspace" + dtype = 'float32' + self.inputs = { + 'Start': np.array([0]).astype(dtype), + 'Stop': np.array([10]).astype(dtype), + 'Num': np.array([11]).astype('int32'), + 'Base': np.array([2]).astype(dtype), + } + self.attrs = {'dtype': int(paddle.float32)} + + self.outputs = {'Out': np.power(2, np.arange(0, 11)).astype(dtype)} + + def test_check_output(self): + self.check_output() + + +class TestLogspaceOpReverseCase(OpTest): + def setUp(self): + self.op_type = "logspace" + dtype = 'float32' + self.inputs = { + 'Start': np.array([10]).astype(dtype), + 'Stop': np.array([0]).astype(dtype), + 'Num': np.array([11]).astype('int32'), + 'Base': np.array([2]).astype(dtype) + } + self.attrs = {'dtype': int(paddle.float32)} + + self.outputs = {'Out': np.power(2, np.arange(10, -1, -1)).astype(dtype)} + + def test_check_output(self): + self.check_output() + + +class TestLogspaceOpNumOneCase(OpTest): + def setUp(self): + self.op_type = "logspace" + dtype = 'float32' + self.inputs = { + 'Start': np.array([10]).astype(dtype), + 'Stop': np.array([0]).astype(dtype), + 'Num': np.array([1]).astype('int32'), + 'Base': np.array([2]).astype(dtype) + } + self.attrs = {'dtype': int(paddle.float32)} + + self.outputs = {'Out': np.power(2, np.array(10)).astype(dtype)} + + def test_check_output(self): + self.check_output() + + +class TestLogspaceOpMinusBaseCase(OpTest): + def setUp(self): + self.op_type = "logspace" + dtype = 'float32' + self.inputs = { + 'Start': np.array([0]).astype(dtype), + 'Stop': np.array([10]).astype(dtype), + 'Num': np.array([11]).astype('int32'), + 'Base': np.array([-2]).astype(dtype), + } + self.attrs = {'dtype': int(paddle.float32)} + + self.outputs = {'Out': np.power(-2, np.arange(0, 11)).astype(dtype)} + + def test_check_output(self): + self.check_output() + + +class TestLogspaceOpZeroBaseCase(OpTest): + def setUp(self): + self.op_type = "logspace" + dtype = 'float32' + self.inputs = { + 'Start': np.array([0]).astype(dtype), + 'Stop': np.array([10]).astype(dtype), + 'Num': np.array([11]).astype('int32'), + 'Base': np.array([0]).astype(dtype), + } + self.attrs = {'dtype': int(paddle.float32)} + + self.outputs = {'Out': np.power(0, np.arange(0, 11)).astype(dtype)} + + def test_check_output(self): + self.check_output() + + +class TestLogspaceAPI(unittest.TestCase): + def test_variable_input1(self): + paddle.enable_static() + prog = paddle.static.Program() + with paddle.static.program_guard(prog): + start = paddle.full(shape=[1], fill_value=0, dtype='float32') + stop = paddle.full(shape=[1], fill_value=10, dtype='float32') + num = paddle.full(shape=[1], fill_value=5, dtype='int32') + base = paddle.full(shape=[1], fill_value=2, dtype='float32') + out = paddle.logspace(start, stop, num, base, dtype='float32') + + exe = paddle.static.Executor() + res = exe.run(prog, fetch_list=[out]) + np_res = np.logspace(0, 10, 5, base=2, dtype='float32') + self.assertEqual((res == np_res).all(), True) + paddle.disable_static() + + def test_variable_input2(self): + paddle.disable_static() + start = paddle.full(shape=[1], fill_value=0, dtype='float32') + stop = paddle.full(shape=[1], fill_value=10, dtype='float32') + num = paddle.full(shape=[1], fill_value=5, dtype='int32') + base = paddle.full(shape=[1], fill_value=2, dtype='float32') + out = paddle.logspace(start, stop, num, base, dtype='float32') + np_res 
= np.logspace(0, 10, 5, base=2, dtype='float32') + self.assertEqual((out.numpy() == np_res).all(), True) + paddle.enable_static() + + def test_dtype(self): + paddle.enable_static() + prog = paddle.static.Program() + with paddle.static.program_guard(prog): + out_1 = paddle.logspace(0, 10, 5, 2, dtype='float32') + out_2 = paddle.logspace(0, 10, 5, 2, dtype=np.float32) + + exe = paddle.static.Executor() + res_1, res_2 = exe.run(prog, fetch_list=[out_1, out_2]) + assert np.array_equal(res_1, res_2) + paddle.disable_static() + + def test_name(self): + with paddle.static.program_guard(paddle.static.Program()): + out = paddle.logspace( + 0, 10, 5, 2, dtype='float32', name='logspace_res') + assert 'logspace_res' in out.name + + def test_imperative(self): + paddle.disable_static() + out1 = paddle.logspace(0, 10, 5, 2, dtype='float32') + np_out1 = np.logspace(0, 10, 5, base=2, dtype='float32') + out2 = paddle.logspace(0, 10, 5, 2, dtype='int32') + np_out2 = np.logspace(0, 10, 5, base=2, dtype='int32') + out3 = paddle.logspace(0, 10, 200, 2, dtype='int32') + np_out3 = np.logspace(0, 10, 200, base=2, dtype='int32') + paddle.enable_static() + self.assertEqual((out1.numpy() == np_out1).all(), True) + self.assertEqual((out2.numpy() == np_out2).all(), True) + self.assertEqual((out3.numpy() == np_out3).all(), True) + + +class TestLogspaceOpError(unittest.TestCase): + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + + def test_dtype(): + paddle.logspace(0, 10, 1, 2, dtype="int8") + + self.assertRaises(TypeError, test_dtype) + + def test_dtype1(): + paddle.logspace(0, 10, 1.33, 2, dtype="int32") + + self.assertRaises(TypeError, test_dtype1) + + def test_start_type(): + paddle.logspace([0], 10, 1, 2, dtype="float32") + + self.assertRaises(TypeError, test_start_type) + + def test_end_type(): + paddle.logspace(0, [10], 1, 2, dtype="float32") + + self.assertRaises(TypeError, test_end_type) + + def test_num_type(): + paddle.logspace(0, 10, [0], 2, dtype="float32") + + self.assertRaises(TypeError, test_num_type) + + def test_start_dtype(): + start = paddle.static.data( + shape=[1], dtype="float64", name="start") + paddle.logspace(start, 10, 1, 2, dtype="float32") + + self.assertRaises(ValueError, test_start_dtype) + + def test_end_dtype(): + end = paddle.static.data(shape=[1], dtype="float64", name="end") + paddle.logspace(0, end, 1, 2, dtype="float32") + + self.assertRaises(ValueError, test_end_dtype) + + def test_num_dtype(): + num = paddle.static.data( + shape=[1], dtype="float32", name="step") + paddle.logspace(0, 10, num, 2, dtype="float32") + + self.assertRaises(TypeError, test_num_dtype) + + def test_base_dtype(): + base = paddle.static.data( + shape=[1], dtype="float64", name="end") + paddle.logspace(0, 10, 1, base, dtype="float32") + + self.assertRaises(ValueError, test_base_dtype) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler.py b/python/paddle/fluid/tests/unittests/test_newprofiler.py index ac2b205e61128..ae804f82b90f7 100755 --- a/python/paddle/fluid/tests/unittests/test_newprofiler.py +++ b/python/paddle/fluid/tests/unittests/test_newprofiler.py @@ -20,6 +20,7 @@ import paddle import paddle.profiler as profiler +import paddle.profiler.utils as utils import paddle.nn as nn import paddle.nn.functional as F from paddle.io import Dataset, DataLoader @@ -40,11 +41,17 @@ def my_trace_back(prof): with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU], ) as prof: y = x / 2.0 prof = None + 
self.assertEqual(utils._is_profiler_used, False) + with profiler.RecordEvent(name='test'): + y = x / 2.0 + with profiler.Profiler( targets=[profiler.ProfilerTarget.CPU], scheduler=(1, 2)) as prof: + self.assertEqual(utils._is_profiler_used, True) with profiler.RecordEvent(name='test'): y = x / 2.0 + prof = None with profiler.Profiler( targets=[profiler.ProfilerTarget.CPU], diff --git a/python/paddle/fluid/tests/unittests/test_pad3d_op.py b/python/paddle/fluid/tests/unittests/test_pad3d_op.py index 12f6f7b572108..eabff5f0021c5 100644 --- a/python/paddle/fluid/tests/unittests/test_pad3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pad3d_op.py @@ -27,7 +27,6 @@ class TestPad3dOp(OpTest): def setUp(self): paddle.enable_static() self.value = 0.0 - self.variable_paddings = False self.initTestCase() self.op_type = "pad3d" self.python_api = paddle.nn.functional.pad @@ -84,6 +83,7 @@ def initTestCase(self): self.mode = "constant" self.data_format = "NCDHW" self.pad_value = 0.0 + self.variable_paddings = False class TestCase1(TestPad3dOp): @@ -93,6 +93,7 @@ def initTestCase(self): self.mode = "constant" self.data_format = "NCDHW" self.value = 1.0 + self.variable_paddings = False class TestCase2(TestPad3dOp): @@ -102,6 +103,7 @@ def initTestCase(self): self.mode = "constant" self.data_format = "NDHWC" self.value = 1.0 + self.variable_paddings = False class TestCase3(TestPad3dOp): @@ -110,6 +112,7 @@ def initTestCase(self): self.paddings = [0, 1, 1, 0, 2, 3] self.mode = "reflect" self.data_format = "NCDHW" + self.variable_paddings = False class TestCase4(TestPad3dOp): @@ -118,6 +121,7 @@ def initTestCase(self): self.paddings = [0, 1, 2, 1, 2, 3] self.mode = "reflect" self.data_format = "NDHWC" + self.variable_paddings = False class TestCase5(TestPad3dOp): @@ -126,6 +130,7 @@ def initTestCase(self): self.paddings = [0, 1, 2, 3, 2, 1] self.mode = "replicate" self.data_format = "NCDHW" + self.variable_paddings = False class TestCase6(TestPad3dOp): @@ -134,6 +139,7 @@ def initTestCase(self): self.paddings = [5, 4, 2, 1, 2, 3] self.mode = "replicate" self.data_format = "NDHWC" + self.variable_paddings = False class TestCase7(TestPad3dOp): @@ -142,6 +148,7 @@ def initTestCase(self): self.paddings = [0, 1, 2, 3, 2, 1] self.mode = "circular" self.data_format = "NCDHW" + self.variable_paddings = False class TestCase8(TestPad3dOp): @@ -150,6 +157,27 @@ def initTestCase(self): self.paddings = [0, 1, 2, 1, 2, 3] self.mode = "circular" self.data_format = "NDHWC" + self.variable_paddings = False + + +class TestCase9(TestPad3dOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.paddings = [0, 1, 2, 3, 4, 5] + self.mode = "constant" + self.data_format = "NCDHW" + self.value = 1.0 + self.variable_paddings = True + + +class TestCase10(TestPad3dOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.paddings = [0, 1, 2, 3, 4, 5] + self.mode = "constant" + self.data_format = "NDHWC" + self.value = 1.0 + self.variable_paddings = True class TestPadAPI(unittest.TestCase): @@ -681,6 +709,30 @@ def test_class(self): input_data, pad, "circular", data_format="NCDHW") self.assertTrue(np.allclose(output.numpy(), np_out)) + def test_pad_tensor(self): + paddle.disable_static() + for place in self.places: + input_shape = (3, 4, 5, 6, 7) + pad = [1, 2, 2, 1, 1, 0] + pad_tensor = paddle.to_tensor(pad) + input_data = np.random.rand(*input_shape).astype(np.float32) + + pad_reflection_ncdhw = nn.Pad3D( + padding=pad_tensor, mode="reflect", data_format="NCDHW") + pad_reflection_ndhwc = nn.Pad3D( + 
padding=pad_tensor, mode="reflect", data_format="NDHWC") + data = paddle.to_tensor(input_data) + + output = pad_reflection_ncdhw(data) + np_out = self._get_numpy_out( + input_data, pad, "reflect", data_format="NCDHW") + self.assertTrue(np.allclose(output.numpy(), np_out)) + + output = pad_reflection_ndhwc(data) + np_out = self._get_numpy_out( + input_data, pad, "reflect", data_format="NDHWC") + self.assertTrue(np.allclose(output.numpy(), np_out)) + class TestPad3dOpError(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py index 71c254dabb9e1..7f7db930d4c2d 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py @@ -16,6 +16,7 @@ import unittest import paddle.fluid as fluid +import os from test_parallel_dygraph_dataparallel import TestMultipleGpus @@ -23,31 +24,43 @@ class TestHybridPipeParallel(TestMultipleGpus): def test_hybrid_parallel_pp_layer(self): self.run_mnist_2gpu('hybrid_parallel_pp_layer.py') + self.run_mnist_2gpu('hybrid_parallel_pp_layer.py', eager_mode=False) def test_hybrid_parallel_pp_tuple_inputs(self): self.run_mnist_2gpu('hybrid_parallel_pp_embedding.py') + self.run_mnist_2gpu('hybrid_parallel_pp_embedding.py', eager_mode=False) def test_hybrid_parallel_shared_weight(self): self.run_mnist_2gpu('hybrid_parallel_shared_weight.py') + self.run_mnist_2gpu( + 'hybrid_parallel_shared_weight.py', eager_mode=False) def test_pipeline_parallel_amp(self): self.run_mnist_2gpu('hybrid_parallel_pp_amp.py') + self.run_mnist_2gpu('hybrid_parallel_pp_amp.py', eager_mode=False) def test_pipeline_parallel_fp16(self): self.run_mnist_2gpu('hybrid_parallel_pp_fp16.py') + self.run_mnist_2gpu('hybrid_parallel_pp_fp16.py', eager_mode=False) def test_hybrid_parallel_transformer(self): self.run_mnist_2gpu('hybrid_parallel_pp_transformer.py') + self.run_mnist_2gpu( + 'hybrid_parallel_pp_transformer.py', eager_mode=False) def test_hybrid_parallel_save_load(self): self.run_mnist_2gpu('hybrid_parallel_pp_save_load.py') + self.run_mnist_2gpu('hybrid_parallel_pp_save_load.py', eager_mode=False) def test_hybrid_parallel_recompute(self): self.run_mnist_2gpu('hybrid_parallel_pp_recompute.py') + self.run_mnist_2gpu('hybrid_parallel_pp_recompute.py', eager_mode=False) def test_hybrid_parallel_pp_clip_grad(self): self.run_mnist_2gpu('hybrid_parallel_pp_clip_grad.py') + self.run_mnist_2gpu('hybrid_parallel_pp_clip_grad.py', eager_mode=False) if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py index fda6dc06309c5..14a291627843e 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py @@ -23,7 +23,7 @@ class TestHybridParallel(TestMultipleGpus): def test_hybrid_parallel_mp_random(self): - # self.run_mnist_2gpu('hybrid_parallel_mp_random.py') + self.run_mnist_2gpu('hybrid_parallel_mp_random.py') self.run_mnist_2gpu('hybrid_parallel_mp_random.py', eager_mode=False) def test_hybrid_parallel_mp_model(self): diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py 
b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py index dc944e68c7f55..7079d9678b2fd 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py +++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py @@ -51,8 +51,9 @@ def test_statistic_case1(self): profilerstep_node = HostPythonNode('ProfileStep#1', profiler.TracerEventType.ProfileStep, 0, 400, 1000, 1001) - dataloader_node = HostPythonNode( - 'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001) + dataloader_node = HostPythonNode('Dataloader', + profiler.TracerEventType.Dataloader, 5, + 15, 1000, 1001) mobilenet_node = HostPythonNode( 'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) yolonet_node = HostPythonNode( @@ -155,7 +156,7 @@ def test_statistic_case1(self): profiler.TracerEventType.ProfileStep), 400) self.assertEqual( time_range_summary.get_cpu_range_sum( - profiler.TracerEventType.Forward), 100) + profiler.TracerEventType.Forward), 90) self.assertEqual( time_range_summary.get_cpu_range_sum( profiler.TracerEventType.Backward), 80) @@ -185,12 +186,12 @@ def test_statistic_case1(self): profiler.TracerEventType.Communication), 5) self.assertEqual(len(event_summary.items), 2) self.assertEqual(len(event_summary.userdefined_items), 1) - self.assertEqual(len(event_summary.model_perspective_items), 4) + self.assertEqual(len(event_summary.model_perspective_items), 5) self.assertEqual(len(event_summary.memory_manipulation_items), 1) self.assertEqual(event_summary.items['conv2d'].cpu_time, 15) self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25) self.assertEqual( - event_summary.model_perspective_items['Forward'].cpu_time, 100) + event_summary.model_perspective_items['Forward'].cpu_time, 90) self.assertEqual( event_summary.model_perspective_items['Forward'].general_gpu_time, 135) @@ -217,8 +218,9 @@ def test_statistic_case2(self): profiler.TracerEventType.ProfileStep, 0, 400, 1000, 1001) - dataloader_node = HostPythonNode( - 'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001) + dataloader_node = HostPythonNode('Dataloader', + profiler.TracerEventType.Dataloader, 5, + 15, 1000, 1001) mobilenet_node = HostPythonNode( 'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) @@ -372,7 +374,7 @@ def test_statistic_case2(self): profiler.TracerEventType.ProfileStep), 400) self.assertEqual( time_range_summary.get_cpu_range_sum( - profiler.TracerEventType.Forward), 100) + profiler.TracerEventType.Forward), 90) self.assertEqual( time_range_summary.get_cpu_range_sum( profiler.TracerEventType.Backward), 80) @@ -417,12 +419,12 @@ def test_statistic_case2(self): distributed_summary.overlap_range), 85) self.assertEqual(len(event_summary.items), 4) self.assertEqual(len(event_summary.userdefined_items), 1) - self.assertEqual(len(event_summary.model_perspective_items), 4) + self.assertEqual(len(event_summary.model_perspective_items), 5) self.assertEqual(len(event_summary.memory_manipulation_items), 1) self.assertEqual(event_summary.items['conv2d'].cpu_time, 15) self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 25) self.assertEqual( - event_summary.model_perspective_items['Forward'].cpu_time, 100) + event_summary.model_perspective_items['Forward'].cpu_time, 90) self.assertEqual( event_summary.model_perspective_items['Forward'].general_gpu_time, 315) @@ -441,6 +443,86 @@ def test_statistic_case2(self): thread_sep=False, time_unit='ms')) + def test_statistic_case3(self): + # for coverage, test all time is 0 + root_node = 
HostPythonNode('Root Node', + profiler.TracerEventType.UserDefined, 0, + float('inf'), 1000, 1001) + profilerstep_node = HostPythonNode('ProfileStep#1', + profiler.TracerEventType.ProfileStep, + 0, 400, 1000, 1001) + dataloader_node = HostPythonNode('Dataloader', + profiler.TracerEventType.Dataloader, 5, + 15, 1000, 1001) + mobilenet_node = HostPythonNode( + 'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) + + backward_node = HostPythonNode('Gradient Backward', + profiler.TracerEventType.Backward, 120, + 200, 1000, 1001) + optimization_node = HostPythonNode( + 'Optimization', profiler.TracerEventType.Optimization, 220, 300, + 1000, 1001) + userdefined_node = HostPythonNode('Communication Time', + profiler.TracerEventType.UserDefined, + 60, 70, 1000, 1001) + + conv2d_node = HostPythonNode( + 'conv2d', profiler.TracerEventType.Operator, 25, 25, 1000, 1001) + + conv2d_infer_shape = HostPythonNode( + 'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25, + 25, 1000, 1001) + conv2d_compute = HostPythonNode('conv2d::compute', + profiler.TracerEventType.OperatorInner, + 25, 25, 1000, 1001) + conv2d_launchkernel = HostPythonNode( + 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 25, 25, + 1000, 1001) + + conv2d_kernel = DevicePythonNode( + 'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0) + another_kernel = DevicePythonNode( + 'void phi::funcs::VectorizedBroadcastKernel, phi::funcs::AddFunctor>()', + profiler.TracerEventType.Kernel, 35, 35, 0, 0, 0) + root_node.children_node.append(profilerstep_node) + profilerstep_node.children_node.extend([ + dataloader_node, mobilenet_node, userdefined_node, backward_node, + optimization_node + ]) + mobilenet_node.children_node.append(conv2d_node) + conv2d_node.children_node.extend([conv2d_infer_shape, conv2d_compute]) + conv2d_compute.runtime_node.append(conv2d_launchkernel) + conv2d_launchkernel.device_node.append(conv2d_kernel) + conv2d_launchkernel.device_node.append(another_kernel) + thread_tree = {'thread1001': root_node} + extra_info = { + 'Process Cpu Utilization': '1.02', + 'System Cpu Utilization': '0.68' + } + statistic_data = profiler.profiler_statistic.StatisticData(thread_tree, + extra_info) + time_range_summary = statistic_data.time_range_summary + event_summary = statistic_data.event_summary + + self.assertEqual(event_summary.items['conv2d'].cpu_time, 0) + self.assertEqual(event_summary.items['conv2d'].general_gpu_time, 0) + self.assertEqual(event_summary.userdefined_items['Communication Time'] + .general_gpu_time, 0) + for sort_key in [ + profiler.SortedKeys.CPUTotal, profiler.SortedKeys.CPUMax, + profiler.SortedKeys.CPUMin, profiler.SortedKeys.CPUAvg, + profiler.SortedKeys.GPUTotal, profiler.SortedKeys.GPUMax, + profiler.SortedKeys.GPUMin, profiler.SortedKeys.GPUAvg + ]: + print( + profiler.profiler_statistic._build_table( + statistic_data, + sorted_by=sort_key, + op_detail=True, + thread_sep=False, + time_unit='ms')) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sort_op.py b/python/paddle/fluid/tests/unittests/test_sort_op.py index 366e0c7a3fa3e..d678aa835d544 100644 --- a/python/paddle/fluid/tests/unittests/test_sort_op.py +++ b/python/paddle/fluid/tests/unittests/test_sort_op.py @@ -21,6 +21,7 @@ import numpy as np import six import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard class TestSortOnCPU(unittest.TestCase): @@ -70,14 +71,19 @@ def setUp(self): else: self.place = core.CPUPlace() - def 
test_api_0(self): + def func_api_0(self): paddle.disable_static(self.place) var_x = paddle.to_tensor(self.input_data) out = paddle.sort(var_x) self.assertEqual((np.sort(self.input_data) == out.numpy()).all(), True) paddle.enable_static() - def test_api_1(self): + def test_api_0(self): + with _test_eager_guard(): + self.func_api_0() + self.func_api_0() + + def func_api_1(self): paddle.disable_static(self.place) var_x = paddle.to_tensor(self.input_data) out = paddle.sort(var_x, axis=-1) @@ -85,3 +91,8 @@ def test_api_1(self): (np.sort( self.input_data, axis=-1) == out.numpy()).all(), True) paddle.enable_static() + + def test_api_1(self): + with _test_eager_guard(): + self.func_api_1() + self.func_api_1() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index 89cfc711910ce..c87626a10c631 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -19,6 +19,8 @@ import paddle.fluid.core as core from paddle.fluid.framework import _test_eager_guard +devices = ['cpu', 'gpu'] + class TestSparseCreate(unittest.TestCase): def test_create_coo_by_tensor(self): @@ -30,6 +32,8 @@ def test_create_coo_by_tensor(self): dense_elements = paddle.to_tensor(values, dtype='float32') coo = paddle.sparse.sparse_coo_tensor( dense_indices, dense_elements, dense_shape, stop_gradient=False) + # test the to_string.py + print(coo) assert np.array_equal(indices, coo.indices().numpy()) assert np.array_equal(values, coo.values().numpy()) @@ -37,7 +41,7 @@ def test_create_coo_by_np(self): with _test_eager_guard(): indices = [[0, 1, 2], [1, 2, 0]] values = [1.0, 2.0, 3.0] - dense_shape = [2, 3] + dense_shape = [3, 3] coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) assert np.array_equal(indices, coo.indices().numpy()) assert np.array_equal(values, coo.values().numpy()) @@ -67,6 +71,8 @@ def test_create_csr_by_np(self): dense_shape = [3, 4] csr = paddle.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) + # test the to_string.py + print(csr) assert np.array_equal(crows, csr.crows().numpy()) assert np.array_equal(cols, csr.cols().numpy()) assert np.array_equal(values, csr.values().numpy()) @@ -205,38 +211,154 @@ def test_coo_values_grad(self): def test_sparse_coo_tensor_grad(self): with _test_eager_guard(): - indices = [[0, 1], [0, 1]] - values = [1, 2] - indices = paddle.to_tensor(indices, dtype='int32') - values = paddle.to_tensor( - values, dtype='float32', stop_gradient=False) - sparse_x = paddle.sparse.sparse_coo_tensor( - indices, values, shape=[2, 2], stop_gradient=False) - grad_indices = [[0, 1], [1, 1]] - grad_values = [2, 3] - grad_indices = paddle.to_tensor(grad_indices, dtype='int32') - grad_values = paddle.to_tensor(grad_values, dtype='float32') - sparse_out_grad = paddle.sparse.sparse_coo_tensor( - grad_indices, grad_values, shape=[2, 2]) - sparse_x.backward(sparse_out_grad) - correct_values_grad = [0, 3] - assert np.array_equal(correct_values_grad, values.grad.numpy()) + for device in devices: + if device == 'cpu' or (device == 'gpu' and + paddle.is_compiled_with_cuda()): + paddle.device.set_device(device) + indices = [[0, 1], [0, 1]] + values = [1, 2] + indices = paddle.to_tensor(indices, dtype='int32') + values = paddle.to_tensor( + values, dtype='float32', stop_gradient=False) + sparse_x = paddle.sparse.sparse_coo_tensor( + indices, values, shape=[2, 2], stop_gradient=False) + grad_indices = [[0, 1], [1, 1]] + 
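The test_sort_op rewrite above illustrates the eager-coverage pattern used throughout this patch: the body of each old test_* method moves into a func_* helper, and the test then runs that helper once under _test_eager_guard() (eager dygraph) and once more on the legacy path. A compact sketch of the pattern, with hypothetical MyCase/func_check names:

import unittest

import numpy as np
import paddle
from paddle.fluid.framework import _test_eager_guard


class MyCase(unittest.TestCase):
    def func_check(self):
        # the real assertions live in the func_* helper
        x = paddle.to_tensor(np.array([3.0, 1.0, 2.0], dtype='float32'))
        out = paddle.sort(x)
        self.assertTrue((out.numpy() == np.array([1.0, 2.0, 3.0])).all())

    def test_check(self):
        with _test_eager_guard():  # eager dygraph pass
            self.func_check()
        self.func_check()          # legacy dygraph pass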
grad_values = [2, 3] + grad_indices = paddle.to_tensor(grad_indices, dtype='int32') + grad_values = paddle.to_tensor(grad_values, dtype='float32') + sparse_out_grad = paddle.sparse.sparse_coo_tensor( + grad_indices, grad_values, shape=[2, 2]) + sparse_x.backward(sparse_out_grad) + correct_values_grad = [0, 3] + assert np.array_equal(correct_values_grad, + values.grad.numpy()) - place = core.CPUPlace() - indices_cpu = paddle.to_tensor(indices, dtype='int32', place=place) - values_cpu = paddle.to_tensor( - values, dtype='float32', place=place, stop_gradient=False) - sparse_x_cpu = paddle.sparse.sparse_coo_tensor( - indices_cpu, - values_cpu, - shape=[2, 2], - place=place, - stop_gradient=False) + def test_sparse_coo_tensor_sorted(self): + with _test_eager_guard(): + for device in devices: + if device == 'cpu' or (device == 'gpu' and + paddle.is_compiled_with_cuda()): + paddle.device.set_device(device) + #test unsorted and duplicate indices + indices = [[1, 0, 0], [0, 1, 1]] + values = [1.0, 2.0, 3.0] + indices = paddle.to_tensor(indices, dtype='int32') + values = paddle.to_tensor(values, dtype='float32') + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values) + indices_sorted = [[0, 1], [1, 0]] + values_sorted = [5.0, 1.0] + assert np.array_equal(indices_sorted, + sparse_x.indices().numpy()) + assert np.array_equal(values_sorted, + sparse_x.values().numpy()) + + +class TestCooError(unittest.TestCase): + def test_small_shape(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + indices = [[2, 3], [0, 2]] + values = [1, 2] + # 1. the shape too small + dense_shape = [2, 2] + sparse_x = paddle.sparse.sparse_coo_tensor( + indices, values, shape=dense_shape) + + def test_same_nnz(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + # 2. 
test the nnz of indices must same as nnz of values + indices = [[1, 2], [1, 0]] + values = [1, 2, 3] + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values) + + def test_same_dimensions(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + indices = [[1, 2], [1, 0]] + values = [1, 2, 3] + shape = [2, 3, 4] + sparse_x = paddle.sparse.sparse_coo_tensor( + indices, values, shape=shape) + + def test_indices_dtype(self): + with _test_eager_guard(): + with self.assertRaises(TypeError): + indices = [[1.0, 2.0], [0, 1]] + values = [1, 2] + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values) + + +class TestCsrError(unittest.TestCase): + def test_dimension1(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + crows = [0, 1, 2, 3] + cols = [0, 1, 2] + values = [1, 2, 3] + shape = [3] + sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, + shape) + + def test_dimension2(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + crows = [0, 1, 2, 3] + cols = [0, 1, 2] + values = [1, 2, 3] + shape = [3, 3, 3, 3] + sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, + shape) + + def test_same_shape1(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + crows = [0, 1, 2, 3] + cols = [0, 1, 2, 3] + values = [1, 2, 3] + shape = [3, 4] + sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, + shape) - sparse_out_grad_cpu = paddle.sparse.sparse_coo_tensor( - grad_indices, grad_values, shape=[2, 2], place=place) - sparse_x_cpu.backward(sparse_out_grad_cpu) - assert np.array_equal(correct_values_grad, values_cpu.grad.numpy()) + def test_same_shape2(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + crows = [0, 1, 2, 3] + cols = [0, 1, 2, 3] + values = [1, 2, 3, 4] + shape = [3, 4] + sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, + shape) + + def test_same_shape3(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + crows = [0, 1, 2, 3, 0, 1, 2] + cols = [0, 1, 2, 3, 0, 1, 2] + values = [1, 2, 3, 4, 0, 1, 2] + shape = [2, 3, 4] + sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, + shape) + + def test_crows_first_value(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + crows = [1, 1, 2, 3] + cols = [0, 1, 2] + values = [1, 2, 3] + shape = [3, 4] + sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, + shape) + + def test_dtype(self): + with _test_eager_guard(): + with self.assertRaises(TypeError): + crows = [0, 1, 2, 3.0] + cols = [0, 1, 2] + values = [1, 2, 3] + shape = [3] + sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, + shape) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 724a71ebe3dda..e6e608bea23f4 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -214,6 +214,13 @@ def _test_place(place): self.assertEqual(x.item(), 1 + 1j) self.assertTrue(isinstance(x.item(), complex)) + # empty tensor + x = paddle.to_tensor([]) + self.assertEqual(x.shape, [0]) + expected_result = np.array([], dtype='float32') + self.assertEqual(x.numpy().shape, expected_result.shape) + self.assertTrue(np.array_equal(x.numpy(), expected_result)) + numpy_array = np.random.randn(3, 4) # covert core.LoDTensor to paddle.Tensor lod_tensor = paddle.fluid.core.LoDTensor() @@ -1736,5 +1743,18 @@ def 
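Each TestCsrError case above violates one invariant that paddle.sparse.sparse_csr_tensor is expected to reject. Roughly: the dense shape must be 2-D (or 3-D for a batched matrix), crows must start at 0 and be integer typed, and cols and values must both hold exactly the number of non-zeros implied by crows. For contrast, a small sketch of a well-formed construction (arbitrary sample values, run in eager mode as in the tests above):

import paddle
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    # 3x4 matrix with one non-zero per row: crows has rows + 1 entries,
    # starts at 0, and cols/values both have crows[-1] == 3 entries.
    crows = [0, 1, 2, 3]
    cols = [1, 3, 2]
    values = [1.0, 2.0, 3.0]
    csr = paddle.sparse.sparse_csr_tensor(crows, cols, values, [3, 4])
    assert csr.crows().numpy().tolist() == crows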
test_copy_gradient_from(self): self.func_test_copy_gradient_from() +class TestEagerTensorGradNameValue(unittest.TestCase): + def test_eager_tensor_grad_name_value(self): + with _test_eager_guard(): + a_np = np.array([2, 3]).astype('float32') + a = paddle.to_tensor(a_np) + a.stop_gradient = False + b = a**2 + self.assertEqual(a._grad_value(), None) + b.backward() + self.assertEqual('eager_tmp' in a._grad_name(), True) + self.assertNotEqual(a._grad_value(), None) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_bmm_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_bmm_op_xpu.py new file mode 100644 index 0000000000000..f6893150c9e61 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_bmm_op_xpu.py @@ -0,0 +1,104 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# # Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys +sys.path.append("..") + +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +import paddle.tensor as tensor +import unittest +import numpy as np +from op_test import OpTest +from op_test_xpu import XPUOpTest +from paddle.fluid.framework import Program, program_guard +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +class XPUTestBmmOp(XPUOpTestWrapper): + """ + func desc:: https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/bmm_cn.html#bmm + """ + + def __init__(self): + self.op_name = 'bmm' + self.use_dynamic_create_class = False + + class TestBmmOp(XPUOpTest): + def setUp(self): + self.init_dtype() + self.set_xpu() + self.op_type = "bmm" + self.place = paddle.XPUPlace(0) + self.set_shape() + X = np.random.random(self.Xshape).astype(self.dtype) + Y = np.random.random(self.Yshape).astype(self.dtype) + self.inputs = {'X': X, 'Y': Y} + + Out = np.matmul(X, Y) + self.outputs = {'Out': Out} + + def init_dtype(self): + self.dtype = self.in_type + + def set_shape(self): + self.Xshape = (10, 3, 4) + self.Yshape = (10, 4, 5) + + def set_xpu(self): + self.__class__.use_xpu = True + self.__class__.no_need_check_grad = False + self.__class__.op_type = self.in_type + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + + class TestBmmOp1(TestBmmOp): + def set_shape(self): + self.Xshape = (3, 3, 3) + self.Yshape = (3, 3, 3) + + class TestBmmOp2(TestBmmOp): + def set_shape(self): + self.Xshape = (128, 3, 16) + self.Yshape = (128, 16, 3) + + class TestBmmOp3(TestBmmOp): + def set_shape(self): + self.Xshape = (2048, 16, 27) + self.Yshape = (2048, 27, 16) + + class TestBmmOp4(TestBmmOp): + def set_shape(self): + self.Xshape = (2, 27, 27) + self.Yshape = (2, 27, 27) + + class TestBmmOp5(TestBmmOp): + def set_shape(self): + self.Xshape = (2, 1, 1) + self.Yshape = (2, 1, 1) + + +support_types = 
get_xpu_op_support_types('bmm') +for stype in support_types: + create_test_class(globals(), XPUTestBmmOp, stype) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_all_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_all_op_xpu.py new file mode 100644 index 0000000000000..b4dc8e7b7cfd1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_all_op_xpu.py @@ -0,0 +1,111 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") + +import paddle +from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +class XPUTestReduceAllOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'reduce_all' + + class XPUTestReduceAllBase(XPUOpTest): + def setUp(self): + self.place = paddle.XPUPlace(0) + self.set_case() + + def set_case(self): + self.op_type = 'reduce_all' + self.attrs = { + 'use_xpu': True, + 'reduce_all': True, + 'keep_dim': True, + 'dim': (3, 5, 4) + } + self.inputs = { + 'X': np.random.randint(0, 2, + (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool") + } + self.outputs = {'Out': self.inputs['X'].all(axis=self.attrs['dim'])} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + class XPUTestReduceAllCase1(XPUTestReduceAllBase): + def set_case(self): + self.op_type = 'reduce_all' + self.attrs = { + 'use_xpu': True, + 'reduce_all': True, + 'keep_dim': True, + 'dim': [1] + } + self.inputs = { + 'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool") + } + self.outputs = {'Out': self.inputs['X'].all()} + + class XPUTestReduceAllCase2(XPUTestReduceAllBase): + def set_case(self): + self.op_type = 'reduce_all' + self.attrs = { + 'use_xpu': True, + 'reduce_all': True, + 'keep_dim': False, + 'dim': (3, 6) + } + self.inputs = { + 'X': np.random.randint(0, 2, + (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool") + } + self.outputs = {'Out': self.inputs['X'].all(axis=self.attrs['dim'])} + + class XPUTestReduceAllCase3(XPUTestReduceAllBase): + def set_case(self): + self.op_type = 'reduce_all' + self.attrs = { + 'use_xpu': True, + 'keep_dim': True, + 'dim': [1] + # 'reduce_all': True, + } + self.inputs = { + 'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool") + } + self.outputs = { + 'Out': np.expand_dims( + self.inputs['X'].all(axis=1), axis=1) + } + + +support_types = get_xpu_op_support_types('reduce_all') +for stype in support_types: + create_test_class(globals(), XPUTestReduceAllOp, stype) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py index 6ea55f5ba9368..1dd7b42e5eb05 100644 --- 
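The two new XPU test files above (bmm, reduce_all) and the reduce_max/reduce_min/reduce_sum tests below all share the same scaffolding: an XPUOpTestWrapper subclass names the op and nests the XPUOpTest case classes, then create_test_class generates one concrete unittest class per dtype reported by get_xpu_op_support_types. A condensed sketch of that scaffolding, with hypothetical XPUTestMyOp/TestMyOp names (it only runs in an XPU build where these test utilities are importable):

import sys
sys.path.append("..")

import numpy as np
import paddle
from op_test_xpu import XPUOpTest
from xpu.get_test_cover_info import (create_test_class,
                                     get_xpu_op_support_types,
                                     XPUOpTestWrapper)

paddle.enable_static()


class XPUTestMyOp(XPUOpTestWrapper):
    def __init__(self):
        self.op_name = 'bmm'  # op under test

    class TestMyOp(XPUOpTest):
        def setUp(self):
            self.dtype = self.in_type  # injected per supported dtype
            self.place = paddle.XPUPlace(0)
            self.op_type = 'bmm'
            x = np.random.random((2, 3, 4)).astype(self.dtype)
            y = np.random.random((2, 4, 5)).astype(self.dtype)
            self.inputs = {'X': x, 'Y': y}
            self.outputs = {'Out': np.matmul(x, y)}

        def test_check_output(self):
            self.check_output_with_place(self.place)


# one concrete test class is generated per supported dtype
for stype in get_xpu_op_support_types('bmm'):
    create_test_class(globals(), XPUTestMyOp, stype)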
a/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py @@ -18,56 +18,64 @@ import numpy as np import sys sys.path.append("..") -from op_test_xpu import OpTest, XPUOpTest -from op_test import skip_check_grad_ci + import paddle -import paddle.fluid.core as core -import paddle.fluid as fluid -from paddle.fluid import compiler, Program, program_guard -from paddle.fluid.framework import convert_np_dtype_to_dtype_ -""" -class TestXPUReduceMaxOp(XPUOpTest): - def setUp(self): - self.init_op_type() - self.initTestCase() - self.use_xpu = True - self.use_mkldnn = False - self.attrs = { - 'dim': self.axis, - 'keep_dim': self.keep_dim, - 'reduce_all': self.reduce_all - } - self.inputs = {'X': np.random.random(self.shape).astype('float32')} - if self.attrs['reduce_all']: - self.outputs = {'Out': self.inputs['X'].max()} - else: - self.outputs = { - 'Out': self.inputs['X'].max(axis=self.axis, - keepdims=self.attrs['keep_dim']) +from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +class XPUTestReduceMaxOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'reduce_max' + + class XPUTestReduceMaxBase(XPUOpTest): + def setUp(self): + self.place = paddle.XPUPlace(0) + self.init_case() + self.set_case() + + def set_case(self): + self.op_type = 'reduce_max' + self.attrs = { + 'use_xpu': True, + 'reduce_all': self.reduce_all, + 'keep_dim': self.keep_dim } + self.inputs = {'X': np.random.random(self.shape).astype("float32")} + if self.attrs['reduce_all']: + self.outputs = {'Out': self.inputs['X'].max()} + else: + self.outputs = { + 'Out': self.inputs['X'].max(axis=self.axis, + keepdims=self.attrs['keep_dim']) + } + + def init_case(self): + self.shape = (5, 6, 10) + self.axis = (0, ) + self.reduce_all = False + self.keep_dim = False + + def test_check_output(self): + self.check_output_with_place(self.place) - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_output_with_place(place) + def test_check_grad(self): + pass - def test_check_grad(self): - if paddle.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out') + class XPUTestReduceMaxCase1(XPUTestReduceMaxBase): + def init_case(self): + self.shape = (5, 6, 10) + self.axis = (0, ) + self.reduce_all = False + self.keep_dim = True - def init_op_type(self): - self.op_type = 'reduce_max' - self.use_mkldnn = False - self.keep_dim = False - self.reduce_all = False - def initTestCase(self): - self.shape = (5, 6, 10) - self.axis = (-1, ) -""" +support_types = get_xpu_op_support_types('reduce_max') +for stype in support_types: + create_test_class(globals(), XPUTestReduceMaxOp, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py index 5e866dddbe28e..18a588b1b88da 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py @@ -194,13 +194,5 @@ def setUp(self): } -class TestReduceAll(Test1DReduce): - def setUp(self): - self.op_type = "reduce_mean" - self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")} - self.attrs = {'reduce_all': True, 
'use_xpu': True} - self.outputs = {'Out': self.inputs['X'].mean()} - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_min_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_min_op_xpu.py new file mode 100644 index 0000000000000..cf77ea09a581c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_min_op_xpu.py @@ -0,0 +1,81 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") + +import paddle +from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +class XPUTestReduceMinOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'reduce_min' + + class XPUTestReduceMinBase(XPUOpTest): + def setUp(self): + self.place = paddle.XPUPlace(0) + self.init_case() + self.set_case() + + def set_case(self): + self.op_type = 'reduce_min' + self.attrs = { + 'use_xpu': True, + 'reduce_all': self.reduce_all, + 'keep_dim': self.keep_dim + } + self.inputs = {'X': np.random.random(self.shape).astype("float32")} + if self.attrs['reduce_all']: + self.outputs = {'Out': self.inputs['X'].min()} + else: + self.outputs = { + 'Out': self.inputs['X'].min(axis=self.axis, + keepdims=self.attrs['keep_dim']) + } + + def init_case(self): + self.shape = (5, 6, 10) + self.axis = (0, ) + self.reduce_all = False + self.keep_dim = False + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + class XPUTestReduceMinCase1(XPUTestReduceMinBase): + def init_case(self): + self.shape = (5, 6, 10) + self.axis = (0, ) + self.reduce_all = False + self.keep_dim = True + + +support_types = get_xpu_op_support_types('reduce_min') +for stype in support_types: + create_test_class(globals(), XPUTestReduceMinOp, stype) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py index 638da601a3def..9f42a509624b9 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py @@ -18,138 +18,64 @@ import numpy as np import sys sys.path.append("..") -from op_test_xpu import OpTest, XPUOpTest -from op_test import skip_check_grad_ci -import paddle -import paddle.fluid.core as core -import paddle.fluid as fluid -from paddle.fluid import compiler, Program, program_guard -from paddle.fluid.framework import convert_np_dtype_to_dtype_ - - -class TestXPUReduceSumOp(XPUOpTest): - def setUp(self): - self.init_op_type() - self.initTestCase() - self.use_xpu = True - self.use_mkldnn = False - self.attrs = { - 'dim': self.axis, - 'keep_dim': self.keep_dim, - 'reduce_all': self.reduce_all 
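In both the removed and the new reduce tests, the expected output is computed directly with numpy: with reduce_all set, the whole tensor collapses to a scalar; otherwise the reduction runs over the given axis with keepdims mirroring keep_dim. The reference computation, as a one-off sketch with arbitrary shapes:

import numpy as np

x = np.random.random((5, 6, 10)).astype('float32')
axis, keep_dim, reduce_all = (0, ), True, False

expected = x.sum() if reduce_all else x.sum(axis=axis, keepdims=keep_dim)
print(expected.shape)  # (1, 6, 10): axis 0 is kept as size 1 when keep_dim=True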
- } - self.inputs = {'X': np.random.random(self.shape).astype("float32")} - if self.attrs['reduce_all']: - self.outputs = {'Out': self.inputs['X'].sum()} - else: - self.outputs = { - 'Out': self.inputs['X'].sum(axis=self.axis, - keepdims=self.attrs['keep_dim']) - } - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - if paddle.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out') - - def init_op_type(self): - self.op_type = "reduce_sum" - self.use_mkldnn = False - self.keep_dim = False - self.reduce_all = False - - def initTestCase(self): - self.shape = (5, 6, 10) - self.axis = (0, ) - - -class TestSumOp5D(TestXPUReduceSumOp): - def initTestCase(self): - self.shape = (1, 2, 5, 6, 10) - self.axis = (0, ) - - -class TestSumOp6D(TestXPUReduceSumOp): - def initTestCase(self): - self.shape = (1, 1, 2, 5, 6, 10) - self.axis = (0, ) - - -class TestSumOp8D(TestXPUReduceSumOp): - def initTestCase(self): - self.shape = (1, 3, 1, 2, 1, 4, 3, 10) - self.axis = (0, 3) - - -class Test1DReduce(TestXPUReduceSumOp): - def initTestCase(self): - self.shape = 120 - self.axis = (0, ) +import paddle +from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper -class Test2DReduce0(TestXPUReduceSumOp): - def initTestCase(self): - self.shape = (20, 10) - self.axis = (0, ) - - -class Test2DReduce1(TestXPUReduceSumOp): - def initTestCase(self): - self.shape = (20, 10) - self.axis = (1, ) - - -class Test3DReduce0(TestXPUReduceSumOp): - def initTestCase(self): - self.shape = (5, 6, 7) - self.axis = (1, ) - - -class Test3DReduce1(TestXPUReduceSumOp): - def initTestCase(self): - self.shape = (5, 6, 7) - self.axis = (2, ) - - -class Test3DReduce2(TestXPUReduceSumOp): - def initTestCase(self): - self.shape = (5, 6, 7) - self.axis = (-2, ) - - -class Test3DReduce3(TestXPUReduceSumOp): - def initTestCase(self): - self.shape = (5, 6, 7) - self.axis = (1, 2) - - -class TestKeepDimReduce(TestXPUReduceSumOp): - def initTestCase(self): - self.shape = (5, 6, 10) - self.axis = (1, ) - self.keep_dim = True - +paddle.enable_static() -class TestKeepDim8DReduce(TestXPUReduceSumOp): - def initTestCase(self): - self.shape = (2, 5, 3, 2, 2, 3, 4, 2) - self.axis = (3, 4, 5) - self.keep_dim = True +class XPUTestReduceSumOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'reduce_sum' -class TestReduceAll(TestXPUReduceSumOp): - def initTestCase(self): - self.shape = (5, 6, 2, 10) - self.axis = (0, ) - self.reduce_all = True + class XPUTestReduceSumBase(XPUOpTest): + def setUp(self): + self.place = paddle.XPUPlace(0) + self.init_case() + self.set_case() + def set_case(self): + self.op_type = 'reduce_sum' + self.attrs = { + 'use_xpu': True, + 'reduce_all': self.reduce_all, + 'keep_dim': self.keep_dim + } + self.inputs = {'X': np.random.random(self.shape).astype("float32")} + if self.attrs['reduce_all']: + self.outputs = {'Out': self.inputs['X'].sum()} + else: + self.outputs = { + 'Out': self.inputs['X'].sum(axis=self.axis, + keepdims=self.attrs['keep_dim']) + } + + def init_case(self): + self.shape = (5, 6, 10) + self.axis = (0, ) + self.reduce_all = False + self.keep_dim = False + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + class 
XPUTestReduceSumCase1(XPUTestReduceSumBase): + def init_case(self): + self.shape = (5, 6, 10) + self.axis = (0, ) + self.reduce_all = False + self.keep_dim = True + + +support_types = get_xpu_op_support_types('reduce_sum') +for stype in support_types: + create_test_class(globals(), XPUTestReduceSumOp, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py index fb6b28d9c2825..ee689efbb38a0 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py @@ -42,6 +42,7 @@ def setUp(self): self.real_np_op = getattr(np, self.real_op_type) self.set_xpu() self.op_type = "tril_triu" + self.place = paddle.XPUPlace(0) if self.dtype == np.int32: self.X = np.arange( 1, self.get_Xshape_prod() + 1, @@ -69,13 +70,22 @@ def get_Xshape_prod(self): def set_xpu(self): self.__class__.use_xpu = True - self.__class__.no_need_check_grad = True + self.__class__.no_need_check_grad = False self.__class__.op_type = self.real_op_type def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + if self.dtype == np.int32: + user_defined_grad_outputs = np.random.random( + self.Xshape).astype('float32') + self.check_grad_with_place( + self.place, ['X'], + 'Out', + user_defined_grad_outputs=user_defined_grad_outputs) + else: + self.check_grad_with_place(self.place, ['X'], 'Out') def initTestCase(self): self.diagonal = None diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 287dc7d67def8..907fd4e6252c6 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -1534,12 +1534,8 @@ def linear(x, weight, bias=None, name=None): # [2.1077576 2.1077576 2.1077576 2.1077576 ]] """ if in_dygraph_mode(): - pre_bias = _C_ops.final_state_matmul(x, weight, False, False) - - if bias is None: - return pre_bias - - return _C_ops.final_state_add(pre_bias, bias) + #TODO(jiabin): using addmm for fast forward route + return _C_ops.final_state_linear(x, weight, bias) else: if _in_legacy_dygraph(): pre_bias = _C_ops.matmul_v2(x, weight, 'trans_x', False, 'trans_y', diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 62f034c7b4149..ca3ac1772829d 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -37,7 +37,7 @@ from paddle import _C_ops from paddle import in_dynamic_mode from paddle.framework import core -from ...fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _non_static_mode +from ...fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _non_static_mode, _current_expected_place __all__ = [] @@ -116,13 +116,13 @@ def binary_cross_entropy(input, label, weight=None, reduction='mean', if in_dygraph_mode(): out = _C_ops.final_state_bce_loss(input, label) if weight is not None: - out = _C_ops.elementwise_mul(out, weight, 'axis', -1) + out = _C_ops.final_state_multiply(out, weight, 'axis', -1) if reduction == 'sum': return _C_ops.reduce_sum(out, 'dim', [0], 'keep_dim', False, "reduce_all", True) elif reduction == 'mean': - return _C_ops.mean(out) + return _C_ops.final_state_mean_all(out) else: return out else: @@ -260,14 +260,17 @@ def binary_cross_entropy_with_logits(logit, % reduction) if 
_non_static_mode(): - one = _varbase_creator(dtype=logit.dtype) - _C_ops.fill_constant(one, 'value', - float(1.0), 'force_cpu', False, 'dtype', one.dtype, - 'str_value', '1.0', 'shape', [1]) if in_dygraph_mode(): + one = _C_ops.final_state_full([1], + float(1.0), core.VarDesc.VarType.FP32, + _current_expected_place()) out = _C_ops.final_state_sigmoid_cross_entropy_with_logits( logit, label, False, -100) else: + one = _varbase_creator(dtype=logit.dtype) + _C_ops.fill_constant(one, 'value', + float(1.0), 'force_cpu', False, 'dtype', + one.dtype, 'str_value', '1.0', 'shape', [1]) out = _C_ops.sigmoid_cross_entropy_with_logits(logit, label) if pos_weight is not None: log_weight = _C_ops.elementwise_add( @@ -405,7 +408,7 @@ def hsigmoid_loss(input, # [2.2407534]] """ - if in_dynamic_mode(): + if _non_static_mode(): out, _, _ = _C_ops.hierarchical_sigmoid( input, weight, label, path_table, path_code, bias, 'num_classes', num_classes, 'is_sparse', is_sparse, 'remote_prefetch', is_sparse) @@ -582,7 +585,19 @@ def margin_ranking_loss(input, raise ValueError( "The value of 'reduction' in MarginRankingLoss should be 'sum', 'mean' or 'none', but " "received %s, which is not allowed." % reduction) - if in_dynamic_mode(): + if in_dygraph_mode(): + out = _C_ops.final_state_subtract(other, input) + out = _C_ops.final_state_multiply(out, label) + if margin != 0.0: + margin = fluid.dygraph.base.to_variable([margin], dtype=out.dtype) + out = _C_ops.elementwise_add(out, margin) + out = _C_ops.relu(out) + if reduction == 'sum': + return _C_ops.reduce_sum(out, 'reduce_all', True) + elif reduction == 'mean': + return _C_ops.final_state_mean_all(out) + return out + elif _in_legacy_dygraph(): out = _C_ops.elementwise_sub(other, input) out = _C_ops.elementwise_mul(out, label) if margin != 0.0: @@ -698,7 +713,17 @@ def l1_loss(input, label, reduction='mean', name=None): "The value of 'reduction' in L1Loss should be 'sum', 'mean' or 'none', but " "received %s, which is not allowed." 
% reduction) - if in_dynamic_mode(): + if in_dygraph_mode(): + unreduced = _elementwise_op_in_dygraph( + input, label, axis=-1, act='abs', op_name='elementwise_sub') + if reduction == 'mean': + return _C_ops.final_state_mean_all(unreduced) + elif reduction == 'sum': + return _C_ops.reduce_sum(unreduced, 'dim', [0], 'keep_dim', False, + 'reduce_all', True) + else: + return unreduced + elif in_dynamic_mode(): unreduced = _elementwise_op_in_dygraph( input, label, axis=-1, act='abs', op_name='elementwise_sub') if reduction == 'mean': @@ -1819,7 +1844,10 @@ def cross_entropy(input, 'reduce_all', True) return out_sum / (total_weight + (total_weight == 0.0)) else: - return _C_ops.mean(out) + if in_dygraph_mode(): + return _C_ops.final_state_mean_all(out) + else: + return _C_ops.mean(out) else: if input_dims - 1 == label_dims: @@ -2064,6 +2092,8 @@ def sigmoid_focal_loss(logit, if reduction == "sum": return _C_ops.reduce_sum(loss, 'reduce_all', True) elif reduction == "mean": + if in_dygraph_mode(): + return _C_ops.final_state_mean_all(loss) return _C_ops.mean(loss) return loss @@ -2179,7 +2209,7 @@ def hinge_embedding_loss(input, label, margin=1.0, reduction='mean', name=None): "'reduction' in 'hinge_embedding_loss' should be 'sum', 'mean' or 'none', " "but received {}.".format(reduction)) - if not in_dynamic_mode(): + if not _non_static_mode(): check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'hinge_embedding_loss') check_variable_and_dtype(label, 'label', ['float32', 'float64'], diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 7e40c029a02ec..d4e059b6dfa49 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -18,7 +18,7 @@ import paddle.fluid as fluid import paddle from .. import functional as F -from paddle.fluid.framework import _varbase_creator +from paddle.fluid.framework import _varbase_creator, in_dygraph_mode, _in_legacy_dygraph from .. 
import Layer from paddle import in_dynamic_mode @@ -597,7 +597,11 @@ def forward(self, input, label): fluid.data_feeder.check_variable_and_dtype( label, 'label', ['float32', 'float64'], 'MSELoss') - square_out = paddle.square(paddle.subtract(input, label)) + if in_dygraph_mode(): + square_out = paddle._C_ops.final_state_square( + paddle.subtract(input, label)) + else: + square_out = paddle.square(paddle.subtract(input, label)) if self.reduction == 'none': return square_out diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index d0d5eef03c42c..ea4349bc0b2c5 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -1587,7 +1587,7 @@ def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False): verbose) def get_lr(self): - if self.last_epoch > 0: - return self.last_lr * self.lr_lambda(self.last_epoch) - else: - return self.base_lr + cur_lr = self.base_lr + for epoch in range(1, self.last_epoch + 1): + cur_lr = cur_lr * self.lr_lambda(epoch) + return cur_lr diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 0af8b8bb894b9..0dfe294c00d5c 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -987,7 +987,9 @@ def _create_regularization_of_grad(self, param, grad, regularization=None): assert regularization_term is not None - if framework._non_static_mode(): + if framework.in_dygraph_mode(): + return _C_ops.final_state_add_n([grad, regularization_term]) + elif framework._in_legacy_dygraph(): return _C_ops.sum([grad, regularization_term]) new_grad = grad diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index 3e60a82f1214a..77adbaff34859 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -27,6 +27,7 @@ from .utils import RecordEvent, wrap_optimizers from .profiler_statistic import StatisticData, _build_table, SortedKeys +from paddle.profiler import utils from .timer import benchmark @@ -482,6 +483,7 @@ def start(self): if self.timer_only: return # CLOSED -> self.current_state + utils._is_profiler_used = True if self.current_state == ProfilerState.READY: self.profiler.prepare() elif self.current_state == ProfilerState.RECORD: @@ -534,6 +536,7 @@ def stop(self): self.profiler_result = self.profiler.stop() if self.on_trace_ready: self.on_trace_ready(self) + utils._is_profiler_used = False def step(self, num_samples: Optional[int]=None): r""" diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index 7465a8e80ff8e..422dbe4ce359f 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -78,15 +78,19 @@ def __init__(self, hostnode): self.self_gpu_time = 0 self.general_gpu_time = 0 # besides kernel, include time of gpu events like memcpy and memset self.self_general_gpu_time = 0 + self.is_terminal_operator_node = True def cal_statistic(self): for child in self.children_node: child.cal_statistic() + if child.is_terminal_operator_node == False: + self.is_terminal_operator_node = False for rt in self.runtime_node: rt.cal_statistic() - self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns for child in self.children_node: + if child.type == TracerEventType.Operator: + self.is_terminal_operator_node = False self.gpu_time += child.gpu_time self.general_gpu_time += child.general_gpu_time self.self_cpu_time -= (child.end_ns - child.start_ns) @@ -419,10 +423,10 @@ def add_item(self, 
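The lr.py change above swaps the stateful formula last_lr * lr_lambda(last_epoch) for a recomputation from base_lr, i.e. lr_t = base_lr * lr_lambda(1) * ... * lr_lambda(t), so get_lr returns the same value however many times it is queried for a given epoch. A small numeric sketch of that recurrence (plain Python, constant factor 0.95 chosen arbitrarily):

import math

base_lr = 0.5
lr_lambda = lambda epoch: 0.95  # multiplicative factor per epoch, for illustration


def lr_at(last_epoch):
    cur_lr = base_lr
    for epoch in range(1, last_epoch + 1):
        cur_lr = cur_lr * lr_lambda(epoch)
    return cur_lr


assert lr_at(3) == lr_at(3)  # idempotent: repeated queries agree
assert math.isclose(lr_at(3), base_lr * 0.95 ** 3)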
node): for runtimenode in node.runtime_node: for devicenode in runtimenode.device_node: - if devicenode.name not in self.devices: - self.devices[devicenode.name] = EventSummary.DeviceItem( - devicenode.name) - self.devices[devicenode.name].add_item(devicenode) + name = devicenode.name + if name not in self.devices: + self.devices[name] = EventSummary.DeviceItem(name) + self.devices[name].add_item(devicenode) class GeneralItem: def __init__(self, name): @@ -489,6 +493,7 @@ def __init__(self): dict) # for userdefined summary self.model_perspective_items = {} # for model summary self.memory_manipulation_items = {} # for memory manipulation summary + self.kernel_items = {} # for kernel summary def parse(self, nodetrees): r""" @@ -508,6 +513,7 @@ def parse(self, nodetrees): self.add_memory_manipulation_item(host_statistic_node) else: self.add_userdefined_item(host_statistic_node) + self.add_kernel_item(host_statistic_nodes[0]) for threadid, root_statistic_node in node_statistic_trees.items(): deque = collections.deque() @@ -525,11 +531,7 @@ def parse(self, nodetrees): deque.append(child) def add_operator_item(self, operator_node): - have_inner = False - for child in operator_node.children_node: - if child.type == TracerEventType.OperatorInner: - have_inner = True - if have_inner == False: + if operator_node.is_terminal_operator_node == False: return if operator_node.name not in self.items: self.items[operator_node.name] = EventSummary.OperatorItem( @@ -585,6 +587,15 @@ def add_model_perspective_item(self, model_perspective_node): self.model_perspective_items[name] = EventSummary.GeneralItem(name) self.model_perspective_items[name].add_item(model_perspective_node) + def add_kernel_item(self, root_node): + device_nodes = get_device_nodes(root_node) + for device_node in device_nodes: + if device_node.type == TracerEventType.Kernel: + name = device_node.name + if name not in self.kernel_items: + self.kernel_items[name] = EventSummary.DeviceItem(name) + self.kernel_items[name].add_item(device_node) + class StatisticData: r""" @@ -752,6 +763,9 @@ def format_ratio(ratio, indent=0): cpu_call_times[ event_type] = statistic_data.event_summary.model_perspective_items[ event_type_name].call + cpu_type_time[ + event_type] = statistic_data.event_summary.model_perspective_items[ + event_type_name].cpu_time gpu_time_range = collections.defaultdict(list) for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items( @@ -800,7 +814,6 @@ def format_ratio(ratio, indent=0): append( "Note:\nIn this table, We sum up all collected events in terms of event type.\n" "The time of events collected on host are presented as CPU Time, and as GPU Time if on device.\n" - "The time with ratio 100% is the base time for calculating ratio. \n" "Events with different types may overlap or inclusion, e.g. 
Operator includes OperatorInner, so the sum of ratios is not 100%.\n" "The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.\n" "Example:\n" @@ -820,13 +833,18 @@ def format_ratio(ratio, indent=0): all_row_values = [] accmulation_time = 0 gpu_accmulation_time = 0 - gpu_total_time = 0 + gpu_total_time = statistic_data.event_summary.model_perspective_items[ + 'ProfileStep'].general_gpu_time for name in [ 'ProfileStep', 'Dataloader', 'Forward', 'Backward', 'Optimization' ]: if name in model_perspective_items: item = model_perspective_items[name] + if gpu_total_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float(item.general_gpu_time) / gpu_total_time name = '{}'.format( name) if 'ProfileStep' in name else ' {}'.format(name) row_values = [ @@ -850,17 +868,19 @@ def format_ratio(ratio, indent=0): item.max_gpu_time, unit=time_unit), format_time( item.min_gpu_time, unit=time_unit), - format_ratio(float(item.gpu_time) / total_time)) + format_ratio(gpu_ratio)) ] all_row_values.append(row_values) if 'ProfileStep' not in name: accmulation_time += item.cpu_time - gpu_accmulation_time += item.gpu_time - else: - gpu_total_time = item.gpu_time + gpu_accmulation_time += item.general_gpu_time other_time = total_time - accmulation_time other_gpu_time = gpu_total_time - gpu_accmulation_time + if gpu_total_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float(other_gpu_time) / gpu_total_time row_values = [ ' Others', '-', '{} / - / - / - / {}'.format( format_time( @@ -869,7 +889,7 @@ def format_ratio(ratio, indent=0): '{} / - / - / - / {}'.format( format_time( other_gpu_time, unit=time_unit), - format_ratio(float(other_gpu_time) / total_time)) + format_ratio(gpu_ratio)) ] all_row_values.append(row_values) # Calculate the column width @@ -913,7 +933,6 @@ def format_ratio(ratio, indent=0): append( "Note:\nIn this table, GPU time is the sum of all device(GPU) events called in the phase.\n" "Unlike overview summary, if two device(GPU) events execute on different streams with overlap time, we sum them directly here.\n" - "The time with ratio 100% is the base time for calculating ratio. \n" ) append('-' * line_length) append('') @@ -981,7 +1000,6 @@ def format_ratio(ratio, indent=0): "Note:\nCommunication time: Communication Event time, Communication Op time and its kernel time on gpu.\n" "Computation time: Kernel time, except kernels belong to communication(nccl kernels).\n" "Overlap time: Communication time intersects with computation time.\n" - "The time with ratio 100% is the base time for calculating ratio. 
\n" "Example:\n" "Communication:\n" " CPU: |_________________|\n" @@ -1040,8 +1058,22 @@ def format_ratio(ratio, indent=0): elif sorted_by == SortedKeys.GPUMin: sorted_items = sorted( items.items(), key=lambda x: x[1].min_general_gpu_time) + total_op_cpu_time = 0 + total_op_gpu_time = 0 + + for name, item in sorted_items: + total_op_cpu_time += item.cpu_time + total_op_gpu_time += item.general_gpu_time for name, item in sorted_items: + if total_op_cpu_time == 0: + cpu_ratio = 0 + else: + cpu_ratio = float(item.cpu_time) / total_op_cpu_time + if total_op_gpu_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float(item.general_gpu_time) / total_op_gpu_time row_values = [ name, item.call, '{} / {} / {} / {} / {}'.format( format_time( @@ -1052,7 +1084,7 @@ def format_ratio(ratio, indent=0): item.max_cpu_time, unit=time_unit), format_time( item.min_cpu_time, unit=time_unit), - format_ratio(float(item.cpu_time) / total_time)), + format_ratio(cpu_ratio)), '{} / {} / {} / {} / {}'.format( format_time( item.general_gpu_time, unit=time_unit), @@ -1062,13 +1094,22 @@ def format_ratio(ratio, indent=0): item.max_general_gpu_time, unit=time_unit), format_time( item.min_general_gpu_time, unit=time_unit), - format_ratio( - float(item.general_gpu_time) / total_time)) + format_ratio(gpu_ratio)) ] all_row_values.append(row_values) if op_detail: for innerop_name, innerop_node in item.operator_inners.items( ): + if item.cpu_time == 0: + cpu_ratio = 0 + else: + cpu_ratio = float( + innerop_node.cpu_time) / item.cpu_time + if item.general_gpu_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float(innerop_node.general_gpu_time + ) / item.general_gpu_time if len(innerop_name) + 2 > name_column_width: innerop_name = innerop_name[:name_column_width - 5] innerop_name += "..." @@ -1083,8 +1124,7 @@ def format_ratio(ratio, indent=0): innerop_node.max_cpu_time, unit=time_unit), format_time( innerop_node.min_cpu_time, unit=time_unit), - format_ratio( - float(innerop_node.cpu_time) / total_time)), + format_ratio(cpu_ratio)), '{} / {} / {} / {} / {}'.format( format_time( innerop_node.general_gpu_time, @@ -1098,13 +1138,17 @@ def format_ratio(ratio, indent=0): format_time( innerop_node.min_general_gpu_time, unit=time_unit), - format_ratio( - float(innerop_node.general_gpu_time) / - total_time)) + format_ratio(gpu_ratio)) ] all_row_values.append(row_values) for device_node_name, device_node in innerop_node.devices.items( ): + if innerop_node.general_gpu_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float( + device_node. 
+ gpu_time) / innerop_node.general_gpu_time if len(device_node_name) + 4 > name_column_width: device_node_name = device_node_name[: name_column_width @@ -1125,12 +1169,15 @@ def format_ratio(ratio, indent=0): format_time( device_node.min_gpu_time, unit=time_unit), - format_ratio( - float(device_node.gpu_time) / - total_time)) + format_ratio(gpu_ratio)) ] all_row_values.append(row_values) for device_node_name, device_node in item.devices.items(): + if item.general_gpu_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float( + device_node.gpu_time) / item.general_gpu_time if len(device_node_name) + 2 > name_column_width: device_node_name = device_node_name[: name_column_width @@ -1148,8 +1195,7 @@ def format_ratio(ratio, indent=0): device_node.max_gpu_time, unit=time_unit), format_time( device_node.min_gpu_time, unit=time_unit), - format_ratio( - float(device_node.gpu_time) / total_time)) + format_ratio(gpu_ratio)) ] all_row_values.append(row_values) # Calculate the column width @@ -1197,11 +1243,106 @@ def format_ratio(ratio, indent=0): append('') append('') + ###### Print Kernel Summary Report ###### + if statistic_data.event_summary.kernel_items: + all_row_values = [] + kernel_items = statistic_data.event_summary.kernel_items + if sorted_by == SortedKeys.GPUAvg: + sorted_items = sorted( + kernel_items.items(), + key=lambda x: x[1].avg_gpu_time, + reverse=True) + elif sorted_by == SortedKeys.GPUMax: + sorted_items = sorted( + kernel_items.items(), + key=lambda x: x[1].max_gpu_time, + reverse=True) + elif sorted_by == SortedKeys.GPUMin: + sorted_items = sorted( + kernel_items.items(), key=lambda x: x[1].min_gpu_time) + else: + sorted_items = sorted( + kernel_items.items(), key=lambda x: x[1].gpu_time, reverse=True) + + total_kernel_gpu_time = 0 + for name, item in sorted_items: + total_kernel_gpu_time += item.gpu_time + for name, item in sorted_items: + if total_kernel_gpu_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float(item.gpu_time) / total_kernel_gpu_time + row_values = [ + name, + item.call, + '{} / {} / {} / {} / {}'.format( + format_time( + item.gpu_time, unit=time_unit), + format_time( + item.avg_gpu_time, unit=time_unit), + format_time( + item.max_gpu_time, unit=time_unit), + format_time( + item.min_gpu_time, unit=time_unit), + format_ratio(gpu_ratio)), + ] + all_row_values.append(row_values) + + headers = ['Name', 'Calls', 'GPU Total / Avg / Max / Min / Ratio(%)'] + # Calculate the column width + name_column_width = 90 + calltime_width = 6 + gpu_data_description_width = 40 + for row_values in all_row_values: + if isinstance(row_values[1], + int) and len(str(row_values[1])) > calltime_width: + calltime_width = len(str(row_values[1])) + if len(row_values[2]) > gpu_data_description_width: + gpu_data_description_width = len(row_values[2]) + + row_format_list = [""] + header_sep_list = [""] + line_length_list = [-SPACING_SIZE] + add_column(name_column_width) + add_column(calltime_width) + add_column(gpu_data_description_width) + + row_format = row_format_list[0] + header_sep = header_sep_list[0] + line_length = line_length_list[0] + + # construct table string + append(add_title(line_length, "Kernel Summary")) + append('Time unit: {}'.format(time_unit)) + append(header_sep) + append(row_format.format(*headers)) + append(header_sep) + for row_values in all_row_values: + indx = row_values[0].find('(') + if indx != -1: + name = row_values[0][:indx] + else: + name = row_values[0] + if len(name) > name_column_width: + row_values[0] = name[:name_column_width - 3] + '...' 
+ else: + row_values[0] = name + append(row_format.format(*row_values)) + append(header_sep) + append('') + append('') + ###### Print Memory Manipulation Summary Report ###### if statistic_data.event_summary.memory_manipulation_items: all_row_values = [] memory_manipulation_items = statistic_data.event_summary.memory_manipulation_items + gpu_total_time = statistic_data.event_summary.model_perspective_items[ + 'ProfileStep'].general_gpu_time for name, item in memory_manipulation_items.items(): + if gpu_total_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float(item.general_gpu_time) / gpu_total_time row_values = [ name, item.call, @@ -1224,7 +1365,7 @@ def format_ratio(ratio, indent=0): item.max_general_gpu_time, unit=time_unit), format_time( item.min_general_gpu_time, unit=time_unit), - format_ratio(float(item.general_gpu_time) / total_time)), + format_ratio(gpu_ratio)), ] all_row_values.append(row_values) @@ -1274,6 +1415,8 @@ def format_ratio(ratio, indent=0): ###### Print UserDefined Summary Report ###### if statistic_data.event_summary.userdefined_items: all_row_values = [] + gpu_total_time = statistic_data.event_summary.model_perspective_items[ + 'ProfileStep'].general_gpu_time if thread_sep == True: userdefined_thread_items = statistic_data.event_summary.userdefined_thread_items else: @@ -1319,6 +1462,10 @@ def format_ratio(ratio, indent=0): items.items(), key=lambda x: x[1].min_general_gpu_time) for name, item in sorted_items: + if gpu_total_time == 0: + gpu_ratio = 0 + else: + gpu_ratio = float(item.general_gpu_time) / gpu_total_time row_values = [ name, item.call, @@ -1341,8 +1488,7 @@ def format_ratio(ratio, indent=0): item.max_general_gpu_time, unit=time_unit), format_time( item.min_general_gpu_time, unit=time_unit), - format_ratio( - float(item.general_gpu_time) / total_time)), + format_ratio(gpu_ratio)), ] all_row_values.append(row_values) diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index 291326478e91b..6ae3fe4e60b92 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -20,6 +20,8 @@ from paddle.fluid import core from paddle.fluid.core import (_RecordEvent, TracerEventType) +_is_profiler_used = False + _AllowedEventTypeList = [ TracerEventType.Dataloader, TracerEventType.ProfileStep, TracerEventType.UserDefined, TracerEventType.Forward, @@ -91,6 +93,8 @@ def begin(self): result = data1 - data2 record_event.end() """ + if not _is_profiler_used: + return if self.event_type not in _AllowedEventTypeList: warn("Only TracerEvent Type in [{}, {}, {}, {}, {}, {},{}]\ can be recorded.".format(*_AllowedEventTypeList)) diff --git a/python/paddle/sparse/creation.py b/python/paddle/sparse/creation.py index ac9276f3142c0..d494336e1ff50 100644 --- a/python/paddle/sparse/creation.py +++ b/python/paddle/sparse/creation.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
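# A minimal sketch (not Paddle's actual implementation) of the guard pattern that
# the python/paddle/profiler/utils.py hunk above introduces: a module-level
# _is_profiler_used flag lets RecordEvent.begin() return immediately when no
# profiler is running. Only the flag name and the early-return idea come from
# the diff; the class below and its print calls are hypothetical stand-ins.

_is_profiler_used = False  # a Profiler would set this to True while it is active


class RecordEventSketch:
    """Hypothetical stand-in for a begin()/end() instrumentation pair."""

    def __init__(self, name):
        self.name = name
        self._started = False

    def begin(self):
        # Same guard as in the diff: do nothing unless a profiler is attached.
        if not _is_profiler_used:
            return
        self._started = True
        print("push event:", self.name)

    def end(self):
        if not self._started:
            return
        print("pop event:", self.name)


# With the flag left at False, begin()/end() are cheap no-ops:
ev = RecordEventSketch("data_preprocess")
ev.begin()
ev.end()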
+import paddle from paddle import _C_ops from ..framework import core, dygraph_only from ..framework import _current_expected_place, _get_paddle_place @@ -51,6 +52,13 @@ def _get_place(place): return place +def _check_indices_dtype(dtype): + if dtype not in [paddle.int8, paddle.int16, paddle.int32, paddle.int64]: + raise TypeError( + "the dtype of indices must be 'int8' or 'int16' or 'int32' or 'int64'" + ) + + @dygraph_only def sparse_coo_tensor(indices, values, @@ -117,6 +125,18 @@ def sparse_coo_tensor(indices, if len(indices.shape) != 2: raise ValueError("'indices' must be 2-D.") + nnz = indices.shape[1] + sparse_dim = indices.shape[0] + + _check_indices_dtype(indices.dtype) + + if nnz != values.shape[0]: + raise ValueError( + "the indices and values must have same number of non-zero, but get {} and {}". + format(nnz, values.shape[0])) + + dense_dim = len(values.shape) - 1 + if not indices.place._equals(place): indices = indices._copy_to(place, False) @@ -125,8 +145,17 @@ def sparse_coo_tensor(indices, values = _handle_dtype(values, dtype) values.stop_gradient = stop_gradient + min_shape = _infer_dense_shape(indices) if shape is None: - shape = _infer_dense_shape(indices) + shape = min_shape + else: + if shape < min_shape: + raise ValueError("the minimun shape required is {}, but get {}". + format(min_shape, shape)) + if len(shape) != sparse_dim + dense_dim: + raise ValueError( + "the number of dimensions(len(shape) must be sparse_dim({}) + dense_dim({}), but get {}". + format(sparse_dim, dense_dim, len(shape))) return _C_ops.final_state_sparse_create_sparse_coo_tensor(values, indices, shape) @@ -144,6 +173,7 @@ def sparse_csr_tensor(crows, r""" Constructs a sparse ``paddle.Tensor`` in CSR(Compressed Sparse Row) format according to the ``crows``, ``cols`` and ``values``. + Currently, the crows and cols of each batch must be incrementd. Args: crows(list|tuple|ndarray|Tensor): 1-D array, each element in the rows represents the @@ -202,10 +232,14 @@ def sparse_csr_tensor(crows, cols = to_tensor(cols, dtype=None, place=place, stop_gradient=True) if not isinstance(values, core.eager.Tensor): values = to_tensor(values, dtype, place, stop_gradient) - if len(crows.shape) != 1 or len(cols.shape) != 1 or len(values.shape) != 1: + + _check_indices_dtype(crows.dtype) + _check_indices_dtype(cols.dtype) + + if len(shape) != 2 and len(shape) != 3: raise ValueError( - "SparseCsrTensor only support 2-D or 3-D matrix. The 'crows', 'cols' and 'values' must be 1-D." - ) + "SparseCsrTensor only support 2-D or 3-D matrix. but get shape {}". + format(shape)) if not crows.place._equals(place): crows = crows._copy_to(place, False) @@ -217,5 +251,30 @@ def sparse_csr_tensor(crows, values = values._copy_to(place, False) values = _handle_dtype(values, dtype) values.stop_gradient = stop_gradient + + if len(crows.shape) != 1 or len(cols.shape) != 1 or len(values.shape) != 1: + raise ValueError("The 'crows', 'cols' and 'values' must be 1-D.") + + if (len(cols) != len(values)): + raise ValueError("the length of cols must be same as length of values") + + if len(shape) == 2: + if crows.shape[0] != shape[0] + 1: + raise ValueError( + "The length({}) of crows must be equal to the rows({})+1 of matrix.". 
+ format(crows.shape[0], shape[0])) + if crows[0] != 0: + raise ValueError("the 0th value of crows must be 0") + + if crows[-1] != values.shape[0]: + raise ValueError( + "the last value of crows must be equal the number of non-zero") + else: + if crows.shape[0] % (shape[0] + 1) != 0: + raise ValueError( + "The length({}) of crows must be divisible the rows({})+1 of matrix.". + format(crows.shape[0], shape[0])) + # TODO(zkh2016): check whether the value in crows and cols is legal + return core.eager.sparse_csr_tensor(crows, cols, values, shape, stop_gradient) diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py index 7c0c71951aa1d..f58c06c9b51b6 100644 --- a/python/paddle/static/input.py +++ b/python/paddle/static/input.py @@ -147,8 +147,8 @@ class InputSpec(object): input = InputSpec([None, 784], 'float32', 'x') label = InputSpec([None, 1], 'int64', 'label') - print(input) # InputSpec(shape=(-1, 784), dtype=VarType.FP32, name=x) - print(label) # InputSpec(shape=(-1, 1), dtype=VarType.INT64, name=label) + print(input) # InputSpec(shape=(-1, 784), dtype=paddle.float32, name=x) + print(label) # InputSpec(shape=(-1, 1), dtype=paddle.int64, name=label) """ def __init__(self, shape, dtype='float32', name=None): @@ -190,7 +190,7 @@ def from_tensor(cls, tensor, name=None): x = paddle.to_tensor(np.ones([2, 2], np.float32)) x_spec = InputSpec.from_tensor(x, name='x') - print(x_spec) # InputSpec(shape=(2, 2), dtype=VarType.FP32, name=x) + print(x_spec) # InputSpec(shape=(2, 2), dtype=paddle.float32, name=x) """ if isinstance(tensor, (Variable, core.VarBase, core.eager.Tensor)): @@ -219,7 +219,7 @@ def from_numpy(cls, ndarray, name=None): x = np.ones([2, 2], np.float32) x_spec = InputSpec.from_numpy(x, name='x') - print(x_spec) # InputSpec(shape=(2, 2), dtype=VarType.FP32, name=x) + print(x_spec) # InputSpec(shape=(2, 2), dtype=paddle.float32, name=x) """ return cls(ndarray.shape, ndarray.dtype, name) @@ -241,7 +241,7 @@ def batch(self, batch_size): x_spec = InputSpec(shape=[64], dtype='float32', name='x') x_spec.batch(4) - print(x_spec) # InputSpec(shape=(4, 64), dtype=VarType.FP32, name=x) + print(x_spec) # InputSpec(shape=(4, 64), dtype=paddle.float32, name=x) """ if isinstance(batch_size, (list, tuple)): @@ -273,7 +273,7 @@ def unbatch(self): x_spec = InputSpec(shape=[4, 64], dtype='float32', name='x') x_spec.unbatch() - print(x_spec) # InputSpec(shape=(64,), dtype=VarType.FP32, name=x) + print(x_spec) # InputSpec(shape=(64,), dtype=paddle.float32, name=x) """ if len(self.shape) == 0: diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index f4f1e7a3d5067..aeec256bc1580 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -146,6 +146,130 @@ def linspace(start, stop, num, dtype=None, name=None): return out +def logspace(start, stop, num, base=10.0, dtype=None, name=None): + r""" + Return fixed number of logarithmical-evenly spaced values within the interval \ + :math:`[base^{start}, base^{stop}]`. + + Notes: + This API does not compute the gradient. + + Args: + start(int|float|Tensor): The input :attr:`start` is exponent of first entry in \ + the sequence. It is a scalar, or a Tensor of shape [1] with input data \ + type int32, int64, float32 or float64. + stop(int|float|Tensor): The input :attr:`stop` is exponent of last entry in the \ + sequence. It is a scalar, or a Tensor of shape [1] with input data \ + type int32, int64, float32 or float64. 
+ num(int|Tensor): The input :attr:`num` is given number of items in the sequence. \ + It is an int scalar, or a Tensor of shape [1] with data type int32. + base(int|float|Tensor): The input :attr:`base` is base of the logarithm function. \ + It is a scalar, or a Tensor of shape [1] with input data type int32, int64, \ + float32 or float64. + dtype(np.dtype|str, optional): The data type of output tensor, it could be \ + int32, int64, float32 or float64. Default: if None, the data type is float32. \ + name(str, optional): Normally there is no need for user to set this property. \ + For more information, please refer to :ref:`api_guide_Name`. Default: None. + + Returns: + Tensor: The output data type will be float32, float64. The 1-D tensor with \ + fixed number of logarithmical-evenly spaced values, the data shape of this \ + tensor is :math:`[num]`. If the :attr:`num` is set 1, the output tensor \ + just has the value with exponential of :attr:`start` with base :attr:`base`. + + Examples: + .. code-block:: python + :name: logspace-example + + import paddle + data = paddle.logspace(0, 10, 5, 2, 'float32') + # [1. , 5.65685415 , 32. , 181.01933289, 1024. ] + data = paddle.logspace(0, 10, 1, 2, 'float32') + # [1.] + """ + if dtype is None: + dtype = 'float32' + tensor_num = num + tensor_start = start + tensor_stop = stop + tensor_base = base + if not isinstance(num, Variable): + check_type(num, 'num', (int), 'logspace') + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + if not isinstance(start, Variable): + with device_guard("cpu"): + tensor_start = fill_constant([1], dtype, start) + if not isinstance(stop, Variable): + with device_guard("cpu"): + tensor_stop = fill_constant([1], dtype, stop) + if not isinstance(num, Variable): + with device_guard("cpu"): + tensor_num = fill_constant([1], 'int32', num) + if not isinstance(base, Variable): + with device_guard("cpu"): + tensor_base = fill_constant([1], dtype, base) + if _non_static_mode(): + return _C_ops.logspace(tensor_start, tensor_stop, tensor_num, + tensor_base, 'dtype', dtype) + + helper = LayerHelper("logspace", **locals()) + + start_dtype = convert_dtype(tensor_start.dtype) + stop_dtype = convert_dtype(tensor_stop.dtype) + base_dtype = convert_dtype(tensor_base.dtype) + out_dtype = convert_dtype(dtype) + if isinstance(start, Variable): + check_dtype(start.dtype, 'start', + ['float32', 'float64', 'int32', 'int64'], 'logspace') + else: + check_type(start, 'start', (int, float), 'logspace') + + if isinstance(stop, Variable): + check_dtype(stop.dtype, 'stop', + ['float32', 'float64', 'int32', 'int64'], 'logspace') + else: + check_type(stop, 'stop', (int, float), 'logspace') + + if isinstance(num, Variable): + check_dtype(num.dtype, 'num', ['int32'], 'logspace') + + if isinstance(base, Variable): + check_dtype(base.dtype, 'base', + ['float32', 'float64', 'int32', 'int64'], 'logspace') + else: + check_type(base, 'base', (int, float), 'logspace') + + check_dtype(dtype, 'dtype', ['int32', 'int64', 'float32', 'float64'], + 'logspace') + if ((stop_dtype == "float64" or start_dtype == "float64" + or base_dtype == "float64") + and out_dtype in ["float32", "int32"]) or \ + ((stop_dtype == "int64" or start_dtype == "int64" + or base_dtype == "int64") + and out_dtype == "int32"): + raise ValueError( + "The dtype of start/stop/base is {}/{}/{} but the attr(dtype) of logspace is {}, " + "which may cause data type overflows. Please reset attr(dtype) of logspace." 
+ .format(start_dtype, stop_dtype, base_dtype, dtype)) + + out = helper.create_variable_for_type_inference(dtype=dtype) + + helper.append_op( + type='logspace', + inputs={ + 'Start': tensor_start, + 'Stop': tensor_stop, + 'Num': tensor_num, + 'Base': tensor_base + }, + attrs={'dtype': dtype}, + outputs={'Out': [out]}) + if isinstance(num, int): + out.desc.set_shape((num, )) + return out + + @dygraph_only def to_tensor(data, dtype=None, place=None, stop_gradient=True): r""" @@ -216,9 +340,9 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): place = _current_expected_place() elif not isinstance(place, (core.Place, core.CPUPlace, core.CUDAPinnedPlace, core.CUDAPlace, core.NPUPlace, core.XPUPlace, - core.CustomPlace)): + core.MLUPlace, core.CustomPlace)): raise ValueError( - "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace, paddle.XPUPlace, paddle.CustomPlace" + "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace, paddle.XPUPlace, paddle.MLUPlace, paddle.CustomPlace" ) if not isinstance(data, np.ndarray): diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 9c2074bbe3cda..2c1732ad62848 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -758,10 +758,13 @@ def mat_norm(input, porder=1., axis=None): axis = axis if axis != None and axis != [] else [0] keepdim = False - if paddle.in_dynamic_mode(): + if _non_static_mode(): abs_out = _C_ops.abs(input) - sum_out = _C_ops.reduce_sum(abs_out, 'dim', axis, 'keepdim', - keepdim, 'reduce_all', reduce_all) + if in_dygraph_mode(): + sum_out = _C_ops.final_state_sum(abs_out, axis, None, keepdim) + else: + sum_out = _C_ops.reduce_sum(abs_out, 'dim', axis, 'keepdim', + keepdim, 'reduce_all', reduce_all) if porder == 1 or porder == np.inf: return _C_ops.reduce_max(sum_out, 'dim', [-1], 'keepdim', keepdim, 'reduce_all', reduce_all) @@ -815,7 +818,12 @@ def fro_norm(input, porder=2, axis=[-1]): reduce_all = True if axis is None or axis == [] else False keepdim = False - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + pow_out = _C_ops.pow(input, 'factor', porder) + sum_out_1 = _C_ops.final_state_sum(pow_out, axis, None, keepdim) + sum_out_2 = _C_ops.final_state_sum(sum_out_1, axis, None, keepdim) + return _C_ops.pow(sum_out_2, 'factor', float(1. / porder)) + elif paddle.in_dynamic_mode(): pow_out = _C_ops.pow(input, 'factor', porder) sum_out_1 = _C_ops.reduce_sum(pow_out, 'dim', axis, 'keepdim', keepdim, 'reduce_all', reduce_all) @@ -869,10 +877,13 @@ def svd_norm(input, porder, axis=[-1]): u, s, vh = svd(input, full_matrices=False) - if paddle.in_dynamic_mode(): + if _non_static_mode(): if porder == "nuc": - return _C_ops.reduce_sum(s, 'dim', axis, 'keepdim', keepdim, - 'reduce_all', reduce_all) + if in_dygraph_mode(): + return _C_ops.final_state_sum(s, axis, None, keepdim) + else: + return _C_ops.reduce_sum(s, 'dim', axis, 'keepdim', keepdim, + 'reduce_all', reduce_all) max_out = _C_ops.reduce_max(s, 'dim', axis, 'keepdim', keepdim, 'reduce_all', reduce_all) min_out = _C_ops.reduce_min(s, 'dim', axis, 'keepdim', keepdim, @@ -1155,40 +1166,37 @@ def t(input, name=None): the paddle.transpose function which perm dimensions set 0 and 1. Args: - input (Tensor): The input Tensor. It is a N-D (N<=2) Tensor of data types float16, float32, float64, int32. + input (Tensor): The input Tensor. It is a N-D (N<=2) Tensor of data types float32, float64, int32, int64. 
name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: Tensor: A transposed n-D Tensor, with data type being float16, float32, float64, int32, int64. - For Example: - - .. code-block:: text + Examples: + .. code-block:: python + :name: code-example + import paddle + # Example 1 (0-D tensor) - x = tensor([0.79]) - paddle.t(x) = tensor([0.79]) - + x = paddle.to_tensor([0.79]) + paddle.t(x) # [0.79] + # Example 2 (1-D tensor) - x = tensor([0.79, 0.84, 0.32]) - paddle.t(x) = tensor([0.79, 0.84, 0.32]) + x = paddle.to_tensor([0.79, 0.84, 0.32]) + paddle.t(x) # [0.79000002, 0.83999997, 0.31999999] + paddle.t(x).shape # [3] # Example 3 (2-D tensor) - x = tensor([0.79, 0.84, 0.32], - [0.64, 0.14, 0.57]) - paddle.t(x) = tensor([0.79, 0.64], - [0.84, 0.14], - [0.32, 0.57]) + x = paddle.to_tensor([[0.79, 0.84, 0.32], + [0.64, 0.14, 0.57]]) + x.shape # [2, 3] + paddle.t(x) + # [[0.79000002, 0.63999999], + # [0.83999997, 0.14000000], + # [0.31999999, 0.56999999]] + paddle.t(x).shape # [3, 2] - Examples: - - .. code-block:: python - - import paddle - x = paddle.ones(shape=[2, 3], dtype='int32') - x_transposed = paddle.t(x) - print(x_transposed.shape) - # [3, 2] """ if len(input.shape) > 2: raise ValueError( @@ -1433,7 +1441,6 @@ def matrix_rank(x, tol=None, hermitian=False, name=None): if tol is None: attrs['use_default_tol'] = True elif isinstance(tol, Variable): - check_variable_and_dtype(tol, 'tol', ['float32'], 'matrix_rank') attrs['use_default_tol'] = False if tol.dtype != x.dtype: inputs['TolTensor'] = cast(tol, x.dtype) @@ -2530,7 +2537,7 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): # or out * x * out = x ; """ - if paddle.in_dynamic_mode(): + if _non_static_mode(): if not hermitian: # combine svd and matmul op u, s, vt = _C_ops.svd(x, 'full_matrices', False) @@ -2554,8 +2561,11 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): v, _ = _C_ops.transpose2(vt, 'axis', perm) out_1 = v * st - out_2 = _C_ops.matmul_v2(out_1, u, 'trans_x', False, 'trans_y', - True) + if in_dygraph_mode(): + out_2 = _C_ops.final_state_matmul(out_1, u, False, True) + else: + out_2 = _C_ops.matmul_v2(out_1, u, 'trans_x', False, 'trans_y', + True) return out_2 else: # combine eigh and matmul op @@ -2578,8 +2588,11 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): out_1 = u * st u_conj = _C_ops.conj(u) - out_2 = _C_ops.matmul_v2(out_1, u_conj, 'trans_x', False, 'trans_y', - True) + if in_dygraph_mode(): + out_2 = _C_ops.final_state_matmul(out_1, u_conj, False, True) + else: + out_2 = _C_ops.matmul_v2(out_1, u_conj, 'trans_x', False, + 'trans_y', True) return out_2 else: if not hermitian: @@ -3080,7 +3093,7 @@ def lstsq(x, y, rcond=None, driver=None, name=None): elif x.dtype == paddle.float64: rcond = 1e-15 * max(x.shape[-2], x.shape[-1]) - if paddle.in_dynamic_mode(): + if _non_static_mode(): solution, rank, singular_values = _C_ops.lstsq(x, y, "rcond", rcond, "driver", driver) if x.shape[-2] > x.shape[-1]: @@ -3089,8 +3102,11 @@ def lstsq(x, y, rcond=None, driver=None, name=None): False) minus_out = _C_ops.elementwise_sub(matmul_out, y) pow_out = _C_ops.pow(minus_out, 'factor', 2) - residuals = _C_ops.reduce_sum(pow_out, 'dim', [-2], 'keepdim', - False, 'reduce_all', False) + if in_dygraph_mode(): + residuals = _C_ops.final_state_sum(pow_out, [-2], None, False) + else: + residuals = _C_ops.reduce_sum(pow_out, 'dim', [-2], 'keepdim', + False, 'reduce_all', False) else: residuals = 
paddle.empty(shape=[0], dtype=x.dtype) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 61b2256c3e37e..d96377cccf97a 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -94,7 +94,7 @@ def log(x, name=None): .. math:: - Out = \\ln(x) + Out = \ln(x) Args: x (Tensor): Input Tensor. Must be one of the following types: float32, float64. @@ -213,7 +213,7 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): .. math:: - out = b * \\frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}} + out = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}} Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -424,6 +424,10 @@ def pow(x, y, name=None): 'elementwise_pow': 'final_state_elementwise_pow', 'elementwise_floordiv': 'final_state_floor_divide', 'elementwise_mod': 'final_state_modulo', + 'elementwise_add': 'final_state_add', + 'elementwise_sub': 'final_state_subtract', + 'elementwise_mul': 'final_state_multiply', + 'elementwise_div': 'final_state_divide', } @dygraph_only @@ -436,7 +440,7 @@ def _elementwise_op_in_dygraph(x, def is_inplace(op_name): return op_name[-1] == "_" - if op_name not in OP_NAMEMAPPING.keys(): + if op_name not in OP_NAMEMAPPING.keys() or axis != -1: op = getattr(_C_ops, op_name) out = op(x, y, 'axis', axis, 'use_mkldnn', use_mkldnn) else: @@ -1528,7 +1532,9 @@ def mm(input, mat2, name=None): """ - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_matmul(input, mat2, False, False) + elif paddle.in_dynamic_mode(): return _C_ops.matmul_v2(input, mat2) def __check_input(x, y): @@ -1751,7 +1757,9 @@ def inner(x, y, name=None): nx = x.reshape((-1, xshape[-1])) ny = y.reshape((-1, yshape[-1])) - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_matmul(nx, ny.T, False, False).reshape(dstshape) + elif paddle.in_dynamic_mode(): return _C_ops.matmul_v2(nx, ny.T).reshape(dstshape) def __check_input(x, y): @@ -1814,7 +1822,9 @@ def outer(x, y, name=None): nx = x.reshape((-1, 1)) ny = y.reshape((1, -1)) - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_matmul(nx, ny, False, False) + elif paddle.in_dynamic_mode(): return _C_ops.matmul_v2(nx, ny) def __check_input(x, y): @@ -1838,7 +1848,7 @@ def logsumexp(x, axis=None, keepdim=False, name=None): This OP calculates the log of the sum of exponentials of ``x`` along ``axis`` . .. math:: - logsumexp(x) = \\log\\sum exp(x) + logsumexp(x) = \log\sum exp(x) Args: x (Tensor): The input Tensor with data type float32 or float64, which @@ -2407,7 +2417,7 @@ def log1p(x, name=None): Calculates the natural log of the given input tensor, element-wise. .. math:: - Out = \\ln(x+1) + Out = \ln(x+1) Args: x (Tensor): Input Tensor. Must be one of the following types: float32, float64. @@ -2445,7 +2455,7 @@ def log2(x, name=None): .. math:: - Out = \\log_2x + Out = \log_2x Args: x (Tensor): Input tensor must be one of the following types: float32, float64. @@ -2497,7 +2507,7 @@ def log10(x, name=None): .. math:: - Out = \\log_10_x + Out = \log_10_x Args: x (Tensor): Input tensor must be one of the following types: float32, float64. @@ -2957,7 +2967,7 @@ def cumsum(x, axis=None, dtype=None, name=None): y = paddle.cumsum(data, dtype='float64') print(y.dtype) - # VarType.FP64 + # paddle.float64 """ if axis is None: flatten = True @@ -3279,7 +3289,7 @@ def tanh(x, name=None): Tanh Activation Operator. .. 
math:: - out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}} + out = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}} Args: x (Tensor): Input of Tanh operator, an N-D Tensor, with data type float32, float64 or float16. @@ -3965,7 +3975,11 @@ def rad2deg(x, name=None): # [57.29578018]) """ rad2deg_scale = 180 / np.pi - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + if convert_dtype(x.dtype) in ['int32', 'int64']: + x = cast(x, dtype="float32") + return _C_ops.final_state_scale(x, rad2deg_scale, 0.0, True) + elif paddle.in_dynamic_mode(): if convert_dtype(x.dtype) in ['int32', 'int64']: x = cast(x, dtype="float32") return _C_ops.scale(x, 'scale', rad2deg_scale) @@ -4018,7 +4032,11 @@ def deg2rad(x, name=None): # [3.14159274]) """ deg2rad_scale = np.pi / 180.0 - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + if convert_dtype(x.dtype) in ['int32', 'int64']: + x = cast(x, dtype="float32") + return _C_ops.final_state_scale(x, deg2rad_scale, 0.0, True) + elif paddle.in_dynamic_mode(): if convert_dtype(x.dtype) in ['int32', 'int64']: x = cast(x, dtype="float32") return _C_ops.scale(x, 'scale', deg2rad_scale) @@ -4263,14 +4281,22 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): attrs_1 += ('starts', starts_1) ends_1 = [dim_len - 1] attrs_1 += ('ends', ends_1) - input_front = _C_ops.slice(new_input, None, None, None, None, 'axes', axes, \ - 'infer_flags', infer_flags, *attrs_1) + if in_dygraph_mode(): + input_front = _C_ops.final_state_slice(new_input, axes, starts_1, ends_1, infer_flags, + []) + else: + input_front = _C_ops.slice(new_input, None, None, None, None, 'axes', axes, \ + 'infer_flags', infer_flags, *attrs_1) starts_2 = [1] attrs_2 += ('starts', starts_2) ends_2 = [dim_len] attrs_2 += ('ends', ends_2) - input_back = _C_ops.slice(new_input, None, None, None, None, 'axes', axes, \ - 'infer_flags', infer_flags, *attrs_2) + if in_dygraph_mode(): + input_back = input_front = _C_ops.final_state_slice(new_input, axes, starts_2, ends_2, infer_flags, + []) + else: + input_back = _C_ops.slice(new_input, None, None, None, None, 'axes', axes, \ + 'infer_flags', infer_flags, *attrs_2) if x.dtype == paddle.bool: op = getattr(_C_ops, "logical_xor") diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 6855b8f0f7061..d86a6a3f627b3 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -92,7 +92,7 @@ def argsort(x, axis=-1, descending=False, name=None): # [0 2 1 1]]] """ if in_dygraph_mode(): - _, ids, = _C_ops.final_state_argsort(x, axis, descending) + _, ids = _C_ops.final_state_argsort(x, axis, descending) return ids if _in_legacy_dygraph(): @@ -398,7 +398,9 @@ def nonzero(x, as_tuple=False): shape = x.shape rank = len(shape) - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + outs = _C_ops.final_state_where_index(x) + elif paddle.in_dynamic_mode(): outs = _C_ops.where_index(x) else: helper = LayerHelper("where_index", **locals()) @@ -480,9 +482,13 @@ def sort(x, axis=-1, descending=False, name=None): # [4. 7. 4. 6.] # [5. 7. 7. 
9.]]] """ - if paddle.in_dynamic_mode(): - out, _ = _C_ops.argsort(x, 'axis', axis, 'descending', descending) - return out + if in_dygraph_mode(): + outs, _ = _C_ops.final_state_argsort(x, axis, descending) + return outs + + if _in_legacy_dygraph(): + outs, _ = _C_ops.argsort(x, 'axis', axis, 'descending', descending) + return outs helper = LayerHelper("sort", **locals()) out = helper.create_variable_for_type_inference( dtype=x.dtype, stop_gradient=False) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 41b5fc26fa941..58b80950e5529 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -58,7 +58,7 @@ func : AdamaxInferMeta kernel : func : adamax - + - api : adamw args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, float lr_ratio, float coeff, bool with_decay, bool lazy_mode, int64_t min_row_size_to_use_multithread, bool multi_precision, bool use_global_beta_pow) output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_outs) @@ -460,7 +460,7 @@ - api : deformable_conv args : (Tensor x, Tensor offset, Tensor filter, Tensor mask, int[] strides, int[] paddings, int[] dilations, int deformable_groups, int groups, int im2col_step) output : Tensor(out) - infer_meta : + infer_meta : func : DeformableConvInferMeta kernel : func : deformable_conv @@ -793,6 +793,8 @@ param : [x, value, dtype] data_type : dtype > x backend : place > x + data_transform : + skip_transform : x - api : gather args : (Tensor x, Tensor index, Scalar axis=0) @@ -1813,6 +1815,8 @@ func : ShapeInferMeta kernel : func : shape, shape_sr + data_transform: + skip_transform : input # shard_index - api : shard_index @@ -1889,6 +1893,8 @@ func : SizeInferMeta kernel : func : size + data_transform: + skip_transform : x - api : slice args : (Tensor input, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index a6bd0a10cb1fa..378ead7ff20aa 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -105,7 +105,7 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): 'double': 'double', 'bool': 'bool', 'str': 'const std::string&', - 'Place': 'Place', + 'Place': 'const Place&', 'DataLayout': 'DataLayout', 'DataType': 'DataType', 'int64_t[]': 'const std::vector&', @@ -120,7 +120,7 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): 'float': 'paddle::optional', 'double': 'paddle::optional', 'bool': 'paddle::optional', - 'Place': 'paddle::optional', + 'Place': 'paddle::optional', 'DataLayout': 'paddle::optional', 'DataType': 'paddle::optional' } @@ -328,7 +328,7 @@ def gene_kernel_backend_select(self): assert len( vars_list ) == 2, f"{self.api} api: The number of params to set backend with '>' only allows 2, but received {len(vars_list)}." 
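# A hedged sketch of what the api_base.py change above affects: the code
# generator keeps a map from api.yaml attribute types to C++ parameter types,
# and 'Place' now renders as 'const Place&' in generated signatures. The map
# entries below are copied from the diff (the <int64_t> template argument of the
# vector entry is an assumption, since it is not visible here); the cpp_param
# helper is hypothetical and only illustrates the lookup.

ATTR_TYPE_MAP = {
    'bool': 'bool',
    'str': 'const std::string&',
    'Place': 'const Place&',            # was plain 'Place' before this patch
    'int64_t[]': 'const std::vector<int64_t>&',
}


def cpp_param(attr_name, yaml_type):
    """Render one generated C++ parameter, e.g. for a 'place' attribute."""
    return "{} {}".format(ATTR_TYPE_MAP.get(yaml_type, yaml_type), attr_name)


assert cpp_param('place', 'Place') == 'const Place& place'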
- assert (vars_list[0].strip() in self.attrs['names']) and (self.attrs['attr_info'][vars_list[0].strip()][0] == 'Place'), \ + assert (vars_list[0].strip() in self.attrs['names']) and (self.attrs['attr_info'][vars_list[0].strip()][0] == 'const Place&'), \ f"{self.api} api: When use '>' to set kernel backend, the first param should be a attribute with Place type." backend_select_code = f""" kernel_backend = ParseBackendWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()}); @@ -360,7 +360,7 @@ def gene_kernel_select(self) -> str: attr_layout_count = 0 attr_data_type_count = 0 for attr_name in attrs['names']: - if attrs['attr_info'][attr_name][0] == 'Place': + if attrs['attr_info'][attr_name][0] == 'const Place&': assert kernel['backend'] is not None, \ f"{api} api: When there is a parameter with 'Place' type in attributes, you must set backend of kernel manually." attr_backend_count = attr_backend_count + 1 @@ -420,7 +420,7 @@ def gene_kernel_select(self) -> str: if len(input_names) == 0: assert attr_backend_count > 0 and attr_data_type_count > 0, \ - f"{api} api: When there is no input tensor, the args must have 'Backend' and 'DataType'." + f"{api} api: When there is no input tensor, the args must have 'Place' and 'DataType'." kernel_select_args = "" for input_name in input_names: diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index 2187d4abb2d63..100d7ad78319b 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -27,6 +27,7 @@ kernel : func : sparse_coo_tensor layout : values + data_type : values backward : create_sparse_coo_tensor_grad - api : csr_values diff --git a/python/paddle/utils/code_gen/strings_api_gen.py b/python/paddle/utils/code_gen/strings_api_gen.py index d7117e9d54060..061ea6c3ceef9 100644 --- a/python/paddle/utils/code_gen/strings_api_gen.py +++ b/python/paddle/utils/code_gen/strings_api_gen.py @@ -225,7 +225,7 @@ def gene_kernel_select(self) -> str: assert len( vars_list ) == 2, f"{api} api: The number of params to set backend with '>' only allows 2, but received {len(vars_list)}." - assert (vars_list[0].strip() in attrs['names']) and (attrs['attr_info'][vars_list[0].strip()][0] == 'Place'), \ + assert (vars_list[0].strip() in attrs['names']) and (attrs['attr_info'][vars_list[0].strip()][0] == 'const Place&'), \ f"{api} api: When use '>' to set kernel backend, the first param should be a attribute with Place type." 
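# The assertions above validate yaml entries such as "backend : place > x"
# (shown in the api.yaml hunk earlier): exactly two names separated by '>',
# where the first must be a Place-typed attribute. A small sketch of that
# validation, assuming attr_info maps an attribute name to a (type, ...) tuple
# as in the generator; the function itself is illustrative, not the generator's
# real API.

def split_backend_config(api_name, config, attr_info):
    vars_list = config.split('>')
    assert len(vars_list) == 2, (
        "{} api: The number of params to set backend with '>' only allows 2, "
        "but received {}.".format(api_name, len(vars_list)))
    attr = vars_list[0].strip()
    assert attr in attr_info and attr_info[attr][0] == 'const Place&', (
        "{} api: When use '>' to set kernel backend, the first param should "
        "be an attribute with Place type.".format(api_name))
    return attr, vars_list[1].strip()


# e.g. the "backend : place > x" entry shown in the api.yaml hunk above:
print(split_backend_config('full_like_example', 'place > x',
                           {'place': ('const Place&',)}))
# ('place', 'x')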
kernel_select_code = kernel_select_code + f""" kernel_backend = ParseBackendWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()}); diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index ce67912eb2266..b0800a9cd845e 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -59,6 +59,16 @@ API_FILES=("CMakeLists.txt" "paddle/scripts/paddle_build.bat" "tools/windows/run_unittests.sh" "tools/parallel_UT_rule.py" + "python/paddle/fluid/dygraph/layers.py" + "paddle/fluid/eager/grad_node_info.h" + "paddle/fluid/eager/grad_node_info.cc" + "paddle/fluid/eager/grad_tensor_holder.h" + "paddle/fluid/eager/grad_tensor_holder.cc" + "paddle/fluid/eager/tensor_wrapper.h" + "paddle/fluid/eager/autograd_meta.cc" + "paddle/fluid/eager/autograd_meta.h" + "paddle/fluid/eager/backward.cc" + "paddle/fluid/eager/backward.h" ) approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` @@ -178,6 +188,9 @@ for API_FILE in ${API_FILES[*]}; do elif [ "${API_FILE}" == "python/paddle/fluid/parallel_executor.py" ]; then echo_line="You must have one RD (Xreki,luotao1,zhhsplendid) approval for ${API_FILE}, which manages the underlying code for PaddlePaddle.\n" check_approval 1 12538138 6836917 7913861 + elif [ "${API_FILE}" == "python/paddle/fluid/dygraph/layers.py" ] || [ "${API_FILE}" == "paddle/fluid/eager/grad_node_info.h" ] || [ "${API_FILE}" == "paddle/fluid/eager/grad_node_info.cc" ] || [ "${API_FILE}" == "paddle/fluid/eager/grad_tensor_holder.h" ] || [ "${API_FILE}" == "paddle/fluid/eager/grad_tensor_holder.cc" ] || [ "${API_FILE}" == "paddle/fluid/eager/tensor_wrapper.h" ] || [ "${API_FILE}" == "paddle/fluid/eager/autograd_meta.cc"] || [ "${API_FILE}" == "paddle/fluid/eager/autograd_meta.h"] || [ "${API_FILE}" == "paddle/fluid/eager/backward.cc"] || [ "${API_FILE}" == "paddle/fluid/eager/backward.h"]; then + echo_line="You must have one RD (JiabinYang,chenwhql,phlrain) approval for ${API_FILE}, which manages the underlying code for PaddlePaddle.\n" + check_approval JiabinYang chenwhql phlrain else echo_line="You must have one RD (XiaoguangHu01,chenwhql,zhiqiu,Xreki,luotao1,qili93) approval for ${API_FILE}, which manages the underlying code for fluid.\n" check_approval 1 46782768 12538138 6836917 22561442 6888866 16605440 diff --git a/tools/dockerfile/Dockerfile.mlu b/tools/dockerfile/Dockerfile.mlu index f7823738afc53..07535a637431e 100644 --- a/tools/dockerfile/Dockerfile.mlu +++ b/tools/dockerfile/Dockerfile.mlu @@ -2,9 +2,9 @@ # Update CNTOOLKIT_VERSION, CNNL_VERSION and CNCL_VERSION if using other versions # # Build: -# - CNTOOLKIT_VERSION 2.6.5-1 -# - CNNL_VERSION 1.8.3-1 -# - CNCL_VERSION 1.0.2-1 +# - CNTOOLKIT_VERSION 2.8.1-1 +# - CNNL_VERSION 1.9.3-1 +# - CNCL_VERSION 1.0.4-1 # # Download three packages from FTP (need to connect cambricon AE to get FTP url) # - cntoolkit_2.6.5-1.ubuntu18.04_amd64.deb @@ -21,9 +21,9 @@ # (get cncl pkg) # # docker build -f Dockerfile.mlu \ -# --build-arg CNTOOLKIT_VERSION=2.6.5-1 \ -# --build-arg CNNL_VERSION=1.8.3-1 \ -# --build-arg CNCL_VERSION=1.0.2-1 \ +# --build-arg CNTOOLKIT_VERSION=2.8.1-1 \ +# --build-arg CNNL_VERSION=1.9.3-1 \ +# --build-arg CNCL_VERSION=1.0.4-1 \ # -t paddlepaddle/paddle:latest-dev-mlu . 
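# The check_file_diff_approvals.sh hunk above adds the eager-mode sources to the
# list of guarded files and routes them to a fixed reviewer group. Reduced to a
# sketch, the gate is "changed guarded file -> required reviewers"; the file
# names and reviewer handles come from the diff, the Python form is an
# illustration only, not the script's real logic.

EAGER_GUARDED_FILES = {
    "python/paddle/fluid/dygraph/layers.py",
    "paddle/fluid/eager/grad_node_info.h",
    "paddle/fluid/eager/grad_node_info.cc",
    "paddle/fluid/eager/backward.cc",
    "paddle/fluid/eager/backward.h",
}


def required_reviewers(changed_files):
    # Any overlap with the guarded set triggers the mandatory approval check.
    if EAGER_GUARDED_FILES.intersection(changed_files):
        return ["JiabinYang", "chenwhql", "phlrain"]
    return []


print(required_reviewers(["paddle/fluid/eager/backward.cc"]))
# ['JiabinYang', 'chenwhql', 'phlrain']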
# # without mlu device: @@ -40,9 +40,9 @@ MAINTAINER PaddlePaddle Authors ENV WITH_GPU=OFF -ARG CNTOOLKIT_VERSION=2.6.5-1 -ARG CNNL_VERSION=1.8.3-1 -ARG CNCL_VERSION=1.0.2-1 +ARG CNTOOLKIT_VERSION=2.8.1-1 +ARG CNNL_VERSION=1.9.3-1 +ARG CNCL_VERSION=1.0.4-1 ARG CNTOOLKIT_PKG=cntoolkit_$CNTOOLKIT_VERSION.ubuntu18.04_amd64.deb ARG CNNL_PKG=cnnl_$CNNL_VERSION.ubuntu18.04_amd64.deb ARG CNCL_PKG=cncl_$CNCL_VERSION.ubuntu18.04_amd64.deb diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 39cc9d684584c..ddb960f7d529e 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -307,6 +307,7 @@ 'test_linear_interp_op', 'test_linear_interp_v2_op', 'test_linspace', + 'test_logspace', 'test_load_op', 'test_load_vars_shape_check', 'test_locality_aware_nms_op',
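# Closing usage note: 'test_logspace' is whitelisted above because this PR adds
# a paddle.logspace API. The calls below mirror the docstring added in
# python/paddle/tensor/creation.py; the commented values are taken from that
# docstring rather than re-run here.

import paddle

x = paddle.logspace(0, 10, 5, 2, 'float32')
# 5 points spaced evenly on a log scale from 2**0 to 2**10:
# [1., 5.65685415, 32., 181.01933289, 1024.]

y = paddle.logspace(0, 10, 1, 2, 'float32')
# num=1 keeps only base**start: [1.]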