diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index 75966399148d4..47e53e64f592b 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -6,5 +6,6 @@ add_subdirectory(imperative)
 add_subdirectory(operators)
 add_subdirectory(pybind)
 add_subdirectory(eager)
+add_subdirectory(jit)
 # NOTE: please add the subdirectory inference last.
 add_subdirectory(inference)
diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc
index 47e3476036d7e..f43493b10fe99 100644
--- a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc
+++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc
@@ -136,7 +136,7 @@ int32_t BrpcPsClient::CreateClient2ClientConnection(
     server_ip_port.append(":");
     server_ip_port.append(std::to_string(client_list[i].port));
     _client_channels[i].reset(new brpc::Channel());
-    if (_client_channels[i]->Init(server_ip_port.c_str(), "", &options) != 0) {
+    if (_client_channels[i]->Init(server_ip_port.c_str(), "", &options)) {
       VLOG(0) << "BrpcPSClient connect to Client:" << server_ip_port
               << " Failed! Try again.";
       std::string int_ip_port =
@@ -1195,7 +1195,8 @@ std::future<int32_t> BrpcPsClient::SendClient2ClientMsg(
     int msg_type, int to_client_id, const std::string &msg) {
   auto promise = std::make_shared<std::promise<int32_t>>();
   std::future<int32_t> fut = promise->get_future();
-  if (to_client_id >= _client_channels.size()) {
+  if (to_client_id >= 0 &&
+      static_cast<size_t>(to_client_id) >= _client_channels.size()) {
     VLOG(0) << "to_client_id is out of range clients, which size is "
             << _client_channels.size();
     promise->set_value(-1);
@@ -1778,7 +1779,7 @@ void BrpcPsClient::PushDenseTaskConsume() {
         });
         ++merge_count;
       }
-      for (uint32_t i = 0; i < merge_count; ++i) {
+      for (size_t i = 0; i < merge_count; ++i) {
         merge_status[i].wait();
       }
diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_server.cc b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc
index d859acbb42e44..4ca5f9c8207fe 100644
--- a/paddle/fluid/distributed/ps/service/brpc_ps_server.cc
+++ b/paddle/fluid/distributed/ps/service/brpc_ps_server.cc
@@ -713,7 +713,7 @@ int32_t BrpcPsService::CacheShuffle(Table *table,
   };
   std::vector<Table *> table_ptrs;
-  for (size_t i = 3; i < request.params_size(); ++i) {
+  for (int i = 3; i < request.params_size(); ++i) {
     int table_id = std::stoi(request.params(i));
     Table *table_ptr = _server->GetTable(table_id);
     table_ptrs.push_back(table_ptr);
diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.cc b/paddle/fluid/distributed/ps/service/communicator/communicator.cc
index c50f1d909cd95..edbfd06d55a54 100644
--- a/paddle/fluid/distributed/ps/service/communicator/communicator.cc
+++ b/paddle/fluid/distributed/ps/service/communicator/communicator.cc
@@ -681,7 +681,7 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
   if (tensor->lod().size() > 0) {
     for (size_t i = 0; i < tensor->lod()[0].size() - 1; ++i) {
-      for (int j = tensor->lod()[0][i]; j < tensor->lod()[0][i + 1];
+      for (size_t j = tensor->lod()[0][i]; j < tensor->lod()[0][i + 1];
            ++j, output_len += fea_dim) {
         uint64_t real_id = static_cast<uint64_t>(ids[j]);
         if (real_id == padding_id) {
@@ -727,7 +727,7 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
       ++input_idx;
     }
   }
-  CHECK(output_len == g_tensor->numel());
+  CHECK(static_cast<int64_t>(output_len) == g_tensor->numel());
 }
 std::vector<float *> push_g_vec(input_idx, nullptr);
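Most of the hunks above (and several more below) are `-Wsign-compare` cleanups: they align loop-index and comparison types so a signed value is never compared against an unsigned `size()`. A minimal standalone sketch of the pitfall and of the guard-then-cast pattern the diff adopts (illustrative, not Paddle code):

```cpp
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> channels(4);
  int id = -1;
  // A bare `id >= channels.size()` promotes `id` to unsigned, so -1 becomes
  // a huge value and the bounds check "passes" for the wrong reason; the
  // compiler warns under -Wsign-compare. The diff's pattern: check the sign
  // first, then cast explicitly.
  if (id >= 0 && static_cast<size_t>(id) >= channels.size()) {
    std::puts("id out of range");
  }
  // For loops, the fix is simply to iterate with a matching index type.
  for (size_t i = 0; i < channels.size(); ++i) {
    channels[i] = static_cast<int>(i);
  }
  return 0;
}
```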
diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc
index ce9397e511eb0..8128f2b2adbd9 100644
--- a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc
+++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc
@@ -547,7 +547,8 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers(
     seq.push_back(request_idx);
   }
   size_t remote_call_num = request_call_num;
-  if (request2server.size() != 0 && request2server.back() == rank) {
+  if (request2server.size() != 0 &&
+      static_cast<size_t>(request2server.back()) == rank) {
     remote_call_num--;
     local_buffers.resize(node_id_buckets.back().size());
     local_actual_sizes.resize(node_id_buckets.back().size());
@@ -582,7 +583,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers(
     for (size_t i = 0; i < node_num; i++) {
       if (fail_num > 0 && failed[seq[i]]) {
         size = 0;
-      } else if (request2server[seq[i]] != rank) {
+      } else if (static_cast<size_t>(request2server[seq[i]]) != rank) {
         res[seq[i]]->copy_and_forward(&size, sizeof(int));
       } else {
         size = local_actual_sizes[local_index++];
@@ -596,7 +597,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers(
     for (size_t i = 0; i < node_num; i++) {
       if (fail_num > 0 && failed[seq[i]]) {
         continue;
-      } else if (request2server[seq[i]] != rank) {
+      } else if (static_cast<size_t>(request2server[seq[i]]) != rank) {
         char temp[actual_size[i] + 1];
         res[seq[i]]->copy_and_forward(temp, actual_size[i]);
         cntl->response_attachment().append(temp, actual_size[i]);
diff --git a/paddle/fluid/distributed/ps/service/ps_client.cc b/paddle/fluid/distributed/ps/service/ps_client.cc
index a0216f2a7953a..2d02771a2cf8e 100644
--- a/paddle/fluid/distributed/ps/service/ps_client.cc
+++ b/paddle/fluid/distributed/ps/service/ps_client.cc
@@ -43,7 +43,7 @@ int32_t PSClient::Configure(
   const auto &work_param = _config.worker_param().downpour_worker_param();
-  for (size_t i = 0; i < work_param.downpour_table_param_size(); ++i) {
+  for (int i = 0; i < work_param.downpour_table_param_size(); ++i) {
     auto *accessor = CREATE_PSCORE_CLASS(
         ValueAccessor,
         work_param.downpour_table_param(i).accessor().accessor_class());
diff --git a/paddle/fluid/distributed/ps/service/ps_local_client.cc b/paddle/fluid/distributed/ps/service/ps_local_client.cc
index b6407ccebe52b..a52ed1996fff7 100644
--- a/paddle/fluid/distributed/ps/service/ps_local_client.cc
+++ b/paddle/fluid/distributed/ps/service/ps_local_client.cc
@@ -23,7 +23,7 @@ namespace distributed {
 int32_t PsLocalClient::Initialize() {
   const auto& downpour_param = _config.server_param().downpour_server_param();
   TableManager::Instance().Initialize();
-  for (size_t i = 0; i < downpour_param.downpour_table_param_size(); ++i) {
+  for (int i = 0; i < downpour_param.downpour_table_param_size(); ++i) {
     auto* table = CREATE_PSCORE_CLASS(
         Table, downpour_param.downpour_table_param(i).table_class());
     table->SetShard(0, 1);
diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc
index 255c0d3d655aa..fb65e74b62f6f 100644
--- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc
+++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc
@@ -51,7 +51,7 @@ void GraphPyService::add_table_feat_conf(std::string table_name,
   int feat_idx = table_feat_mapping[idx][feat_name];
   VLOG(0) << "table_name " << table_name << " mapping id " << idx;
   VLOG(0) << " feat name " << feat_name << " feat id" << feat_idx;
-  if (feat_idx < table_feat_conf_feat_name[idx].size()) {
+  if (static_cast<size_t>(feat_idx) < table_feat_conf_feat_name[idx].size()) {
     // override
     table_feat_conf_feat_name[idx][feat_idx] = feat_name;
     table_feat_conf_feat_dtype[idx][feat_idx] = feat_dtype;
diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h
index 7dd0340125693..877214121e5a0 100644
--- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h
+++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h
@@ -81,14 +81,14 @@ class GraphPyService {
     graph_proto->set_table_name("cpu_graph_table");
     graph_proto->set_use_cache(false);
-    for (int i = 0; i < id_to_edge.size(); i++)
+    for (size_t i = 0; i < id_to_edge.size(); i++)
       graph_proto->add_edge_types(id_to_edge[i]);
-    for (int i = 0; i < id_to_feature.size(); i++) {
+    for (size_t i = 0; i < id_to_feature.size(); i++) {
       graph_proto->add_node_types(id_to_feature[i]);
       auto feat_node = id_to_feature[i];
       ::paddle::distributed::GraphFeature* g_f =
           graph_proto->add_graph_feature();
-      for (int x = 0; x < table_feat_conf_feat_name[i].size(); x++) {
+      for (size_t x = 0; x < table_feat_conf_feat_name[i].size(); x++) {
         g_f->add_name(table_feat_conf_feat_name[i][x]);
         g_f->add_dtype(table_feat_conf_feat_dtype[i][x]);
         g_f->add_shape(table_feat_conf_feat_shape[i][x]);
diff --git a/paddle/fluid/distributed/ps/service/server.cc b/paddle/fluid/distributed/ps/service/server.cc
index a6e0f39474b06..e7b3271171ea4 100644
--- a/paddle/fluid/distributed/ps/service/server.cc
+++ b/paddle/fluid/distributed/ps/service/server.cc
@@ -76,7 +76,7 @@ int32_t PSServer::Configure(
   uint32_t barrier_table = UINT32_MAX;
   uint32_t global_step_table = UINT32_MAX;
-  for (size_t i = 0; i < downpour_param.downpour_table_param_size(); ++i) {
+  for (int i = 0; i < downpour_param.downpour_table_param_size(); ++i) {
     auto *table = CREATE_PSCORE_CLASS(
         Table, downpour_param.downpour_table_param(i).table_class());
diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc
index 55a9c794e8ead..d3af468482bfe 100644
--- a/paddle/fluid/distributed/ps/table/common_graph_table.cc
+++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc
@@ -1205,7 +1205,7 @@ uint32_t GraphTable::get_thread_pool_index_by_shard_index(int64_t shard_index) {
 int32_t GraphTable::clear_nodes(int type_id, int idx) {
   auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx];
-  for (int i = 0; i < search_shards.size(); i++) {
+  for (size_t i = 0; i < search_shards.size(); i++) {
     search_shards[i]->clear();
   }
   return 0;
@@ -1478,7 +1478,7 @@ std::vector<std::vector<int64_t>> GraphTable::get_all_id(int type_id, int idx,
   std::vector<std::vector<int64_t>> res(slice_num);
   auto &search_shards = type_id == 0 ? edge_shards[idx] : feature_shards[idx];
   std::vector<std::future<std::vector<int64_t>>> tasks;
-  for (int i = 0; i < search_shards.size(); i++) {
+  for (size_t i = 0; i < search_shards.size(); i++) {
     tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue(
         [&search_shards, i]() -> std::vector<int64_t> {
           return search_shards[i]->get_all_id();
diff --git a/paddle/fluid/distributed/ps/table/memory_dense_table.cc b/paddle/fluid/distributed/ps/table/memory_dense_table.cc
index ab1361eba050f..857850ce50b6a 100644
--- a/paddle/fluid/distributed/ps/table/memory_dense_table.cc
+++ b/paddle/fluid/distributed/ps/table/memory_dense_table.cc
@@ -81,8 +81,8 @@ int32_t MemoryDenseTable::InitializeValue() {
   fixed_len_params_dim_ = 0;
   for (int x = 0; x < size; ++x) {
-    int dim = common.dims()[x];
-    if (dim != param_dim_) {
+    auto& dim = common.dims()[x];
+    if (static_cast<int>(dim) != param_dim_) {
       fixed_len_params_dim_ += dim;
     } else {
       param_col_ids_.push_back(x);
diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc
index 237d0c9424b81..dc77a6c6c51e2 100644
--- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc
+++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc
@@ -625,7 +625,7 @@ int32_t SSDSparseTable::Load(const std::string& path,
 }
 // Load the data files in [start_idx, end_idx) under the path directory
-int32_t SSDSparseTable::Load(size_t start_idx, int end_idx,
+int32_t SSDSparseTable::Load(size_t start_idx, size_t end_idx,
                              const std::vector<std::string>& file_list,
                              const std::string& param) {
   if (start_idx >= file_list.size()) {
@@ -699,7 +699,8 @@ int32_t SSDSparseTable::Load(size_t start_idx, size_t end_idx,
           ssd_values.emplace_back(std::make_pair((char*)data_buffer_ptr,
                                                  value_size * sizeof(float)));
           data_buffer_ptr += feature_value_size;
-          if (ssd_keys.size() == FLAGS_pserver_load_batch_size) {
+          if (static_cast<int>(ssd_keys.size()) ==
+              FLAGS_pserver_load_batch_size) {
            _db->put_batch(local_shard_id, ssd_keys, ssd_values,
                           ssd_keys.size());
            ssd_keys.clear();
diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h
index e6be77a4ba924..3e4d3afe59c3a 100644
--- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h
+++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h
@@ -79,7 +79,7 @@ class SSDSparseTable : public MemorySparseTable {
   virtual int32_t Load(const std::string& path,
                        const std::string& param) override;
   // Load the data files in [start_idx, end_idx) under the path directory
-  virtual int32_t Load(size_t start_idx, int end_idx,
+  virtual int32_t Load(size_t start_idx, size_t end_idx,
                        const std::vector<std::string>& file_list,
                        const std::string& param);
   int64_t LocalSize();
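For context on the Load change above: the loader accumulates key/value pairs and flushes them to the SSD backend whenever the batch reaches FLAGS_pserver_load_batch_size. A condensed sketch of that batch-flush pattern, with illustrative names (`KVStore`/`put_batch` here are stand-ins, not Paddle's API):

```cpp
#include <cstdint>
#include <utility>
#include <vector>

struct KVStore {
  void put_batch(const std::vector<uint64_t>& keys,
                 const std::vector<std::pair<char*, size_t>>& values) {
    // write the batch to storage here
  }
};

void LoadBatched(KVStore* db, const std::vector<uint64_t>& all_keys,
                 const std::vector<std::pair<char*, size_t>>& all_values,
                 int batch_size) {
  std::vector<uint64_t> keys;
  std::vector<std::pair<char*, size_t>> values;
  for (size_t i = 0; i < all_keys.size(); ++i) {
    keys.push_back(all_keys[i]);
    values.push_back(all_values[i]);
    // Flush once the batch is full. Note the int cast, so comparing against
    // an int flag does not trip -Wsign-compare -- the same idiom as the hunk.
    if (static_cast<int>(keys.size()) == batch_size) {
      db->put_batch(keys, values);
      keys.clear();
      values.clear();
    }
  }
  if (!keys.empty()) db->put_batch(keys, values);  // flush the tail
}
```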
diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc
index 8d6276733e0e5..bddda8f8fff8a 100644
--- a/paddle/fluid/distributed/ps/wrapper/fleet.cc
+++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc
@@ -536,7 +536,7 @@ void FleetWrapper::PushSparseFromTensorAsync(
     output_len = 0;
     if (tensor->lod().size() > 0) {
-      for (int i = 0; i < tensor->lod()[0].size() - 1; ++i) {
+      for (size_t i = 0; i < tensor->lod()[0].size() - 1; ++i) {
         for (size_t j = tensor->lod()[0][i]; j < tensor->lod()[0][i + 1];
              ++j, output_len += fea_dim) {
           uint64_t real_id = static_cast<uint64_t>(ids[j]);
@@ -566,7 +566,7 @@ void FleetWrapper::PushSparseFromTensorAsync(
         }
       }
     } else {
-      for (int i = 0; i < len; ++i, output_len += fea_dim) {
+      for (size_t i = 0; i < len; ++i, output_len += fea_dim) {
         uint64_t real_id = static_cast<uint64_t>(ids[i]);
         if (real_id == padding_id) {
           continue;
         }
@@ -592,7 +592,7 @@ void FleetWrapper::PushSparseFromTensorAsync(
       ++input_idx;
     }
   }
-  CHECK(output_len == g_tensor->numel());
+  CHECK(static_cast<int64_t>(output_len) == g_tensor->numel());
 }
 std::vector<float *> push_g_vec(input_idx, nullptr);
diff --git a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
index bade56f239f65..7173c76287096 100644
--- a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
+++ b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
@@ -295,7 +295,7 @@ void RunBrpcPushSparse() {
       fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size(), true);
   pull_update_status.wait();
-  for (size_t idx = 0; idx < tensor->numel(); ++idx) {
+  for (int64_t idx = 0; idx < tensor->numel(); ++idx) {
     EXPECT_FLOAT_EQ(fea_temp_values[idx], fea_values[idx] - 1.0);
   }
diff --git a/paddle/fluid/distributed/test/ctr_accessor_test.cc b/paddle/fluid/distributed/test/ctr_accessor_test.cc
index 51254391a4283..bb25fd6991665 100644
--- a/paddle/fluid/distributed/test/ctr_accessor_test.cc
+++ b/paddle/fluid/distributed/test/ctr_accessor_test.cc
@@ -222,15 +222,15 @@ TEST(downpour_feature_value_accessor_test, test_update) {
     v.embed_w = value[i][5];
     int idx = 6;
-    for (auto j = 0u; j < acc->common_feature_value.embed_sgd_dim; ++j) {
+    for (int j = 0; j < acc->common_feature_value.embed_sgd_dim; ++j) {
       v.embed_g2sum.push_back(value[i][idx + j]);
     }
     idx += acc->common_feature_value.embed_sgd_dim;
-    for (auto j = 0u; j < acc->common_feature_value.embedx_dim; ++j) {
+    for (int j = 0; j < acc->common_feature_value.embedx_dim; ++j) {
       v.embedx_w.push_back(value[i][idx + j]);
     }
     idx += acc->common_feature_value.embedx_dim;
-    for (auto j = 0u; j < acc->common_feature_value.embedx_sgd_dim; ++j) {
+    for (int j = 0; j < acc->common_feature_value.embedx_sgd_dim; ++j) {
       v.embedx_g2sum.push_back(value[i][idx + j]);
     }
@@ -239,7 +239,7 @@ TEST(downpour_feature_value_accessor_test, test_update) {
     push_v.show = grad[i][1];
     push_v.click = grad[i][2];
     push_v.embed_g = grad[i][3];
-    for (auto j = 0; j < parameter.embedx_dim(); ++j) {
+    for (int j = 0; j < parameter.embedx_dim(); ++j) {
       push_v.embedx_g.push_back(grad[i][4 + j]);
     }
diff --git a/paddle/fluid/distributed/test/memory_sparse_table_test.cc b/paddle/fluid/distributed/test/memory_sparse_table_test.cc
index 1689b7716bbc4..485d81a7d6856 100644
--- a/paddle/fluid/distributed/test/memory_sparse_table_test.cc
+++ b/paddle/fluid/distributed/test/memory_sparse_table_test.cc
@@ -142,7 +142,7 @@ TEST(MemorySparseTable, SGD) {
   // table->PullSparse(pull_values.data(), value);
   for (size_t i = 0; i < init_keys.size(); ++i) {
-    for (size_t j = 2; j < emb_dim + 3; ++j) {
+    for (int j = 2; j < emb_dim + 3; ++j) {
       auto update_val = init_values[i * (emb_dim + 1) + j] -
                         0.1 * total_gradients[3 + i * (emb_dim + 4) + j];
       VLOG(3) << total_gradients[i * (emb_dim + 4) + j + 3] << ":"
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index bb495860c90e5..76331bfe7c90f 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1943,6 +1943,7 @@ USE_TRT_CONVERTER(multiclass_nms);
 USE_TRT_CONVERTER(multiclass_nms3);
 USE_TRT_CONVERTER(nearest_interp);
 USE_TRT_CONVERTER(nearest_interp_v2);
+USE_TRT_CONVERTER(bilinear_interp_v2);
 USE_TRT_CONVERTER(reshape);
 USE_TRT_CONVERTER(reduce_sum);
 USE_TRT_CONVERTER(gather_nd);
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 4c52d91fa1259..e6c372e205b41 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -52,6 +52,7 @@ list(
   conv3d_op.cc
   mish_op.cc
   nearest_interp_v2_op.cc
+  bilinear_interp_v2_op.cc
   pool3d_op.cc
   deformable_conv_op.cc
   preln_emb_eltwise_layernorm.cc
diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
index 2ef8ec16c76df..9005557a51f3a 100644
--- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
@@ -49,14 +49,30 @@ class ActivationOpConverter : public OpConverter {
         << "convert a fluid Activation op to tensorrt activation layer whose "
            "type is "
         << op_type_;
-    const nvinfer1::ITensor* input_tensor =
-        engine_->GetITensor(op_desc.Input("X")[0]);
+    auto* input_tensor = engine_->GetITensor(op_desc.Input("X")[0]);
     auto op_pair = ops.find(op_type_);
-
-    nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER(
-        engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
-        op_pair->second);
+    nvinfer1::IActivationLayer* layer = nullptr;
+    if (op_type_ == "softplus") {
+      const float beta = op_desc.HasAttr("beta")
+                             ? BOOST_GET_CONST(float, op_desc.GetAttr("beta"))
+                             : 1.0f;
+      const float threshold =
+          op_desc.HasAttr("threshold")
+              ? BOOST_GET_CONST(float, op_desc.GetAttr("threshold"))
+              : 20.0f;
+      auto* layer_clip = TRT_ENGINE_ADD_LAYER(
+          engine_, Activation, *input_tensor, nvinfer1::ActivationType::kCLIP);
+      layer_clip->setAlpha(-3.40282e+038);
+      layer_clip->setBeta(threshold / beta);
+      layer = TRT_ENGINE_ADD_LAYER(engine_, Activation,
+                                   *layer_clip->getOutput(0), op_pair->second);
+      layer->setAlpha(1.0f / beta);
+      layer->setBeta(beta);
+    } else {
+      layer = TRT_ENGINE_ADD_LAYER(engine_, Activation, *input_tensor,
+                                   op_pair->second);
+    }
 #if IS_TRT_VERSION_GE(5130)
     // max(alpha, min(beta, x))
@@ -64,6 +80,41 @@ class ActivationOpConverter : public OpConverter {
       layer->setAlpha(0.);
       layer->setBeta(6.);
     }
+    if (op_type_ == "elu") {
+      const float alpha = op_desc.HasAttr("alpha")
+                              ? BOOST_GET_CONST(float, op_desc.GetAttr("alpha"))
+                              : 1.0f;
+      layer->setAlpha(alpha);
+    }
+    if (op_type_ == "selu") {
+      const float alpha = op_desc.HasAttr("alpha")
+                              ? BOOST_GET_CONST(float, op_desc.GetAttr("alpha"))
+                              : 1.0507009873554804934193349852946;
+      const float scale = op_desc.HasAttr("scale")
+                              ? BOOST_GET_CONST(float, op_desc.GetAttr("scale"))
+                              : 1.6732632423543772848170429916717;
+      layer->setAlpha(alpha);
+      layer->setBeta(scale);
+    }
+    if (op_type_ == "stanh") {
+      const float scale_a =
+          op_desc.HasAttr("scale_a")
+              ? BOOST_GET_CONST(float, op_desc.GetAttr("scale_a"))
+              : 0.67f;
+      const float scale_b =
+          op_desc.HasAttr("scale_b")
+              ? BOOST_GET_CONST(float, op_desc.GetAttr("scale_b"))
+              : 1.7159f;
+      layer->setAlpha(scale_b);
+      layer->setBeta(scale_a);
+    }
+    if (op_type_ == "thresholded_relu") {
+      const float threshold =
+          op_desc.HasAttr("threshold")
+              ? BOOST_GET_CONST(float, op_desc.GetAttr("threshold"))
+              : 1.0f;
+      layer->setAlpha(threshold);
+    }
 #endif
     auto output_name = op_desc.Output("Out")[0];
@@ -83,8 +134,13 @@ const std::unordered_map<std::string, nvinfer1::ActivationType>
         {"tanh", nvinfer1::ActivationType::kTANH},
 #if IS_TRT_VERSION_GE(5130)
         {"relu6", nvinfer1::ActivationType::kCLIP},
+        {"elu", nvinfer1::ActivationType::kELU},
+        {"selu", nvinfer1::ActivationType::kSELU},
+        {"softsign", nvinfer1::ActivationType::kSOFTSIGN},
+        {"softplus", nvinfer1::ActivationType::kSOFTPLUS},
+        {"stanh", nvinfer1::ActivationType::kSCALED_TANH},
+        {"thresholded_relu", nvinfer1::ActivationType::kTHRESHOLDED_RELU}};
 #endif
-};

 class ReluOpConverter : public ActivationOpConverter {
  public:
@@ -101,11 +157,43 @@ class TanhOpConverter : public ActivationOpConverter {
   TanhOpConverter() { op_type_ = "tanh"; }
 };

+#if IS_TRT_VERSION_GE(5130)
 class Relu6OpConverter : public ActivationOpConverter {
  public:
   Relu6OpConverter() { op_type_ = "relu6"; }
 };

+class EluOpConverter : public ActivationOpConverter {
+ public:
+  EluOpConverter() { op_type_ = "elu"; }
+};
+
+class SeluOpConverter : public ActivationOpConverter {
+ public:
+  SeluOpConverter() { op_type_ = "selu"; }
+};
+
+class SoftsignOpConverter : public ActivationOpConverter {
+ public:
+  SoftsignOpConverter() { op_type_ = "softsign"; }
+};
+
+class SoftplusOpConverter : public ActivationOpConverter {
+ public:
+  SoftplusOpConverter() { op_type_ = "softplus"; }
+};
+
+class STanhOpConverter : public ActivationOpConverter {
+ public:
+  STanhOpConverter() { op_type_ = "stanh"; }
+};
+
+class ThreasholdedReluOpConverter : public ActivationOpConverter {
+ public:
+  ThreasholdedReluOpConverter() { op_type_ = "thresholded_relu"; }
+};
+#endif
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
@@ -113,4 +201,12 @@ class Relu6OpConverter : public ActivationOpConverter {
 REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);
 REGISTER_TRT_OP_CONVERTER(sigmoid, SigmoidOpConverter);
 REGISTER_TRT_OP_CONVERTER(tanh, TanhOpConverter);
+#if IS_TRT_VERSION_GE(5130)
 REGISTER_TRT_OP_CONVERTER(relu6, Relu6OpConverter);
+REGISTER_TRT_OP_CONVERTER(elu, EluOpConverter);
+REGISTER_TRT_OP_CONVERTER(selu, SeluOpConverter);
+REGISTER_TRT_OP_CONVERTER(softsign, SoftsignOpConverter);
+REGISTER_TRT_OP_CONVERTER(softplus, SoftplusOpConverter);
+REGISTER_TRT_OP_CONVERTER(stanh, STanhOpConverter);
+REGISTER_TRT_OP_CONVERTER(thresholded_relu, ThreasholdedReluOpConverter);
+#endif
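The softplus branch above deserves a note: Paddle defines softplus(x) = (1/beta) * log(1 + exp(beta * x)) and switches to the identity once beta * x exceeds `threshold`, while TensorRT's kSOFTPLUS activation computes alpha * log(exp(beta * x) + 1). The converter therefore sets alpha = 1/beta, keeps beta, and prepends a kCLIP layer capping the input at threshold / beta so the exponential cannot overflow. A scalar reference of the Paddle-side semantics, as a sketch for intuition (not part of the PR):

```cpp
#include <cmath>

// Reference semantics of Paddle's softplus attributes (beta, threshold).
float SoftplusRef(float x, float beta = 1.0f, float threshold = 20.0f) {
  if (beta * x > threshold) return x;  // linear regime; exp() would overflow
  return std::log1p(std::exp(beta * x)) / beta;  // (1/beta)*log(1+e^(beta*x))
}
```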
diff --git a/paddle/fluid/inference/tensorrt/convert/bilinear_interp_v2_op.cc b/paddle/fluid/inference/tensorrt/convert/bilinear_interp_v2_op.cc
new file mode 100644
index 0000000000000..f0e56082b8f77
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/bilinear_interp_v2_op.cc
@@ -0,0 +1,133 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class BilinearInterpolateV2OpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(3) << "convert a fluid bilinear_interp_v2 op";
+
+    framework::OpDesc op_desc(op, nullptr);
+
+    std::string input_name = op_desc.Input("X").front();
+    std::string output_name = op_desc.Output("Out").front();
+
+    auto input = engine_->GetITensor(input_name);
+
+    auto data_layout = framework::StringToDataLayout(
+        BOOST_GET_CONST(std::string, op_desc.GetAttr("data_layout")));
+    auto interp_method =
+        BOOST_GET_CONST(std::string, op_desc.GetAttr("interp_method"));
+    bool align_corners =
+        BOOST_GET_CONST(bool, op_desc.GetAttr("align_corners"));
+    auto align_mode = BOOST_GET_CONST(int, op_desc.GetAttr("align_mode"));
+
+    auto resize_inputs = op_desc.Inputs();
+    auto input_names = op_desc.Input("X");
+    auto out_h = BOOST_GET_CONST(int, op_desc.GetAttr("out_h"));
+    auto out_w = BOOST_GET_CONST(int, op_desc.GetAttr("out_w"));
+
+    auto layer = TRT_ENGINE_ADD_LAYER(engine_, Resize, *input);
+    if (align_mode == 0 && !align_corners) {
+      layer->setResizeMode(nvinfer1::ResizeMode::kLINEAR);
+    }
+
+    auto in_dim = input->getDimensions();
+    float scale_h = 1.f;
+    float scale_w = 1.f;
+
+    // Scales priority: Scale(tensor) > scale(attr) > out_d/out_h/out_w(attr)
+    bool has_scale_input_attr =
+        (resize_inputs.find("Scale") != resize_inputs.end());
+    bool has_scale_input =
+        has_scale_input_attr && (op_desc.Input("Scale").size() > 0);
+    if (has_scale_input) {
+      auto* scale_var = scope.FindVar(op_desc.Input("Scale")[0]);
+      auto* scale_tensor = scale_var->GetMutable<framework::LoDTensor>();
+      auto* scale_d = scale_tensor->data<float>();
+      scale_h = scale_d[0];
+      scale_w = scale_d[1];
+    } else {
+      const std::vector<float> scale_attr =
+          BOOST_GET_CONST(std::vector<float>, op_desc.GetAttr("scale"));
+      if (scale_attr.size() > 1) {
+        scale_h = scale_attr[0];
+        scale_w = scale_attr[1];
+      }
+    }
+
+    // The axes differ between static and dynamic shape mode
+    bool with_dynamic = engine_->with_dynamic_shape();
+    int h_axis = (data_layout == framework::DataLayout::kNCHW) + with_dynamic;
+    int w_axis =
+        (data_layout == framework::DataLayout::kNCHW) + 1 + with_dynamic;
+
+    if (scale_w > 0. && scale_h > 0.) {
+      out_h = static_cast<int>(in_dim.d[h_axis] * scale_h);
+      out_w = static_cast<int>(in_dim.d[w_axis] * scale_w);
+    }
+
+    if (out_h > 0 && out_w > 0) {
+      scale_h =
+          static_cast<float>(out_h) / static_cast<float>(in_dim.d[h_axis]);
+      scale_w =
+          static_cast<float>(out_w) / static_cast<float>(in_dim.d[w_axis]);
+    }
+
+    std::vector<float> scales;
+
+    if (engine_->with_dynamic_shape()) {
+      scales.push_back(1.f);
+    }
+
+    if (data_layout == framework::DataLayout::kNCHW) {
+      scales.push_back(1.f);
+      scales.push_back(scale_h);
+      scales.push_back(scale_w);
+    } else if (data_layout == framework::DataLayout::kNHWC) {
+      scales.push_back(scale_h);
+      scales.push_back(scale_w);
+      scales.push_back(1.f);
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Data layout must be NCHW or NHWC."));
+    }
+
+    layer->setScales(scales.data(), scales.size());
+    RreplenishLayerAndOutput(layer, "bilinear_interp_v2", {output_name},
+                             test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(bilinear_interp_v2, BilinearInterpolateV2OpConverter);
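The scale resolution above follows the priority Scale(tensor) > scale(attr) > out_h/out_w(attr). A condensed sketch of the resulting arithmetic for an NCHW input (NHWC only reorders the last three entries; this mirrors the converter rather than replacing it):

```cpp
#include <vector>

// Derive the per-axis resize scales for an input of shape [N, C, H, W].
std::vector<float> ComputeScales(int in_h, int in_w, float scale_h,
                                 float scale_w, int out_h, int out_w,
                                 bool dynamic_shape) {
  // Explicit scales win; otherwise fall back to the out_h/out_w attributes.
  if (scale_h > 0.f && scale_w > 0.f) {
    out_h = static_cast<int>(in_h * scale_h);
    out_w = static_cast<int>(in_w * scale_w);
  }
  if (out_h > 0 && out_w > 0) {
    scale_h = static_cast<float>(out_h) / static_cast<float>(in_h);
    scale_w = static_cast<float>(out_w) / static_cast<float>(in_w);
  }
  std::vector<float> scales;
  if (dynamic_shape) scales.push_back(1.f);  // leading batch axis
  scales.push_back(1.f);                     // channel axis (NCHW)
  scales.push_back(scale_h);
  scales.push_back(scale_w);
  return scales;
}
```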
diff --git a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc
index e6422522e5018..ed113798a7325 100644
--- a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc
@@ -39,25 +39,60 @@ class ShuffleChannelOpConverter : public OpConverter {
     // Declare inputs
     auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
     auto input_dims = input->getDimensions();
-
-    int c = input_dims.d[0];
-    int h = input_dims.d[1];
-    int w = input_dims.d[2];
+    auto output_name = op_desc.Output("Out")[0];
     int group = BOOST_GET_CONST(int, op_desc.GetAttr("group"));

-    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
-    nvinfer1::Dims4 reshape_dim(group, c / group, h, w);
-    layer->setReshapeDimensions(reshape_dim);
-    layer->setSecondTranspose({1, 0, 2, 3});
-    auto* output = layer->getOutput(0);
+#if IS_TRT_VERSION_GE(8000)
+    if (engine_->with_dynamic_shape()) {
+      auto* input_shape_tensor = Shape(input);
+      auto* channel_shape_tensor = GetEleTensorOfShape(input_shape_tensor, 1);
+      auto* group_tensor =
+          Add1DConstantLayer(group, output_name + "_group_tensor_");
+      auto* new_channel_shape_tensor = Div(channel_shape_tensor, group_tensor);
+      std::vector<int32_t> shape_dim3{0, 2, 3};
+      auto* shape_dim3_tensor = Gather(input_shape_tensor, shape_dim3);

-    auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *output);
-    nvinfer1::Dims3 reshape_dim2(c, h, w);
-    reshape_layer->setReshapeDimensions(reshape_dim2);
+      std::vector<nvinfer1::ITensor*> itensors;
+      itensors.push_back(shape_dim3_tensor);
+      itensors.push_back(group_tensor);
+      itensors.push_back(new_channel_shape_tensor);
+      auto* reshape_tensor = Concat(itensors);

-    auto output_name = op_desc.Output("Out")[0];
-    RreplenishLayerAndOutput(reshape_layer, "shuffle_channel", {output_name},
-                             test_mode);
+      auto* reshape_layer =
+          TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *reshape_tensor);
+      nvinfer1::Permutation transpose_new_input{0, 3, 4, 1, 2};
+      reshape_layer->setSecondTranspose(transpose_new_input);
+
+      auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
+      layer->setInput(1, *(reshape_layer->getOutput(0)));
+      nvinfer1::Permutation transpose_embed{0, 2, 1, 3, 4};
+      layer->setSecondTranspose(transpose_embed);
+      auto* output = layer->getOutput(0);
+      auto* output_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *output);
+      output_layer->setInput(1, *input_shape_tensor);
+
+      RreplenishLayerAndOutput(output_layer, "shuffle_channel", {output_name},
+                               test_mode);
+    }
+#endif
+    if (!engine_->with_dynamic_shape()) {
+      int c = input_dims.d[0];
+      int h = input_dims.d[1];
+      int w = input_dims.d[2];
+
+      auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
+      nvinfer1::Dims4 reshape_dim(group, c / group, h, w);
+      layer->setReshapeDimensions(reshape_dim);
+      layer->setSecondTranspose({1, 0, 2, 3});
+      auto* output = layer->getOutput(0);
+
+      auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *output);
+      nvinfer1::Dims3 reshape_dim2(c, h, w);
+      reshape_layer->setReshapeDimensions(reshape_dim2);
+
+      RreplenishLayerAndOutput(reshape_layer, "shuffle_channel", {output_name},
+                               test_mode);
+    }
   }
 };
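A quick way to sanity-check the dynamic-shape branch above: the two reshapes plus the transpose implement the standard channel-shuffle permutation, [C] -> [g, C/g] -> transpose -> [C/g, g] -> [C]. A standalone check (not TRT code):

```cpp
#include <cstdio>
#include <vector>

int main() {
  const int C = 6, g = 2;
  std::vector<int> shuffled(C);
  for (int a = 0; a < g; ++a)        // group index
    for (int b = 0; b < C / g; ++b)  // channel index within the group
      shuffled[b * g + a] = a * (C / g) + b;
  for (int c : shuffled) std::printf("%d ", c);  // prints: 0 3 1 4 2 5
  return 0;
}
```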
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 6ce9b9c0bf85a..d6aa04612d648 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -73,6 +73,12 @@ struct SimpleOpTypeSetTeller : public Teller {
       "conv2d_fusion",
       "pool2d",
       "relu",
+      "elu",
+      "selu",
+      "softsign",
+      "softplus",
+      "stanh",
+      "thresholded_relu",
       "exp",
       "log",
       "sqrt",
@@ -138,6 +144,7 @@ struct SimpleOpTypeSetTeller : public Teller {
       "conv3d_transpose",
       "mish",
       "nearest_interp_v2",
+      "bilinear_interp_v2",
       "pool3d",
       "deformable_conv",
       "relu6",
@@ -163,6 +170,12 @@ struct SimpleOpTypeSetTeller : public Teller {
       "conv2d_fusion",
       "pool2d",
       "relu",
+      "elu",
+      "selu",
+      "softsign",
+      "softplus",
+      "stanh",
+      "thresholded_relu",
       "exp",
       "log",
       "sqrt",
@@ -227,6 +240,7 @@ struct SimpleOpTypeSetTeller : public Teller {
       "conv3d",
       "conv3d_transpose",
       "mish",
+      "bilinear_interp_v2",
       "nearest_interp_v2",
       "pool3d",
       "deformable_conv",
@@ -261,30 +275,16 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
     return false;

   for (auto& teller : tellers_) {
-    std::unordered_set<std::string> act_op_list = {"relu",
-                                                   "elu",
-                                                   "selu",
-                                                   "softsign",
-                                                   "softplus",
-                                                   "stanh",
-                                                   "thresholded_relu",
-                                                   "exp",
-                                                   "log",
-                                                   "sqrt",
-                                                   "abs",
-                                                   "sin",
-                                                   "cos",
-                                                   "tan",
-                                                   "sinh",
-                                                   "cosh",
-                                                   "asin",
-                                                   "acos",
-                                                   "atan",
-                                                   "asinh",
-                                                   "atanh",
-                                                   "ceil",
-                                                   "floor",
-                                                   "erf"};
+    std::unordered_set<std::string> act_op_list = {
+        "relu",     "relu6",  "sigmoid",
+        "elu",      "selu",   "softsign",
+        "softplus", "stanh",  "thresholded_relu",
+        "exp",      "log",    "sqrt",
+        "abs",      "sin",    "cos",
+        "tan",      "tanh",   "sinh",
+        "cosh",     "asin",   "acos",
+        "atan",     "asinh",  "atanh",
+        "ceil",     "floor",  "erf"};
     if (act_op_list.find(op_type) != act_op_list.end()) {
       auto* block = desc.Block();
       if (block == nullptr) {
@@ -877,6 +877,99 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
       }
     }

+    if (op_type == "bilinear_interp_v2") {
+      std::vector<std::string> attrs{"data_layout",   "interp_method",
+                                     "align_corners", "scale",
+                                     "out_h",         "out_w"};
+      for (auto const attr : attrs) {
+        if (!desc.HasAttr(attr)) {
+          VLOG(3) << "The op_type " << op_type << " doesn't have the attr "
+                  << attr << " and return false";
+          return false;
+        }
+      }
+
+      auto resize_inputs = desc.Inputs();
+      if (resize_inputs.find("SizeTensor") != resize_inputs.end()) {
+        if (desc.Input("SizeTensor").size() >= 1) {
+          VLOG(3)
+              << "The Paddle-TRT doesn't support the SizeTensor for op_type "
+              << op_type;
+          return false;
+        }
+      }
+
+      if (resize_inputs.find("OutSize") != resize_inputs.end()) {
+        if (desc.Input("OutSize").size() >= 1) {
+          VLOG(3) << "The Paddle-TRT doesn't support the OutSize for op_type "
+                  << op_type;
+          return false;
+        }
+      }
+
+      auto data_layout = framework::StringToDataLayout(
+          BOOST_GET_CONST(std::string, desc.GetAttr("data_layout")));
+      if (data_layout != framework::DataLayout::kNCHW &&
+          data_layout != framework::DataLayout::kNHWC) {
+        VLOG(3) << "The op_type " << op_type
+                << " is not NCHW or NHWC, return false";
+        return false;
+      }
+      auto interp_method =
+          BOOST_GET_CONST(std::string, desc.GetAttr("interp_method"));
+      if (interp_method != "bilinear") {
+        VLOG(3) << "The interp_method of op_type " << op_type
+                << " is not bilinear";
+        return false;
+      }
+
+      auto align_corners = BOOST_GET_CONST(bool, desc.GetAttr("align_corners"));
+      if (align_corners != false) {
+        VLOG(3)
+            << "The bilinear_interp_v2 only supports align_corners with false.";
+        return false;
+      }
+
+      bool has_scale_input_size =
+          (resize_inputs.find("Scale") != resize_inputs.end());
+
+      if (has_scale_input_size && desc.Input("Scale").size() != 1) {
+        const std::vector<float> scale =
+            BOOST_GET_CONST(std::vector<float>, desc.GetAttr("scale"));
+        if (scale.size() <= 1) {
+          if (!desc.HasAttr("out_h") || !desc.HasAttr("out_w")) {
+            VLOG(3) << "The op_type " << op_type
+                    << " doesn't have Scale and the scale size <=1 and without "
+                       "out_h / out_w, it will return false";
+            return false;
+          }
+          auto out_h = BOOST_GET_CONST(int, desc.GetAttr("out_h"));
+          auto out_w = BOOST_GET_CONST(int, desc.GetAttr("out_w"));
+          if (!(out_h <= 0 && out_w <= 0)) {
+            if (out_h <= 0) {
+              VLOG(3) << "The op_type " << op_type
+                      << "'s out_h must be greater than 0 if scale is not set.";
+              return false;
+            }
+            if (out_w <= 0) {
+              VLOG(3) << "The op_type " << op_type
+                      << "'s out_w must be greater than 0 if scale is not set.";
+              return false;
+            }
+          }
+        } else {
+          for (size_t i = 0; i < scale.size(); i++) {
+            if (scale[i] <= 0 && with_dynamic_shape) {
+              VLOG(3) << "dynamic shape not support Attr(scale[" << i << "]) "
+                      << scale[i]
+                      << " less than 1 and Input(Scale) vector not set.";
+              return false;
+            }
+          }
+        }
+      }
+    }
+
     if (op_type == "hard_swish") {
       if (desc.Input("X").size() != 1) {
         VLOG(3) << "HardSwish op has only 1 input, but got "
@@ -1511,11 +1604,14 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
     }

     if (op_type == "shuffle_channel") {
+#if !IS_TRT_VERSION_GE(8000)
       if (with_dynamic_shape) {
         VLOG(3) << "You are running the TRT Dynamic Shape mode, "
-                   "the shuffle_channel op does not support dynamic shape yet";
+                   "the shuffle_channel op does not support dynamic shape "
+                   "for TRT versions below 8.0 yet";
         return false;
       }
+#endif
     }

     if (op_type == "skip_layernorm") {
diff --git a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h
index f27b66b03f544..c53ae6d118470 100644
--- a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h
@@ -14,8 +14,6 @@

 #pragma once

-#include <thrust/device_vector.h>
-
 #include
 #include
 #include
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
index 1cfc9fade7b15..0150564e58206 100644
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include
+#include <thrust/device_vector.h>

 #include

@@ -63,9 +64,7 @@ void SplitPlugin::shareData(const SplitPlugin* another) {
   inner_cols_ = another->inner_cols_;
   same_shape_ = another->same_shape_;
   axis_shape_ = another->axis_shape_;
-  d_segment_offsets_ = another->d_segment_offsets_;
   segment_offsets_ = another->segment_offsets_;
-  d_output_ptrs_.resize(another->d_output_ptrs_.size(), nullptr);
 }

 int SplitPlugin::initialize() TRT_NOEXCEPT {
@@ -93,9 +92,7 @@ int SplitPlugin::initialize() TRT_NOEXCEPT {
     segment_offsets.push_back(segment_offsets.back() + output_length_[i]);
   }
   axis_shape_ = dims.d[axis_];
-  d_segment_offsets_ = segment_offsets;
   segment_offsets_ = std::move(segment_offsets);
-  d_output_ptrs_.resize(this->getNbOutputs(), nullptr);
   return 0;
 }

@@ -133,13 +130,18 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
                          void* const* outputs, void* workspace,
                          cudaStream_t stream) TRT_NOEXCEPT {
 #endif
+  // These two thrust variables are declared here, not in the .h file,
+  // to avoid a compile error with CUDA 11.6.
+  thrust::device_vector<int> d_segment_offsets = segment_offsets_;
+  thrust::device_vector<float*> d_output_ptrs;
+  d_output_ptrs.resize(segment_offsets_.size(), nullptr);
   const int* d_segment_offsets_ptr =
-      thrust::raw_pointer_cast(&d_segment_offsets_[0]);
+      thrust::raw_pointer_cast(&d_segment_offsets[0]);
   float const* input_ptr = reinterpret_cast<float const*>(inputs[0]);
   float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
-  float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]);
+  float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs[0]);
   PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(
-      output_ptrs, h_odatas, d_output_ptrs_.size() * sizeof(float*),
+      output_ptrs, h_odatas, d_output_ptrs.size() * sizeof(float*),
       cudaMemcpyHostToDevice, stream));

   int outer_rows = outer_rows_ * batchSize;
@@ -150,7 +152,7 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
             std::min((outer_rows_ - 1) / block.z + 1, 65535u));

   split_kernel<<<grid, block, 0, stream>>>(
-      d_segment_offsets_.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
+      segment_offsets_.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
       inner_cols_, axis_shape_, outer_rows);
   return cudaGetLastError() != cudaSuccess;
 }
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
index 49f028493ee87..93dc45215d4ee 100644
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
@@ -14,8 +14,6 @@

 #pragma once

-#include <thrust/device_vector.h>
-
 #include
 #include
 #include
@@ -94,8 +92,6 @@ class SplitPlugin : public PluginTensorRTV2Ext {
   bool same_shape_;
   std::vector<int> output_length_;
   std::vector<int> segment_offsets_;
-  thrust::device_vector<int> d_segment_offsets_;
-  thrust::device_vector<float*> d_output_ptrs_;

  private:
   void shareData(const SplitPlugin* another);
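The pattern behind the split-plugin change, as a sketch: keep only host-side state in the class so headers that include the plugin never need `<thrust/device_vector.h>`, and materialize device buffers inside the call. The trade-off, assumed acceptable here, is one device allocation plus one host-to-device copy per enqueue:

```cpp
#include <thrust/device_vector.h>
#include <vector>

struct PluginState {
  std::vector<int> segment_offsets_;  // host-side only, header-safe
};

void Enqueue(const PluginState& s) {
  // The device copy lives only for the duration of this call, so the class
  // definition (and every translation unit including it) stays thrust-free.
  thrust::device_vector<int> d_offsets = s.segment_offsets_;
  const int* ptr = thrust::raw_pointer_cast(d_offsets.data());
  (void)ptr;  // pass to the kernel launch here
}
```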
diff --git a/paddle/fluid/jit/CMakeLists.txt b/paddle/fluid/jit/CMakeLists.txt
new file mode 100644
index 0000000000000..b44060c0fad52
--- /dev/null
+++ b/paddle/fluid/jit/CMakeLists.txt
@@ -0,0 +1,38 @@
+cc_library(
+  jit_serializer
+  SRCS serializer.cc
+  DEPS lod_tensor device_context)
+
+cc_library(
+  jit_layer
+  SRCS layer.cc
+  DEPS executor parallel_executor executor_cache)
+
+cc_library(
+  jit_base_function
+  SRCS base_function.cc
+  DEPS scope proto_desc)
+
+if(WITH_TESTING AND NOT WIN32)
+  add_custom_target(
+    jit_download_program
+    COMMAND wget -nc https://paddle-ci.gz.bcebos.com/dy2st/Testing.tar.gz
+    COMMAND tar zxvf Testing.tar.gz)
+  set(JIT_DEPS
+      phi
+      elementwise_add_op
+      matmul_v2_op
+      activation_op
+      reduce_mean_op
+      feed_op
+      fetch_op
+      scale_op
+      jit_serializer
+      jit_layer
+      jit_base_function)
+  cc_test(
+    layer_test
+    SRCS layer_test.cc
+    DEPS ${JIT_DEPS})
+  add_dependencies(layer_test jit_download_program)
+endif()
diff --git a/paddle/fluid/jit/ast.h b/paddle/fluid/jit/ast.h
new file mode 100644
index 0000000000000..535b3a89dd60f
--- /dev/null
+++ b/paddle/fluid/jit/ast.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/variable.h"
+
+namespace paddle {
+namespace jit {
+using Variable = paddle::framework::Variable;
+class BaseFunction;
+class CompilationUnit;
+
+class ClassType {
+ public:
+  ClassType(const std::vector<std::string>& names,
+            std::weak_ptr<CompilationUnit> cu)
+      : const_names_(names), compilation_unit_(cu) {}
+
+  static std::shared_ptr<ClassType> Create(
+      const std::vector<std::string>& names,
+      std::weak_ptr<CompilationUnit> cu) {
+    return std::make_shared<ClassType>(names, cu);
+  }
+
+  // const std::vector<BaseFunction*> Methods() const;
+
+  // const Variable& GetAttribute(size_t slot) const;
+  // const Variable& GetAttribute(const std::string& name) const;
+
+  // size_t AddAttribute(const std::string& name, Variable val);
+
+ private:
+  // TODO(dev): distinguish parameter and buffer
+  std::vector<std::string> const_names_;
+  std::vector<Variable> const_value_;
+
+  std::vector<BaseFunction*> methods_;
+  std::vector<BaseFunction*> static_method_;
+  std::weak_ptr<CompilationUnit> compilation_unit_;
+};
+
+}  // namespace jit
+}  // namespace paddle
diff --git a/paddle/fluid/jit/base_function.cc b/paddle/fluid/jit/base_function.cc
new file mode 100644
index 0000000000000..fcbe64de8d70d
--- /dev/null
+++ b/paddle/fluid/jit/base_function.cc
@@ -0,0 +1,124 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/jit/base_function.h"
+
+namespace paddle {
+namespace jit {
+
+Argument::Argument(const std::string &name, bool is_out)
+    : name_(name), is_output_(is_out) {}
+
+const std::string &Argument::Name() const { return name_; }
+
+std::vector<std::string> FunctionSchema::GetInputArgNames() {
+  std::vector<std::string> input_arg_names;
+  for (auto &arg : input_args) {
+    input_arg_names.emplace_back(arg.Name());
+  }
+  return input_arg_names;
+}
+
+std::vector<std::string> FunctionSchema::GetOutputArgNames() {
+  std::vector<std::string> output_arg_names;
+  for (auto &arg : output_args) {
+    output_arg_names.emplace_back(arg.Name());
+  }
+  return output_arg_names;
+}
+
+void FunctionSchema::AddInputArg(std::string name, bool is_output) {
+  input_args.emplace_back(name, is_output);
+}
+
+void FunctionSchema::AddOutputArg(std::string name, bool is_output) {
+  output_args.emplace_back(name, is_output);
+}
+
+BaseFunction::BaseFunction(
+    const framework::ProgramDesc &program_desc,
+    const std::vector<std::string> param_names_for_program,
+    const VariableNameMap &params_dict)
+    : program_desc_(program_desc) {
+  // Parse FunctionSchema
+  // skip_var_name_ = program_desc_.GetFetchTargetNames();
+  for (auto &in_name : program_desc_.GetFeedTargetNames()) {
+    schema_.AddInputArg(in_name, false);
+  }
+  for (auto &out_name : program_desc_.GetFetchTargetNames()) {
+    schema_.AddOutputArg(out_name, true);
+  }
+  // share params into scope
+  SharePartialIntoScope(param_names_for_program, params_dict);
+  VLOG(6) << framework::GenScopeTreeDebugInfo(&scope_);
+  // remove feed fetch op
+  RemoveFeedFetch();
+}
+
+void BaseFunction::FetchOutput(std::vector<Variable> *outs) {
+  for (auto &out_name : schema_.GetOutputArgNames()) {
+    VLOG(3) << "fetch out: " << out_name;
+    auto *var = scope_.FindVar(out_name);
+    auto &src_tensor = var->Get<DenseTensor>();
+    Variable v;
+    auto *p = v.GetMutable<DenseTensor>();
+    *p = src_tensor;
+    outs->emplace_back(v);
+  }
+}
+
+void BaseFunction::ShareIntoScope(const VariableNameMap &ivals) {
+  VLOG(3) << "ivals size: " << ivals.size();
+  for (auto it = ivals.begin(); it != ivals.end(); ++it) {
+    VLOG(3) << "share into scope: " << it->first;
+    DenseTensor dense_tensor = it->second.Get<DenseTensor>();
+    auto *var = scope_.Var(it->first);
+    auto *dst_tensor = var->GetMutable<DenseTensor>();
+    *dst_tensor = dense_tensor;
+  }
+}
+
+void BaseFunction::SharePartialIntoScope(
+    const std::vector<std::string> param_names_for_program,
+    const VariableNameMap &params_dict) {
+  VLOG(3) << "ivals size: " << param_names_for_program.size();
+  for (size_t i = 0; i < param_names_for_program.size(); ++i) {
+    std::string name = param_names_for_program[i];
+    Variable val = params_dict.find(name)->second;
+    auto &dense_tensor = val.Get<DenseTensor>();
+    VLOG(3) << "share into scope: " << name;
+    auto *var = scope_.Var(name);
+    auto *dst_tensor = var->GetMutable<DenseTensor>();
+    *dst_tensor = dense_tensor;
+  }
+}
+
+void BaseFunction::RemoveFeedFetch() {
+  for (size_t i = 0; i < program_desc_.Size(); ++i) {
+    auto *block = program_desc_.MutableBlock(i);
+    const auto &all_ops = block->AllOps();
+    size_t op_size = all_ops.size();
+    VLOG(3) << "op_size: " << op_size;
+    for (int i = op_size - 1; i >= 0; i--) {
+      auto op = all_ops[i];
+      if (op->Type() == "feed" || op->Type() == "fetch") {
+        VLOG(3) << "remove op type: " << op->Type() << ", index: " << i;
+        block->RemoveOp(i, i + 1);
+      }
+    }
+  }
+}
+
+}  // namespace jit
+}  // namespace paddle
diff --git a/paddle/fluid/jit/base_function.h b/paddle/fluid/jit/base_function.h
new file mode 100644
index 0000000000000..3d4f9a29eb6b1
--- /dev/null
+++ b/paddle/fluid/jit/base_function.h
@@ -0,0 +1,96 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include
+
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/utils/none.h"
+#include "paddle/utils/optional.h"
+
+namespace paddle {
+namespace jit {
+
+using Variable = paddle::framework::Variable;
+using VariableNameMap = std::map<std::string, Variable>;
+using DenseTensor = phi::DenseTensor;
+
+class Argument {
+ public:
+  explicit Argument(const std::string &name, bool is_out = false);
+
+  const std::string &Name() const;
+
+ private:
+  std::string name_;
+  // paddle::optional<Variable> default_val_;
+  bool is_output_;
+};
+
+class FunctionSchema {
+ public:
+  FunctionSchema() = default;
+
+  std::vector<std::string> GetInputArgNames();
+
+  std::vector<std::string> GetOutputArgNames();
+
+  void AddInputArg(std::string name, bool is_output);
+
+  void AddOutputArg(std::string name, bool is_output);
+
+ private:
+  std::vector<Argument> input_args;
+  std::vector<Argument> output_args;
+};
+
+// TODO(dev): make it as abstract class
+class BaseFunction {
+ public:
+  BaseFunction(const framework::ProgramDesc &program_desc,
+               const std::vector<std::string> param_names_for_program,
+               const VariableNameMap &params_dict);
+
+  virtual ~BaseFunction() {}
+
+  virtual std::vector<Variable> operator()(const VariableNameMap &inputs) = 0;
+
+ protected:
+  void FetchOutput(std::vector<Variable> *outs);
+
+  void ShareIntoScope(const VariableNameMap &ivals);
+
+  void SharePartialIntoScope(
+      const std::vector<std::string> param_names_for_program,
+      const VariableNameMap &params_dict);
+
+  void RemoveFeedFetch();
+
+ protected:
+  framework::ProgramDesc program_desc_;
+  // TODO(dev): need a better way to share params
+  // std::vector<Variable> &param_for_program_;
+  // std::vector<std::string> skip_var_name_;
+  FunctionSchema schema_;
+  // global_scope place params
+  framework::Scope scope_;
+  // framework::Executor inner_exe_;
+};
+
+}  // namespace jit
+}  // namespace paddle
diff --git a/paddle/fluid/jit/compilation_unit.h b/paddle/fluid/jit/compilation_unit.h
new file mode 100644
index 0000000000000..815e9d3f4c090
--- /dev/null
+++ b/paddle/fluid/jit/compilation_unit.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
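To make the flow concrete, this is how a FunctionSchema ends up populated from a program's feed/fetch targets, distilled from the BaseFunction constructor in base_function.cc above (a sketch assuming a loaded `framework::ProgramDesc` named `program_desc`):

```cpp
// Inputs come from feed targets, outputs from fetch targets; the schema's
// name lists then drive ShareIntoScope() and FetchOutput().
paddle::jit::FunctionSchema schema;
for (auto& in_name : program_desc.GetFeedTargetNames()) {
  schema.AddInputArg(in_name, /*is_output=*/false);
}
for (auto& out_name : program_desc.GetFetchTargetNames()) {
  schema.AddOutputArg(out_name, /*is_output=*/true);
}
std::vector<std::string> ins = schema.GetInputArgNames();   // feed names
std::vector<std::string> outs = schema.GetOutputArgNames(); // fetch names
```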
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace paddle {
+namespace jit {
+class BaseFunction;
+
+class CompilationUnit {
+ public:
+  CompilationUnit() = default;
+  ~CompilationUnit() {}
+
+ private:
+  std::vector<std::shared_ptr<BaseFunction>> functions_;
+  std::unordered_map<std::string, size_t> functions_idx_;
+};
+
+}  // namespace jit
+}  // namespace paddle
diff --git a/paddle/fluid/jit/exector_function.h b/paddle/fluid/jit/exector_function.h
new file mode 100644
index 0000000000000..3217c62fbd797
--- /dev/null
+++ b/paddle/fluid/jit/exector_function.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/jit/base_function.h"
+
+namespace paddle {
+namespace jit {
+
+class ExectorFunction : public BaseFunction {
+ public:
+  ExectorFunction(const framework::ProgramDesc &program_desc,
+                  const std::vector<std::string> param_names_for_program,
+                  const VariableNameMap &params_dict)
+      : BaseFunction(program_desc, param_names_for_program, params_dict),
+        inner_exe_(phi::CPUPlace()) {}
+
+  ~ExectorFunction() {}
+
+  std::vector<Variable> operator()(const VariableNameMap &inputs) {
+    // share input into scope
+    ShareIntoScope(inputs);
+    // run program
+    inner_exe_.Run(program_desc_, &scope_, /*blockID=*/0, false, true,
+                   schema_.GetOutputArgNames());
+    VLOG(6) << framework::GenScopeTreeDebugInfo(&scope_);
+    // fetch outputs
+    std::vector<Variable> res;
+    FetchOutput(&res);
+    return res;
+  }
+
+ private:
+  // TODO(dev): support other devices exe
+  framework::Executor inner_exe_;
+};
+
+}  // namespace jit
+}  // namespace paddle
diff --git a/paddle/fluid/jit/layer.cc b/paddle/fluid/jit/layer.cc
new file mode 100644
index 0000000000000..cb13a003affec
--- /dev/null
+++ b/paddle/fluid/jit/layer.cc
@@ -0,0 +1,48 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/jit/layer.h"
+
+namespace paddle {
+namespace jit {
+// TODO(dev): Make vector<string>, num_slot as in argument
+// Layer(const std::shared_ptr<ClassType>& type) : obj_(type, /*num_slot*/ 0U)
+// {}
+Layer::Layer(
+    const std::vector<std::string>& func_names,
+    const std::vector<framework::ProgramDesc>& program_descs,
+    const std::vector<std::vector<std::string>>& param_names_for_each_program,
+    const VariableNameMap& params_dict) {
+  VLOG(3) << "program size: " << program_descs.size();
+  // Layer manages the lifetime of all parameters.
+  for (size_t i = 0; i < func_names.size(); ++i) {
+    // TODO(dev): choose executor or pe by flag
+    function_dict[func_names[i]] = std::make_shared<ExectorFunction>(
+        program_descs[i], param_names_for_each_program[i], params_dict);
+  }
+}
+
+// TODO(dev): make it as const function
+std::shared_ptr<BaseFunction> Layer::GetFunction(const std::string& name) {
+  VLOG(3) << "funcs_ size: " << function_dict.size();
+  return function_dict[name];
+}
+
+std::vector<Variable> Layer::forward(const VariableNameMap& inputs) {
+  auto func = GetFunction("forward");
+  return (*func)(inputs);
+}
+
+}  // namespace jit
+}  // namespace paddle
diff --git a/paddle/fluid/jit/layer.h b/paddle/fluid/jit/layer.h
new file mode 100644
index 0000000000000..0c2ad49c77197
--- /dev/null
+++ b/paddle/fluid/jit/layer.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/jit/ast.h"
+#include "paddle/fluid/jit/base_function.h"
+#include "paddle/fluid/jit/compilation_unit.h"
+#include "paddle/fluid/jit/exector_function.h"
+#include "paddle/fluid/jit/object.h"
+#include "paddle/fluid/jit/pe_function.h"
+
+namespace paddle {
+namespace jit {
+using Variable = paddle::framework::Variable;
+using VariableNameMap = std::map<std::string, Variable>;
+using DenseTensor = phi::DenseTensor;
+
+class Layer {
+ public:
+  // TODO(dev): Make vector<string>, num_slot as in argument
+  // Layer(const std::shared_ptr<ClassType>& type) : obj_(type, /*num_slot*/ 0U)
+  // {}
+  Layer(
+      const std::vector<std::string>& func_names,
+      const std::vector<framework::ProgramDesc>& program_descs,
+      const std::vector<std::vector<std::string>>& param_names_for_each_program,
+      const VariableNameMap& params_dict);
+
+  // TODO(dev): make it as const function
+  std::shared_ptr<BaseFunction> GetFunction(const std::string& name);
+
+  std::vector<Variable> forward(const VariableNameMap& inputs);
+
+ private:
+  // internal::Object obj_;
+  // std::vector<framework::ProgramDesc> all_program_desc_;
+  // std::vector<std::vector<std::string>> param_name_for_each_program_;
+  // std::vector<Variable> all_param_;
+  std::map<std::string, std::shared_ptr<BaseFunction>> function_dict;
+};
+
+}  // namespace jit
+}  // namespace paddle
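Putting the pieces together, the intended call flow for the new JIT layer looks roughly like this (a sketch based on layer_test.cc below; `jit::Load` is provided by paddle/fluid/jit/serializer.h):

```cpp
#include "paddle/fluid/jit/layer.h"
#include "paddle/fluid/jit/serializer.h"

void RunForward(const paddle::jit::VariableNameMap& inputs) {
  // Deserialize a Layer from a saved program directory.
  auto layer = paddle::jit::Load("./Testing/");
  // "forward" is a convenience wrapper around GetFunction("forward").
  auto outs = layer.forward(inputs);
  // Any other named function can be fetched and called directly.
  auto infer = layer.GetFunction("infer");
  auto infer_outs = (*infer)(inputs);
  (void)outs;
  (void)infer_outs;
}
```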
diff --git a/paddle/fluid/jit/layer_test.cc b/paddle/fluid/jit/layer_test.cc
new file mode 100644
index 0000000000000..9386569d48d1b
--- /dev/null
+++ b/paddle/fluid/jit/layer_test.cc
@@ -0,0 +1,87 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/jit/layer.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/jit/serializer.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/phi/api/include/tensor.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+USE_OP_ITSELF(elementwise_add);
+USE_OP_ITSELF(matmul_v2);
+USE_OP_ITSELF(relu);
+USE_OP_ITSELF(reduce_mean);
+USE_OP_ITSELF(feed);
+USE_OP_ITSELF(fetch);
+USE_OP_ITSELF(scale);
+
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(mean, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT);
+
+namespace paddle {
+namespace jit {
+
+VariableNameMap PrepareInputs() {
+  auto temp = DenseTensor();
+  temp.Resize(phi::make_ddim({2, 4}));
+  phi::CPUContext cpu_ctx;
+  cpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                           .GetAllocator(paddle::platform::CPUPlace())
+                           .get());
+  cpu_ctx.Init();
+  cpu_ctx.Alloc<float>(&temp);
+  phi::funcs::set_constant(cpu_ctx, &temp, 2.);
+  Variable v;
+  auto *p = v.GetMutable<DenseTensor>();
+  *p = temp;
+  // TODO(dev): associate the input name
+  return {{"x", v}};
+}
+
+TEST(layer, Construct) {
+  std::string path = "./Testing/";
+  auto layer = jit::Load(path);
+  auto inputs = PrepareInputs();
+
+  auto outs = layer.forward(inputs);
+  auto out_vars = outs[0];
+  auto out_dense_tensor = out_vars.Get<DenseTensor>();
+  auto out_data = out_dense_tensor.data<float>();
+  EXPECT_NEAR(out_data[0], 0.02194316, 1e-6);
+
+  auto func = layer.GetFunction("infer");
+  outs = (*func)(inputs);
+  out_vars = outs[0];
+  out_dense_tensor = out_vars.Get<DenseTensor>();
+  out_data = out_dense_tensor.data<float>();
+  EXPECT_NEAR(out_data[0], 1.41562390, 1e-6);
+}
+
+}  // namespace jit
+}  // namespace paddle
+ +#pragma once + +#include +#include + +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace jit { +class ClassType; + +namespace internal { + +class Object { + public: + Object(const std::shared_ptr& type, size_t num_slot) + : type_(type) { + slots_.resize(num_slot); + } + + static std::unique_ptr Create(std::shared_ptr type, + size_t num_slot) { + return std::make_unique(type, num_slot); + } + + std::shared_ptr Type() const { return type_; } + + void SetSlot(size_t slot, Variable val) { + if (slot >= slots_.size()) { + // resize past `slot` so the assignment below stays in range + slots_.resize(slot + 1); + } + slots_[slot] = std::move(val); + } + + const Variable& GetSlot(size_t slot) { + // TODO(dev): Add ENFORCE_LT(slot, size()); + return slots_[slot]; + } + + Variable GetAttr(const std::string& name) const; + + void SetAttr(const std::string& name, Variable val); + + private: + std::shared_ptr type_; + // Store Tensors and Attributes + std::vector slots_; +}; + +} // namespace internal +} // namespace jit +} // namespace paddle diff --git a/paddle/fluid/jit/pe_function.h b/paddle/fluid/jit/pe_function.h new file mode 100644 index 0000000000000..a3d7eb33f7103 --- /dev/null +++ b/paddle/fluid/jit/pe_function.h @@ -0,0 +1,81 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
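Because `GetAttr`/`SetAttr` are only declared here, the slot interface is the usable surface of `Object` for now. A small sketch of the slot lifecycle; the null `ClassType` pointer is purely illustrative, since no concrete class type ships in this patch:

```cpp
#include <utility>

#include "paddle/fluid/jit/object.h"

// Store a Variable in a slot beyond the initial capacity, then read it back.
void SlotRoundTrip(paddle::framework::Variable v) {
  using paddle::jit::internal::Object;
  auto obj = Object::Create(/*type=*/nullptr, /*num_slot=*/1);
  obj->SetSlot(3, std::move(v));         // grows slots_ to cover index 3
  const auto& stored = obj->GetSlot(3);  // no bounds check yet (see TODO)
  (void)stored;
}
```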
+ +#pragma once + +#include +#include + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/executor_cache.h" +#include "paddle/fluid/jit/base_function.h" + +namespace paddle { +namespace jit { + +class PEFunction : public BaseFunction { + public: + PEFunction(const framework::ProgramDesc &program_desc, + const std::vector param_names_for_program, + const VariableNameMap ¶ms_dict) + : BaseFunction(program_desc, param_names_for_program, params_dict) {} + + ~PEFunction() {} + + std::vector operator()(const VariableNameMap &inputs) { + // bool is_test = true; + std::string prog_string; + std::hash string_hash; + program_desc_.Proto()->SerializePartialToString(&prog_string); + int64_t program_id = static_cast(string_hash(prog_string)); + const framework::BlockDesc &global_block = program_desc_.Block(0); + int64_t start_op_index = 0; + int64_t end_op_index = static_cast(global_block.OpSize()); + + ShareIntoScope(inputs); + std::vector input_var_names = schema_.GetInputArgNames(); + std::vector output_var_names = schema_.GetOutputArgNames(); + std::vector dout_var_names; + if (end_op_index > start_op_index) { + // TODO(dev): support other devices + auto cache_info = framework::GetExecutorInfoFromCache( + program_desc_, phi::CPUPlace(), start_op_index, end_op_index, + /*is_grad=*/false, program_id, &scope_); + auto ¶llel_executor = cache_info.first; + auto &skip_eager_delete_vars = + framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + program_id, false); + if (cache_info.second /*is_new_created*/) { + parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, input_var_names); + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + output_var_names.begin(), + output_var_names.end()); + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + dout_var_names.begin(), + dout_var_names.end()); + framework::details::ParseSafeEagerDeletionSkipVars( + program_desc_, end_op_index, output_var_names, + &skip_eager_delete_vars); + } + parallel_executor->RunWithoutFetch(skip_eager_delete_vars); + } + VLOG(6) << framework::GenScopeTreeDebugInfo(&scope_); + std::vector res; + FetchOutput(&res); + return res; + } +}; + +} // namespace jit +} // namespace paddle diff --git a/paddle/fluid/jit/serializer.cc b/paddle/fluid/jit/serializer.cc new file mode 100644 index 0000000000000..a8bd934d12e5f --- /dev/null +++ b/paddle/fluid/jit/serializer.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
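`PEFunction::operator()` above keys the executor cache on a hash of the serialized `ProgramDesc`, so a given function builds its `ParallelExecutor` once and then reuses it. A hedged call-shape sketch, where the constructor arguments are whatever the deserializer produced:

```cpp
#include "paddle/fluid/jit/pe_function.h"

// First call builds and caches a ParallelExecutor for the hashed program;
// the second call is served from ExecutorInfoCache.
std::vector<paddle::jit::Variable> CallTwice(
    const paddle::framework::ProgramDesc& program_desc,
    const std::vector<std::string>& param_names,
    const paddle::jit::VariableNameMap& params_dict,
    const paddle::jit::VariableNameMap& inputs) {
  paddle::jit::PEFunction func(program_desc, param_names, params_dict);
  auto warmup_outs = func(inputs);  // cache miss: executor is created
  (void)warmup_outs;
  return func(inputs);              // cache hit: executor is reused
}
```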
+ +#include "paddle/fluid/jit/serializer.h" + +namespace paddle { +namespace jit { + +Layer Deserializer::operator()(const std::string& dir_path) { + const auto& file_name_prefixs = GetPdmodelFileNamePrefix(dir_path); + std::vector func_names; + std::vector program_descs; + std::vector> param_names_for_each_program; + // set is ordered + std::set param_names_set; + VariableNameMap params_dict; + for (auto& it : file_name_prefixs) { + func_names.emplace_back(it.first); + + auto program = LoadProgram(dir_path + it.second + PDMODEL_SUFFIX); + program_descs.emplace_back(program); + + // TODO(dev): load int/float params + std::vector persistable_var_names; + auto all_var_desc = program.Block(0).AllVars(); + for (auto* desc_ptr : all_var_desc) { + if (IsPersistable(desc_ptr)) { + persistable_var_names.emplace_back(desc_ptr->Name()); + } + } + + param_names_for_each_program.emplace_back(persistable_var_names); + param_names_set.insert(persistable_var_names.begin(), + persistable_var_names.end()); + } + + // Read from one pdiparams file, refine here + auto params_for_all_program = + ReadTensorData(dir_path + "export.forward.pdiparams", param_names_set); + params_dict.insert(params_for_all_program.begin(), + params_for_all_program.end()); + + return Layer(func_names, program_descs, param_names_for_each_program, + params_dict); +} + +bool Deserializer::IsPersistable(framework::VarDesc* desc_ptr) { + auto type = desc_ptr->GetType(); + if (type == framework::proto::VarType::FEED_MINIBATCH || + type == framework::proto::VarType::FETCH_LIST || + type == framework::proto::VarType::READER || + type == framework::proto::VarType::RAW) { + return false; + } + return desc_ptr->Persistable(); +} + +bool Deserializer::EndsWith(const std::string& str, const std::string& suffix) { + if (str.length() < suffix.length()) { + return false; + } + return str.compare(str.length() - suffix.length(), suffix.length(), suffix) == + 0; +} + +const std::vector> +Deserializer::GetPdmodelFileNamePrefix(const std::string& path) { + std::vector> file_name_prefixs; + DIR* dir = opendir(path.c_str()); + struct dirent* ptr; + while ((ptr = readdir(dir)) != nullptr) { + std::string file_name = ptr->d_name; + if (EndsWith(file_name, PDMODEL_SUFFIX)) { + std::string prefix = file_name.substr( + 0, file_name.length() - std::string(PDMODEL_SUFFIX).length()); + std::string func_name = prefix.substr(prefix.find_first_of(".") + 1); + file_name_prefixs.emplace_back(std::make_pair(func_name, prefix)); + } + } + closedir(dir); + return file_name_prefixs; +} + +VariableNameMap Deserializer::ReadTensorData( + const std::string& file_name, const std::set& var_name) const { + VLOG(3) << "ReadTensorData from: " << file_name; + std::ifstream fin(file_name, std::ios::binary); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + // TODO(dev): Support other devices + auto& dev_ctx = *pool.Get(phi::CPUPlace()); + VariableNameMap res; + for (auto it = var_name.begin(); it != var_name.end(); it++) { + VLOG(3) << "load Tensor: " << *it; + Variable v; + // TODO(dev): Support framework::Vocab + DenseTensor* dense_tesnor = v.GetMutable(); + framework::DeserializeFromStream(fin, dense_tesnor, dev_ctx); + res[*it] = v; + } + return res; +} + +framework::ProgramDesc Deserializer::LoadProgram(const std::string& file_name) { + VLOG(3) << "LoadProgram " << file_name; + std::ifstream fin(file_name, std::ios::in | std::ios::binary); + fin.seekg(0, std::ios::end); + std::string buffer(fin.tellg(), ' '); + fin.seekg(0, std::ios::beg); + 
fin.read(&buffer[0], buffer.size()); + fin.close(); + return framework::ProgramDesc(buffer); +} + +Layer Load(const std::string& file_path) { + auto deserializer = Deserializer(); + return deserializer(file_path); +} + +} // namespace jit +} // namespace paddle diff --git a/paddle/fluid/jit/serializer.h b/paddle/fluid/jit/serializer.h new file mode 100644 index 0000000000000..4036c5add7b0b --- /dev/null +++ b/paddle/fluid/jit/serializer.h @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/jit/layer.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace paddle { +namespace jit { +static const char PDMODEL_SUFFIX[] = ".pdmodel"; +static const char PDPARAMS_SUFFIX[] = ".pdiparams"; + +// Export Layer into local disk +class Serializer { + public: + void operator()(const Layer& layer, const std::string& file_dir); + + // private: + // void WriteTensorData(const Layer& layer, const std::string& file_name) + // const; + // void WriteExtraInfo(const Layer& layer, const std::string& file_name) + // const; + // void WriteByteCode(const Layer& layer, const std::string& file_name) + // const; +}; + +class Deserializer { + public: + Layer operator()(const std::string& dir_path); + + private: + bool IsPersistable(framework::VarDesc* desc_ptr); + + bool EndsWith(const std::string& str, const std::string& suffix); + + const std::vector> + GetPdmodelFileNamePrefix(const std::string& path); + + VariableNameMap ReadTensorData(const std::string& file_name, + const std::set& var_name) const; + + // void ReadExtraInfo(const std::string& file_name) const; + // void ReadByteCode(const std::string& file_name) const; + + framework::ProgramDesc LoadProgram(const std::string& file_name); +}; + +void Export(const Layer& layer, const std::string& file_path); + +Layer Load(const std::string& file_path); + +} // namespace jit +} // namespace paddle diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index 452f388f03dcf..ff44fa0b77201 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -100,19 +100,28 @@ XPUOpMap& get_kp_ops() { {"equal", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})}, {"not_equal", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace())})}, // reduce op - {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_min", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_prod", XPUKernelSet({pOpKernelType(vartype::FP32, 
XPUPlace())})}, - {"reduce_all", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace())})}, - {"reduce_any", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace())})}, + // {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, + // XPUPlace())})}, + // {"reduce_max", XPUKernelSet({pOpKernelType(vartype::FP32, + // XPUPlace())})}, + // {"reduce_min", XPUKernelSet({pOpKernelType(vartype::FP32, + // XPUPlace())})}, + // {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, + // XPUPlace())})}, + // {"reduce_prod", XPUKernelSet({pOpKernelType(vartype::FP32, + // XPUPlace())})}, + // {"reduce_all", XPUKernelSet({pOpKernelType(vartype::BOOL, + // XPUPlace())})}, + // {"reduce_any", XPUKernelSet({pOpKernelType(vartype::BOOL, + // XPUPlace())})}, + // {"reduce_amax", XPUKernelSet({pOpKernelType(vartype::FP32, + // XPUPlace())})}, + // {"reduce_amin", XPUKernelSet({pOpKernelType(vartype::FP32, + // XPUPlace())})}, {"pull_box_sparse", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"push_box_sparse", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_amax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reduce_amin", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"c_sync_calc_stream", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"c_sync_comm_stream", diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 72d343692df73..1a9ff2e6694ea 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include "glog/logging.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -125,22 +126,26 @@ void ChromeTracingLogger::LogMemTraceEventNode( std::string( R"JSON( { - "name": "[memory]", "pid": %lld, "tid": "%lld", + "name": "[memory]", "pid": %lld, "tid": "%lld(C++)", "ts": %lld, "ph": "i", "cat": "%s", "args": { "place": "%s", "addr": "%llu", + "increase_bytes": %lld, "current_allocated": %llu, "current_reserved": %llu, - "increase_bytes": %lld + "peak_allocated": %llu, + "peak_reserved": %llu } }, )JSON"), - mem_node.ProcessId(), mem_node.ThreadId(), mem_node.TimeStampNs(), + mem_node.ProcessId(), mem_node.ThreadId(), nsToUs(mem_node.TimeStampNs()), StringTracerMemEventType(mem_node.Type()), mem_node.Place().c_str(), - mem_node.Addr(), mem_node.CurrentAllocated(), mem_node.CurrentReserved(), - mem_node.IncreaseBytes()); + mem_node.Addr(), mem_node.IncreaseBytes(), mem_node.CurrentAllocated(), + mem_node.CurrentReserved(), mem_node.PeakAllocated(), + mem_node.PeakReserved()); + pid_tid_set_.insert({mem_node.ProcessId(), mem_node.ThreadId()}); } void ChromeTracingLogger::LogHostTraceEventNode( @@ -164,6 +169,8 @@ void ChromeTracingLogger::LogHostTraceEventNode( input_shapes = op_supplement_node->InputShapes(); input_dtypes = op_supplement_node->Dtypes(); callstack = op_supplement_node->CallStack(); + callstack = std::regex_replace(callstack, std::regex("\""), "\'"); + callstack = std::regex_replace(callstack, std::regex("\n"), "\\n"); } switch (host_node.Type()) { case TracerEventType::ProfileStep: diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc index 65f5e81238bc8..e5de858e15c76 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc +++ 
b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -209,6 +209,8 @@ MemTraceEventNode* DeserializationReader::RestoreMemTraceEventNode( mem_event.place = mem_event_proto.place(); mem_event.current_allocated = mem_event_proto.current_allocated(); mem_event.current_reserved = mem_event_proto.current_reserved(); + mem_event.peak_allocated = mem_event_proto.peak_allocated(); + mem_event.peak_reserved = mem_event_proto.peak_reserved(); return new MemTraceEventNode(mem_event); } diff --git a/paddle/fluid/platform/profiler/dump/nodetree.proto b/paddle/fluid/platform/profiler/dump/nodetree.proto index 0f0c9c92c9c93..4ebfb6e73b331 100644 --- a/paddle/fluid/platform/profiler/dump/nodetree.proto +++ b/paddle/fluid/platform/profiler/dump/nodetree.proto @@ -51,10 +51,14 @@ enum TracerEventTypeProto { }; enum TracerMemEventTypeProto { - // Used to mark memory allocation + // Used to mark memory allocation which is managed by paddle Allocate = 0; - // Used to mark memory free + // Used to mark memory free which is managed by paddle Free = 1; + // Used to mark reserved memory allocation which is applied from device. + ReservedAllocate = 2; + // Used to mark reserved memory free which is released to device. + ReservedFree = 3; }; message KernelEventInfoProto { @@ -150,6 +154,10 @@ message MemTraceEventProto { required uint64 current_allocated = 8; // current total reserved memory required uint64 current_reserved = 9; + // current peak allocated memory + required uint64 peak_allocated = 10; + // current peak reserved memory + required uint64 peak_reserved = 11; } message OperatorSupplementEventProto { diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc index eaf1353168ea4..7b1c5bdaa41bc 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -130,6 +130,8 @@ void SerializationLogger::LogMemTraceEventNode( mem_trace_event->set_place(mem_node.Place()); mem_trace_event->set_current_allocated(mem_node.CurrentAllocated()); mem_trace_event->set_current_reserved(mem_node.CurrentReserved()); + mem_trace_event->set_peak_allocated(mem_node.PeakAllocated()); + mem_trace_event->set_peak_reserved(mem_node.PeakReserved()); current_mem_trace_event_node_proto_->set_allocated_mem_event(mem_trace_event); } diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc index dc6a6bf32d6e3..0a3bda1c34518 100644 --- a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc @@ -53,9 +53,9 @@ TEST(SerializationLoggerTest, dump_case0) { std::string("op3"), TracerEventType::Operator, 31000, 40000, 10, 11)); mem_events.push_back(MemTraceEvent(11500, 0x1000, TracerMemEventType::Allocate, 10, 10, 50, - "GPU:0", 50, 50)); + "GPU:0", 50, 50, 100, 100)); mem_events.push_back(MemTraceEvent(11900, 0x1000, TracerMemEventType::Free, - 10, 10, -50, "GPU:0", 0, 50)); + 10, 10, -50, "GPU:0", 0, 50, 100, 100)); std::map>> input_shapes; std::map> dtypes; input_shapes[std::string("X")].push_back(std::vector{1, 2, 3}); diff --git a/paddle/fluid/platform/profiler/event_node.h b/paddle/fluid/platform/profiler/event_node.h index acd5a03109f72..3ffa9241e9bfb 100644 --- a/paddle/fluid/platform/profiler/event_node.h +++ b/paddle/fluid/platform/profiler/event_node.h @@ -47,6 +47,8 @@ class MemTraceEventNode { 
std::string Place() const { return mem_event_.place; } uint64_t CurrentAllocated() const { return mem_event_.current_allocated; } uint64_t CurrentReserved() const { return mem_event_.current_reserved; } + uint64_t PeakAllocated() const { return mem_event_.peak_allocated; } + uint64_t PeakReserved() const { return mem_event_.peak_reserved; } // member function void LogMe(BaseLogger* logger) { logger->LogMemTraceEventNode(*this); } diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc index 4e40e87bbbf20..162bf5da642b4 100644 --- a/paddle/fluid/platform/profiler/event_python.cc +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -93,6 +93,8 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { mem_python_node->place = (*memnode)->Place(); mem_python_node->current_allocated = (*memnode)->CurrentAllocated(); mem_python_node->current_reserved = (*memnode)->CurrentReserved(); + mem_python_node->peak_allocated = (*memnode)->PeakAllocated(); + mem_python_node->peak_reserved = (*memnode)->PeakReserved(); host_python_node->mem_node_ptrs.push_back(mem_python_node); } // copy OperatorSupplementEventNode's information if exists diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h index 4d1f5ad4f788e..9c5ac28f36f5b 100644 --- a/paddle/fluid/platform/profiler/event_python.h +++ b/paddle/fluid/platform/profiler/event_python.h @@ -66,6 +66,10 @@ struct MemPythonNode { uint64_t current_allocated; // current total reserved memory uint64_t current_reserved; + // peak allocated memory + uint64_t peak_allocated; + // peak reserved memory + uint64_t peak_reserved; }; struct HostPythonNode { diff --git a/paddle/fluid/platform/profiler/test_event_node.cc b/paddle/fluid/platform/profiler/test_event_node.cc index b70034633ae66..3f825ce63cd83 100644 --- a/paddle/fluid/platform/profiler/test_event_node.cc +++ b/paddle/fluid/platform/profiler/test_event_node.cc @@ -50,9 +50,9 @@ TEST(NodeTreesTest, LogMe_case0) { std::string("op3"), TracerEventType::Operator, 31000, 40000, 10, 11)); mem_events.push_back(MemTraceEvent(11500, 0x1000, TracerMemEventType::Allocate, 10, 10, 50, - "GPU:0", 50, 50)); + "GPU:0", 50, 50, 100, 100)); mem_events.push_back(MemTraceEvent(11900, 0x1000, TracerMemEventType::Free, - 10, 10, -50, "GPU:0", 0, 50)); + 10, 10, -50, "GPU:0", 0, 50, 100, 100)); std::map>> input_shapes; std::map> dtypes; input_shapes[std::string("X")].push_back(std::vector{1, 2, 3}); @@ -185,9 +185,9 @@ TEST(NodeTreesTest, HandleTrees_case0) { std::string("op3"), TracerEventType::Operator, 2000, 120000, 10, 11)); mem_events.push_back(MemTraceEvent(11500, 0x1000, TracerMemEventType::Allocate, 10, 10, 50, - "GPU:0", 50, 50)); + "GPU:0", 50, 50, 100, 100)); mem_events.push_back(MemTraceEvent(11900, 0x1000, TracerMemEventType::Free, - 10, 10, -50, "GPU:0", 0, 50)); + 10, 10, -50, "GPU:0", 0, 50, 100, 100)); op_supplement_events.push_back(OperatorSupplementEvent( 11600, "op1", std::map>>(), std::map>(), "op1()", 10, 10)); diff --git a/paddle/fluid/platform/profiler/trace_event.h b/paddle/fluid/platform/profiler/trace_event.h index bfa000e2683de..b2504a5ec458d 100644 --- a/paddle/fluid/platform/profiler/trace_event.h +++ b/paddle/fluid/platform/profiler/trace_event.h @@ -59,10 +59,14 @@ enum class TracerEventType { }; enum class TracerMemEventType { - // Used to mark memory allocation + // Used to mark memory allocation which is managed by paddle Allocate = 0, - // Used to mark memory free + // Used to 
mark memory free which is managed by paddle Free = 1, + // Used to mark reserved memory allocation which is applied from device. + ReservedAllocate = 2, + // Used to mark reserved memory free which is released to device. + ReservedFree = 3, // A flag to denote the number of current types NumTypes }; @@ -288,7 +292,8 @@ struct MemTraceEvent { MemTraceEvent(uint64_t timestamp_ns, uint64_t addr, TracerMemEventType type, uint64_t process_id, uint64_t thread_id, int64_t increase_bytes, const std::string& place, uint64_t current_allocated, - uint64_t current_reserved) + uint64_t current_reserved, uint64_t peak_allocated, + uint64_t peak_reserved) : timestamp_ns(timestamp_ns), addr(addr), type(type), @@ -297,7 +302,9 @@ struct MemTraceEvent { increase_bytes(increase_bytes), place(place), current_allocated(current_allocated), - current_reserved(current_reserved) {} + current_reserved(current_reserved), + peak_allocated(peak_allocated), + peak_reserved(peak_reserved) {} // timestamp of the record uint64_t timestamp_ns; @@ -318,6 +325,10 @@ struct MemTraceEvent { uint64_t current_allocated; // current total reserved memory uint64_t current_reserved; + // current peak allocated memory + uint64_t peak_allocated; + // current peak reserved memory + uint64_t peak_reserved; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/utils.cc b/paddle/fluid/platform/profiler/utils.cc index 1f8e113fdd914..446fa49eefbd1 100644 --- a/paddle/fluid/platform/profiler/utils.cc +++ b/paddle/fluid/platform/profiler/utils.cc @@ -83,7 +83,8 @@ float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread, #endif const char* StringTracerMemEventType(TracerMemEventType type) { - static const char* categary_name_[] = {"Allocate", "Free"}; + static const char* categary_name_[] = {"Allocate", "Free", "ReservedAllocate", + "ReservedFree"}; return categary_name_[static_cast(type)]; } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 20460c78d2867..e4d4bf1a1c441 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -123,6 +123,10 @@ set(PYBIND_SRCS communication.cc cuda_streams_py.cc) +if(WITH_CUSTOM_DEVICE) + set(PYBIND_DEPS ${PYBIND_DEPS} phi_capi) +endif() + if(NOT ON_INFER) set(PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer) if(WITH_NCCL) @@ -491,7 +495,7 @@ if(WITH_PYTHON) cc_library( paddle_pybind SHARED SRCS ${PYBIND_SRCS} - DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${GLOB_DEV_LIB}) + DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) if(NOT APPLE AND NOT WIN32) target_link_libraries(paddle_pybind rt) diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index 25f2c91002844..ea404b4f51e78 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -221,8 +221,8 @@ void BindGraphPyClient(py::module* m) { auto feats = self.get_node_feat(node_type, node_ids, feature_names); std::vector> bytes_feats(feats.size()); - for (int i = 0; i < feats.size(); ++i) { - for (int j = 0; j < feats[i].size(); ++j) { + for (size_t i = 0; i < feats.size(); ++i) { + for (size_t j = 0; j < feats[i].size(); ++j) { bytes_feats[i].push_back(py::bytes(feats[i][j])); } } @@ -234,8 +234,8 @@ void BindGraphPyClient(py::module* m) { std::vector feature_names, std::vector> bytes_feats) { std::vector> feats(bytes_feats.size()); - for (int i = 0; i < bytes_feats.size(); ++i) { - for (int j = 0; j < bytes_feats[i].size(); ++j) { + for (size_t i = 0; i < 
bytes_feats.size(); ++i) { + for (size_t j = 0; j < bytes_feats[i].size(); ++j) { feats[i].push_back(std::string(bytes_feats[i][j])); } } diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 3de6c64617ddd..354ac0aef9f2d 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1535,40 +1535,40 @@ void BindImperative(py::module *m_ptr) { "Cannot copy this Tensor to GPU in CPU version Paddle, " "Please recompile or reinstall Paddle with CUDA support.")); #else - int device_count = platform::GetGPUDeviceCount(); - int device_id = 0; - if (handle == py::none()) { - if (platform::is_gpu_place(self->Place())) { - return self; - } - } else { - PyObject *py_obj = handle.ptr(); - PADDLE_ENFORCE_EQ( - PyCheckInteger(py_obj), true, - platform::errors::InvalidArgument( - " 'device_id' must be a positive integer")); - device_id = py::cast(handle); - } - PADDLE_ENFORCE_GE( - device_id, 0, - platform::errors::InvalidArgument( - "Can not copy Tensor to Invalid CUDAPlace(%d), device id " - "must inside [0, %d)", - device_id, device_count)); - PADDLE_ENFORCE_LT( - device_id, device_count, - platform::errors::InvalidArgument( - "Can not copy Tensor to Invalid CUDAPlace(%d), device id " - "must inside [0, %d)", - device_id, device_count)); - platform::CUDAPlace place = platform::CUDAPlace(device_id); - if (platform::is_same_place(self->Place(), place)) { - return self; - } else { - auto new_var = self->NewVarBase(place, blocking); - new_var->SetOverridedStopGradient(self->OverridedStopGradient()); - return new_var; - } + int device_count = platform::GetGPUDeviceCount(); + int device_id = 0; + if (handle == py::none()) { + auto default_place = + imperative::GetCurrentTracer()->ExpectedPlace(); + device_id = default_place.GetDeviceId(); + } else { + PyObject *py_obj = handle.ptr(); + PADDLE_ENFORCE_EQ( + PyCheckInteger(py_obj), true, + platform::errors::InvalidArgument( + " 'device_id' must be a positive integer")); + device_id = py::cast(handle); + } + PADDLE_ENFORCE_GE( + device_id, 0, + platform::errors::InvalidArgument( + "Can not copy Tensor to Invalid CUDAPlace(%d), device id " + "must inside [0, %d)", + device_id, device_count)); + PADDLE_ENFORCE_LT( + device_id, device_count, + platform::errors::InvalidArgument( + "Can not copy Tensor to Invalid CUDAPlace(%d), device id " + "must inside [0, %d)", + device_id, device_count)); + platform::CUDAPlace place = platform::CUDAPlace(device_id); + if (platform::is_same_place(self->Place(), place)) { + return self; + } else { + auto new_var = self->NewVarBase(place, blocking); + new_var->SetOverridedStopGradient(self->OverridedStopGradient()); + return new_var; + } #endif }, py::arg("device_id") = py::none(), py::arg("blocking") = true, R"DOC( @@ -1588,16 +1588,17 @@ void BindImperative(py::module *m_ptr) { # required: gpu import paddle x = paddle.to_tensor(1.0, place=paddle.CPUPlace()) - print(x.place) # CPUPlace + print(x.place) # Place(cpu) y = x.cuda() - print(y.place) # CUDAPlace(0) + print(y.place) # Place(gpu:0) y = x.cuda(None) - print(y.place) # CUDAPlace(0) + print(y.place) # Place(gpu:0) - y = x.cuda(1) - print(y.place) # CUDAPlace(1) + paddle.device.set_device("gpu:1") + y = x.cuda(None) + print(y.place) # Place(gpu:1) )DOC") .def( "_share_memory", @@ -1734,6 +1735,17 @@ void BindImperative(py::module *m_ptr) { return new_var; }, py::return_value_policy::copy) + .def( + "_copy_to", + [](const std::shared_ptr &self, + const platform::CustomPlace &place, bool blocking) { + auto new_var 
= self->NewVarBase(place, blocking); + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + py::return_value_policy::copy) .def( "_copy_to", [](const std::shared_ptr &self, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index cba7d03623516..b81f494f1a7df 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -147,6 +147,10 @@ limitations under the License. */ #include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/phi/capi/capi.h" +#endif + #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #ifdef PADDLE_WITH_IPU diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index e20db18ea3f53..9715fd770422a 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -21,6 +21,10 @@ add_subdirectory(ops) add_subdirectory(tools) # phi tests add_subdirectory(tests) +# phi capi +if(WITH_CUSTOM_DEVICE) + add_subdirectory(capi) +endif() # make an unity target for compile deps set(PHI_DEPS diff --git a/paddle/phi/backends/custom/custom_device.cc b/paddle/phi/backends/custom/custom_device.cc index df757b286a6b1..541acd9ecafd0 100644 --- a/paddle/phi/backends/custom/custom_device.cc +++ b/paddle/phi/backends/custom/custom_device.cc @@ -348,7 +348,8 @@ class CustomDevice : public DeviceInterface { } } else { if (!pimpl_->memory_copy_p2p) { - std::unique_ptr tmp(new uint8_t[size]); + std::unique_ptr tmp( + reinterpret_cast(new uint8_t[size])); MemoryCopyD2H(src_dev_id, tmp.get(), src, size); MemoryCopyH2D(dst_dev_id, dst, tmp.get(), size); } else { @@ -440,7 +441,8 @@ class CustomDevice : public DeviceInterface { PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( pimpl_->device_memory_set(device, ptr, value, size)); } else { - std::unique_ptr tmp(new uint8_t[size]); + std::unique_ptr tmp( + reinterpret_cast(new uint8_t[size])); memset(tmp.get(), value, size); MemoryCopyH2D(dev_id, ptr, tmp.get(), size); } diff --git a/paddle/phi/backends/device_ext.h b/paddle/phi/backends/device_ext.h index ff58f4f35fd32..77c9ee61858c1 100644 --- a/paddle/phi/backends/device_ext.h +++ b/paddle/phi/backends/device_ext.h @@ -25,6 +25,33 @@ extern "C" { #define PADDLE_CUSTOM_RUNTIME_MINOR_VERSION 1 #define PADDLE_CUSTOM_RUNTIME_PATCH_VERSION 1 +typedef enum { + UNDEFINED = 0, + BOOL, + UINT8, + UINT16, + UINT32, + UINT64, + INT8, + INT16, + INT32, + INT64, + FLOAT16, + FLOAT32, + FLOAT64, + BFLOAT16, +} C_DataType; + +typedef enum { + ANY = 0, + NHWC, + NCHW, + NCDHW, + NDHWC, + NUM_DATA_LAYOUTS, + ALL_LAYOUT = ANY, +} C_DataLayout; + typedef enum { C_SUCCESS = 0, // success C_WARNING, // results may not meet expectation (such as an asynchronous diff --git a/paddle/phi/capi/CMakeLists.txt b/paddle/phi/capi/CMakeLists.txt new file mode 100644 index 0000000000000..c00c38cfa3a8a --- /dev/null +++ b/paddle/phi/capi/CMakeLists.txt @@ -0,0 +1,13 @@ +add_subdirectory(lib) +cc_library( + phi_capi + SRCS all.cc + DEPS phi_c_data_type + phi_c_device_context + phi_c_int_array + phi_c_kernel_context + phi_c_kernel_factory + phi_c_kernel_registry + phi_c_place + phi_c_scalar + phi_c_tensor) diff --git a/paddle/phi/capi/all.cc b/paddle/phi/capi/all.cc new file mode 100644 index 0000000000000..3d9c9315b3136 --- /dev/null +++ b/paddle/phi/capi/all.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/capi/all.h" + +namespace paddle { +namespace capi {} // namespace capi +} // namespace paddle diff --git a/paddle/phi/capi/all.h b/paddle/phi/capi/all.h new file mode 100644 index 0000000000000..5bd31cafdf977 --- /dev/null +++ b/paddle/phi/capi/all.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/c_data_type.h" +#include "paddle/phi/capi/include/c_device_context.h" +#include "paddle/phi/capi/include/c_int_array.h" +#include "paddle/phi/capi/include/c_kernel_context.h" +#include "paddle/phi/capi/include/c_kernel_factory.h" +#include "paddle/phi/capi/include/c_kernel_registry.h" +#include "paddle/phi/capi/include/c_place.h" +#include "paddle/phi/capi/include/c_scalar.h" +#include "paddle/phi/capi/include/c_tensor.h" +#include "paddle/phi/capi/include/data_type.h" +#include "paddle/phi/capi/include/kernel_registry.h" + +#endif diff --git a/paddle/phi/capi/capi.h b/paddle/phi/capi/capi.h new file mode 100644 index 0000000000000..f8e5a90ddf883 --- /dev/null +++ b/paddle/phi/capi/capi.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/common.h" + +PD_DECLARE_CAPI(data_type); +PD_DECLARE_CAPI(device_context); +PD_DECLARE_CAPI(int_array); +PD_DECLARE_CAPI(kernel_context); +PD_DECLARE_CAPI(kernel_factory); +PD_DECLARE_CAPI(kernel_registry); +PD_DECLARE_CAPI(place); +PD_DECLARE_CAPI(scalar); +PD_DECLARE_CAPI(tensor); + +#endif diff --git a/paddle/phi/capi/include/c_data_type.h b/paddle/phi/capi/include/c_data_type.h new file mode 100644 index 0000000000000..e33d04705206c --- /dev/null +++ b/paddle/phi/capi/include/c_data_type.h @@ -0,0 +1,56 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include + +#include "paddle/phi/backends/device_ext.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef C_Status PD_Status; + +typedef C_DataType PD_DataType; + +typedef C_DataLayout PD_DataLayout; + +typedef struct { + size_t size; + void *data; +} PD_List; + +void PD_DeletePointerList(PD_List list); + +void PD_DeleteUInt8List(PD_List list); + +void PD_DeleteInt64List(PD_List list); + +void PD_DeleteInt32List(PD_List list); + +void PD_DeleteFloat64List(PD_List list); + +void PD_DeleteFloat32List(PD_List list); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/paddle/phi/capi/include/c_device_context.h b/paddle/phi/capi/include/c_device_context.h new file mode 100644 index 0000000000000..68621d58ad9d5 --- /dev/null +++ b/paddle/phi/capi/include/c_device_context.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
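`PD_List` above is the C API's generic sized buffer, and each `PD_Delete*List` releases a list whose payload has the matching element type. A consumption sketch; which deleter applies depends on the accessor that returned the list:

```cpp
#include <cstdint>
#include <vector>

#include "paddle/phi/capi/include/c_data_type.h"

// Copy an int64 payload out of a PD_List, then free the C-side buffer.
std::vector<int64_t> Int64ListToVector(PD_List list) {
  auto* data = reinterpret_cast<int64_t*>(list.data);
  std::vector<int64_t> out(data, data + list.size);
  PD_DeleteInt64List(list);
  return out;
}
```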
+ +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/c_data_type.h" +#include "paddle/phi/capi/include/c_tensor.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct PD_DeviceContext PD_DeviceContext; + +typedef C_Stream PD_Stream; + +PD_Stream PD_DeviceContextGetStream(const PD_DeviceContext *ctx, + PD_Status *status); + +void *PD_DeviceContextAllocateTensor(const PD_DeviceContext *ctx, + PD_Tensor *tensor, + size_t size, + PD_DataType dtype, + PD_Status *status); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/paddle/phi/capi/include/c_int_array.h b/paddle/phi/capi/include/c_int_array.h new file mode 100644 index 0000000000000..dbc13b3abea4f --- /dev/null +++ b/paddle/phi/capi/include/c_int_array.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/c_data_type.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct PD_IntArray PD_IntArray; + +PD_List PD_IntArrayGetDataPointer(PD_IntArray *int_array); + +size_t PD_IntArrayGetElementCount(PD_IntArray *int_array); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/paddle/phi/capi/include/c_kernel_context.h b/paddle/phi/capi/include/c_kernel_context.h new file mode 100644 index 0000000000000..c06cb3cd30086 --- /dev/null +++ b/paddle/phi/capi/include/c_kernel_context.h @@ -0,0 +1,93 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
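`PD_DeviceContextAllocateTensor` above is how plugin code obtains output storage bound to the device context's allocator. A hedged sketch; the FLOAT32 dtype and the unchecked status are illustrative shortcuts, and real code should test the status against `C_SUCCESS`:

```cpp
#include "paddle/phi/capi/include/c_device_context.h"

// Allocate num_elements floats for `out` through the opaque device context.
float* AllocFloatOutput(const PD_DeviceContext* ctx,
                        PD_Tensor* out,
                        size_t num_elements) {
  PD_Status status;
  void* ptr = PD_DeviceContextAllocateTensor(
      ctx, out, num_elements * sizeof(float), FLOAT32, &status);
  return reinterpret_cast<float*>(ptr);
}
```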
+ +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/c_data_type.h" +#include "paddle/phi/capi/include/c_device_context.h" +#include "paddle/phi/capi/include/c_int_array.h" +#include "paddle/phi/capi/include/c_place.h" +#include "paddle/phi/capi/include/c_scalar.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct PD_KernelContext PD_KernelContext; + +/** + * KernelContext + */ + +PD_DeviceContext *PD_KernelContextGetDeviceContext(PD_KernelContext *ctx); + +PD_Tensor *PD_KernelContextInputAt(PD_KernelContext *ctx, size_t index); + +// PD_Tensor *PD_KernelContextOptionalInputAt(PD_KernelContext *ctx, size_t +// index); + +PD_List PD_KernelContextMultiInputAt(PD_KernelContext *ctx, size_t index); + +PD_Tensor *PD_KernelContextOutputAt(PD_KernelContext *ctx, size_t index); + +PD_List PD_KernelContextMultiOutputAt(PD_KernelContext *ctx, size_t index); + +/** + * Attribute + */ + +bool PD_KernelContextBoolAttrAt(PD_KernelContext *ctx, size_t index); + +int32_t PD_KernelContextInt32AttrAt(PD_KernelContext *ctx, size_t index); + +int64_t PD_KernelContextInt64AttrAt(PD_KernelContext *ctx, size_t index); + +float PD_KernelContextFloatAttrAt(PD_KernelContext *ctx, size_t index); + +double PD_KernelContextDoubleAttrAt(PD_KernelContext *ctx, size_t index); + +PD_Scalar *PD_KernelContextScalarAttrAt(PD_KernelContext *ctx, size_t index); + +PD_IntArray *PD_KernelContextIntArrayAttrAt(PD_KernelContext *ctx, + size_t index); + +PD_DataType PD_KernelContextDataTypeAttrAt(PD_KernelContext *ctx, size_t index); + +PD_DataLayout PD_KernelContextDataLayoutAttrAt(PD_KernelContext *ctx, + size_t index); + +char *PD_KernelContextStringAttrAt(PD_KernelContext *ctx, size_t index); + +PD_List PD_KernelContextListBoolAttrAt(PD_KernelContext *ctx, size_t index); + +PD_List PD_KernelContextListInt32AttrAt(PD_KernelContext *ctx, size_t index); + +PD_List PD_KernelContextListInt64AttrAt(PD_KernelContext *ctx, size_t index); + +PD_List PD_KernelContextListFloatAttrAt(PD_KernelContext *ctx, size_t index); + +PD_List PD_KernelContextListDoubleAttrAt(PD_KernelContext *ctx, size_t index); + +PD_List PD_KernelContextListStringAttrAt(PD_KernelContext *ctx, size_t index); + +PD_List PD_KernelContextListScalarAttrAt(PD_KernelContext *ctx, size_t index); + +PD_Place *PD_KernelContextPlaceAttrAt(PD_KernelContext *ctx, size_t index); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/paddle/phi/capi/include/c_kernel_factory.h b/paddle/phi/capi/include/c_kernel_factory.h new file mode 100644 index 0000000000000..f84f16ba52011 --- /dev/null +++ b/paddle/phi/capi/include/c_kernel_factory.h @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
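The accessors above are the raw surface a plugin kernel sees: inputs, outputs, and attributes are all fetched positionally from the `PD_KernelContext`. A sketch of a kernel body; the indices follow whatever signature the kernel was registered with, and the scale example itself is hypothetical:

```cpp
#include "paddle/phi/capi/include/c_kernel_context.h"

// Raw plugin kernel: one input tensor, one float attribute, one output.
void MyScaleKernel(PD_KernelContext* ctx) {
  PD_DeviceContext* dev_ctx = PD_KernelContextGetDeviceContext(ctx);
  PD_Tensor* x = PD_KernelContextInputAt(ctx, 0);
  float scale = PD_KernelContextFloatAttrAt(ctx, 0);
  PD_Tensor* out = PD_KernelContextOutputAt(ctx, 0);
  // ... allocate `out` via dev_ctx, then write scale * x into it ...
  (void)dev_ctx; (void)x; (void)scale; (void)out;
}
```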
+ +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/c_data_type.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct PD_KernelKey PD_KernelKey; + +typedef struct PD_Kernel PD_Kernel; + +typedef struct PD_KernelArgsDef PD_KernelArgsDef; + +typedef struct PD_TensorArgDef PD_TensorArgDef; + +/** + * TensorArgDef + */ + +void PD_TensorArgDefSetDataLayout(PD_TensorArgDef *def, + PD_DataLayout layout, + PD_Status *status); + +void PD_TensorArgDefSetDataType(PD_TensorArgDef *def, + PD_DataType dtype, + PD_Status *status); + +/** + * KernelArgsDef + */ + +PD_List PD_KernelArgsDefGetInputArgDefs(PD_KernelArgsDef *def, + PD_Status *status); + +PD_List PD_KernelArgsDefGetOutputArgDefs(PD_KernelArgsDef *def, + PD_Status *status); + +/** + * KernelKey + */ + +PD_DataLayout PD_KernelKeyGetLayout(PD_KernelKey *key, PD_Status *status); + +PD_DataType PD_KernelKeyGetDataType(PD_KernelKey *key, PD_Status *status); + +/** + * Kernel + */ + +PD_KernelArgsDef *PD_KernelGetArgsDef(PD_Kernel *kernel, PD_Status *status); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/paddle/phi/capi/include/c_kernel_registry.h b/paddle/phi/capi/include/c_kernel_registry.h new file mode 100644 index 0000000000000..04990be436be9 --- /dev/null +++ b/paddle/phi/capi/include/c_kernel_registry.h @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
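These factory hooks let an `args_def_fn` adjust a kernel's argument definitions at registration time. A hedged sketch that forces every output of a kernel to FLOAT32; the assumption that `PD_List.data` holds an array of `PD_TensorArgDef*` mirrors the other list-returning accessors in this API:

```cpp
#include "paddle/phi/capi/include/c_kernel_factory.h"

// args_def_fn-style hook: set the dtype of all output arg defs to FLOAT32.
void ForceFloatOutputs(const PD_KernelKey* key, PD_Kernel* kernel) {
  (void)key;
  PD_Status status;
  PD_KernelArgsDef* def = PD_KernelGetArgsDef(kernel, &status);
  PD_List outs = PD_KernelArgsDefGetOutputArgDefs(def, &status);
  auto** arg_defs = reinterpret_cast<PD_TensorArgDef**>(outs.data);
  for (size_t i = 0; i < outs.size; ++i) {
    PD_TensorArgDefSetDataType(arg_defs[i], FLOAT32, &status);
  }
}
```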
+ +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include + +#include "paddle/phi/capi/include/c_data_type.h" +#include "paddle/phi/capi/include/c_kernel_context.h" +#include "paddle/phi/capi/include/c_kernel_factory.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + PD_ARG_TYPE_CONTEXT = 0, + PD_ARG_TYPE_TENSOR, + PD_ARG_TYPE_BOOL, + PD_ARG_TYPE_BFLOAT16, + PD_ARG_TYPE_FLOAT16, + PD_ARG_TYPE_FLOAT32, + PD_ARG_TYPE_FLOAT64, + PD_ARG_TYPE_INT32, + PD_ARG_TYPE_INT64, + PD_ARG_TYPE_STRING, + PD_ARG_TYPE_SCALAR, + PD_ARG_TYPE_INT_ARRAY, + PD_ARG_TYPE_DATA_TYPE, + PD_ARG_TYPE_DATA_LAYOUT, + PD_ARG_TYPE_PLACE, + PD_ARG_TYPE_LIST_BOOL, + PD_ARG_TYPE_LIST_INT32, + PD_ARG_TYPE_LIST_INT64, + PD_ARG_TYPE_LIST_BFLOAT16, + PD_ARG_TYPE_LIST_FLOAT16, + PD_ARG_TYPE_LIST_FLOAT32, + PD_ARG_TYPE_LIST_FLOAT64, + PD_ARG_TYPE_LIST_STRING, + PD_ARG_TYPE_LIST_SCALAR, + PD_ARG_TYPE_OPTIONAL_TENSOR, + PD_ARG_TYPE_LIST_TENSOR, + PD_ARG_TYPE_OPTIONAL_MULTI_TENSOR, +} PD_KernelArgumentType; + +void PD_RegisterPhiKernel(const char *kernel_name_cstr, + const char *backend_cstr, + PD_DataType pd_dtype, + PD_DataLayout pd_layout, + size_t in_nargs, + PD_KernelArgumentType *in_args_type, + size_t attr_nargs, + PD_KernelArgumentType *attr_args_type, + size_t out_nargs, + PD_KernelArgumentType *out_args_type, + void (*args_def_fn)(const PD_KernelKey *, + PD_Kernel *), + void (*fn)(PD_KernelContext *), + void *variadic_kernel_fn); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/paddle/phi/capi/include/c_place.h b/paddle/phi/capi/include/c_place.h new file mode 100644 index 0000000000000..bbdc45cbe8d46 --- /dev/null +++ b/paddle/phi/capi/include/c_place.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/c_data_type.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct PD_Place PD_Place; + +bool PD_PlaceIsHost(PD_Place *place); + +int8_t PD_PlaceGetDeviceId(PD_Place *place); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/paddle/phi/capi/include/c_scalar.h b/paddle/phi/capi/include/c_scalar.h new file mode 100644 index 0000000000000..3ea3c3fc12c65 --- /dev/null +++ b/paddle/phi/capi/include/c_scalar.h @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
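`PD_RegisterPhiKernel` is the single C entry point behind kernel registration. A hedged sketch of a direct call; the `my_device` backend name and the null `variadic_kernel_fn` are assumptions, and real plugins would normally go through the C++ helpers in `kernel_registry.h` rather than this raw call:

```cpp
#include "paddle/phi/capi/include/c_kernel_registry.h"

void MyScaleKernel(PD_KernelContext* ctx);                           // kernel body
void ForceFloatOutputs(const PD_KernelKey* key, PD_Kernel* kernel);  // args_def_fn

// Register a float32 "scale" kernel with one input, one attr, one output.
void RegisterMyScale() {
  PD_KernelArgumentType ins[] = {PD_ARG_TYPE_TENSOR};
  PD_KernelArgumentType attrs[] = {PD_ARG_TYPE_FLOAT32};
  PD_KernelArgumentType outs[] = {PD_ARG_TYPE_TENSOR};
  PD_RegisterPhiKernel("scale", "my_device", FLOAT32, ALL_LAYOUT,
                       /*in_nargs=*/1, ins,
                       /*attr_nargs=*/1, attrs,
                       /*out_nargs=*/1, outs,
                       ForceFloatOutputs, MyScaleKernel,
                       /*variadic_kernel_fn=*/nullptr);
}
```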
+ +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/c_data_type.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct PD_Scalar PD_Scalar; + +bool PD_ScalarGetBoolData(PD_Scalar *scalar); + +int8_t PD_ScalarGetInt8Data(PD_Scalar *scalar); + +int16_t PD_ScalarGetInt16Data(PD_Scalar *scalar); + +int32_t PD_ScalarGetInt32Data(PD_Scalar *scalar); + +int64_t PD_ScalarGetInt64Data(PD_Scalar *scalar); + +uint8_t PD_ScalarGetUInt8Data(PD_Scalar *scalar); + +uint16_t PD_ScalarGetUInt16Data(PD_Scalar *scalar); + +uint32_t PD_ScalarGetUInt32Data(PD_Scalar *scalar); + +uint64_t PD_ScalarGetUInt64Data(PD_Scalar *scalar); + +float PD_ScalarGetFloat32Data(PD_Scalar *scalar); + +double PD_ScalarGetFloat64Data(PD_Scalar *scalar); + +PD_DataType PD_ScalarGetDataType(PD_Scalar *scalar); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/paddle/phi/capi/include/c_tensor.h b/paddle/phi/capi/include/c_tensor.h new file mode 100644 index 0000000000000..494346713cf53 --- /dev/null +++ b/paddle/phi/capi/include/c_tensor.h @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/c_data_type.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct PD_Tensor PD_Tensor; + +PD_DataType PD_TensorGetDataType(const PD_Tensor *tensor, PD_Status *status); + +PD_DataLayout PD_TensorGetDataLayout(const PD_Tensor *tensor, + PD_Status *status); + +int64_t PD_TensorGetByteSize(const PD_Tensor *tensor, PD_Status *status); + +void *PD_TensorGetDataPointer(const PD_Tensor *tensor, PD_Status *status); + +int64_t PD_TensorGetElementCount(const PD_Tensor *tensor, PD_Status *status); + +int64_t PD_TensorGetNumDims(const PD_Tensor *tensor, PD_Status *status); + +int64_t PD_TensorGetDim(const PD_Tensor *tensor, + size_t index, + PD_Status *status); + +void PD_TensorGetLoD(const PD_Tensor *tensor, + PD_List *data, + PD_List *offset, + PD_Status *status); + +bool PD_TensorIsInitialized(const PD_Tensor *tensor, PD_Status *status); + +bool PD_TensorIsValid(const PD_Tensor *tensor, PD_Status *status); + +void *PD_TensorGetHolder(const PD_Tensor *tensor, PD_Status *status); + +void PD_TensorSetDims(PD_Tensor *tensor, + int64_t ndims, + const int64_t *dims, + PD_Status *status); + +void PD_TensorSetDataType(PD_Tensor *tensor, + PD_DataType dtype, + PD_Status *status); + +void PD_TensorSetDataLayout(PD_Tensor *tensor, + PD_DataLayout layout, + PD_Status *status); + +void PD_TensorResetLoD(PD_Tensor *tensor, + PD_List data, + PD_List offset, + PD_Status *status); + +PD_Tensor *PD_NewTensor(); + +void PD_DeleteTensor(PD_Tensor *tensor); + +void PD_TensorShareDataWith(PD_Tensor *dst, + const PD_Tensor *src, + PD_Status *status); + +void PD_TensorShareLoDWith(PD_Tensor *dst, + const PD_Tensor *src, + PD_Status *status); + +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git 
a/paddle/phi/capi/include/common.h b/paddle/phi/capi/include/common.h new file mode 100644 index 0000000000000..2d2bc231f479b --- /dev/null +++ b/paddle/phi/capi/include/common.h @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#if !defined(_WIN32) && !defined(__APPLE__) + +#define PD_CUSTOM_PHI_KERNEL_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \ + _PD_CUSTOM_PHI_KERNEL_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) + +#define _PD_CUSTOM_PHI_KERNEL_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +#define PD_DECLARE_CAPI(module_name) \ + PD_CUSTOM_PHI_KERNEL_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + PD_DECLARE_tp_kernel_ns_check_##module_name##_, \ + "PD_DECLARE_KERNEL must be called in global namespace."); \ + extern int TouchCAPISymbolFor##module_name##_(); \ + UNUSED static int __declare_capi_symbol_for_##module_name##_ = \ + TouchCAPISymbolFor##module_name##_() + +#define PD_REGISTER_CAPI(module_name) \ + PD_CUSTOM_PHI_KERNEL_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + PD_DECLARE_tp_kernel_ns_check_##module_name##_, \ + "PD_DECLARE_KERNEL must be called in global namespace."); \ + int TouchCAPISymbolFor##module_name##_() { return 0; } + +#endif diff --git a/paddle/phi/capi/include/data_type.h b/paddle/phi/capi/include/data_type.h new file mode 100644 index 0000000000000..6acbf026e8cb6 --- /dev/null +++ b/paddle/phi/capi/include/data_type.h @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
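The two macros above implement a touch-symbol linking trick: `PD_REGISTER_CAPI(m)` defines `TouchCAPISymbolForm_()` in the module's object file, and `PD_DECLARE_CAPI(m)` plants a static initializer that calls it, so the linker cannot drop the module. A sketch of the pairing; the two lines must live in different translation units, since both expand the same name-check struct:

```cpp
// In the module's implementation file (e.g. paddle/phi/capi/lib/c_tensor.cc):
PD_REGISTER_CAPI(tensor);  // defines int TouchCAPISymbolFortensor_()

// In a consumer (capi.h does exactly this for every module):
PD_DECLARE_CAPI(tensor);   // extern declaration + static initializer that
                           // calls the touch symbol, forcing the link
```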
+ +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/c_data_type.h" + +namespace phi { + +namespace capi { + +#define CPP_TYPE_TO_PD_DTYPE_REGISTER(_) \ + _(bool, PD_DataType::BOOL) \ + _(phi::dtype::bfloat16, PD_DataType::BFLOAT16) \ + _(phi::dtype::float16, PD_DataType::FLOAT16) \ + _(float, PD_DataType::FLOAT32) \ + _(double, PD_DataType::FLOAT64) \ + _(uint8_t, PD_DataType::UINT8) \ + _(uint16_t, PD_DataType::UINT16) \ + _(uint32_t, PD_DataType::UINT32) \ + _(uint64_t, PD_DataType::UINT64) \ + _(int8_t, PD_DataType::INT8) \ + _(int16_t, PD_DataType::INT16) \ + _(int32_t, PD_DataType::INT32) \ + _(int64_t, PD_DataType::INT64) + +template +struct CppTypeToPDType; + +#define CPP_TYPE_TO_PD_DTYPE(x, y) \ + template <> \ + struct CppTypeToPDType { \ + constexpr static PD_DataType Type() { return y; } \ + }; + +template +struct PDTypeToCppType; + +#define PD_DTYPE_TO_CPP_TYPE(x, y) \ + template <> \ + struct PDTypeToCppType { \ + using type = x; \ + }; + +CPP_TYPE_TO_PD_DTYPE_REGISTER(CPP_TYPE_TO_PD_DTYPE) +CPP_TYPE_TO_PD_DTYPE_REGISTER(PD_DTYPE_TO_CPP_TYPE) + +} // namespace capi +} // namespace phi + +#endif diff --git a/paddle/phi/capi/include/kernel_registry.h b/paddle/phi/capi/include/kernel_registry.h new file mode 100644 index 0000000000000..37b045a60658b --- /dev/null +++ b/paddle/phi/capi/include/kernel_registry.h @@ -0,0 +1,338 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#if !defined(_WIN32) && !defined(__APPLE__) + +#include "paddle/phi/capi/include/wrapper_base.h" + +namespace phi { +namespace capi { + +inline phi::capi::DeviceContext PD_GetDeviceContext(PD_KernelContext *ctx) { + return phi::capi::DeviceContext(PD_KernelContextGetDeviceContext(ctx)); +} + +inline phi::capi::DenseTensor PD_InputAt(PD_KernelContext *ctx, size_t index) { + return phi::capi::DenseTensor(PD_KernelContextInputAt(ctx, index)); +} + +inline paddle::optional PD_OptionalInputAt( + PD_KernelContext *ctx, size_t index) { + auto tensor = PD_KernelContextInputAt(ctx, index); + return tensor + ? 
+             ? paddle::optional<phi::capi::DenseTensor>(phi::capi::DenseTensor(
+                   reinterpret_cast<PD_Tensor *>(tensor)))
+             : paddle::optional<phi::capi::DenseTensor>(paddle::none);
+}
+
+inline std::vector<phi::capi::DenseTensor> PD_MultiInputAt(
+    PD_KernelContext *ctx, size_t index) {
+  std::vector<phi::capi::DenseTensor> ret;
+  auto list = PD_KernelContextMultiInputAt(ctx, index);
+  auto data = reinterpret_cast<PD_Tensor **>(list.data);
+  for (size_t i = 0; i < list.size; ++i) {
+    ret.emplace_back(data[i]);
+  }
+  return ret;
+}
+
+inline phi::capi::DenseTensor PD_OutputAt(PD_KernelContext *ctx, size_t index) {
+  return phi::capi::DenseTensor(PD_KernelContextOutputAt(ctx, index));
+}
+
+inline std::vector<phi::capi::DenseTensor> PD_MultiOutputAt(
+    PD_KernelContext *ctx, size_t index) {
+  std::vector<phi::capi::DenseTensor> ret;
+  auto list = PD_KernelContextMultiOutputAt(ctx, index);
+  auto data = reinterpret_cast<PD_Tensor **>(list.data);
+  for (size_t i = 0; i < list.size; ++i) {
+    ret.emplace_back(data[i]);
+  }
+  return ret;
+}
+
+template <typename T>
+inline std::vector<T *> PD_GetPointerVector(std::vector<T> *vec) {
+  std::vector<T *> ret;
+  // Iterate over the pointee, not the pointer, taking each element's address.
+  for (auto &item : *vec) {
+    ret.push_back(&item);
+  }
+  return ret;
+}
+
+template <typename T>
+inline T PD_AttrAt(PD_KernelContext *ctx, size_t index);
+
+template <>
+inline bool PD_AttrAt<bool>(PD_KernelContext *ctx, size_t index) {
+  return PD_KernelContextBoolAttrAt(ctx, index);
+}
+
+template <>
+inline int32_t PD_AttrAt<int32_t>(PD_KernelContext *ctx, size_t index) {
+  return PD_KernelContextInt32AttrAt(ctx, index);
+}
+
+template <>
+inline int64_t PD_AttrAt<int64_t>(PD_KernelContext *ctx, size_t index) {
+  return PD_KernelContextInt64AttrAt(ctx, index);
+}
+
+template <>
+inline float PD_AttrAt<float>(PD_KernelContext *ctx, size_t index) {
+  return PD_KernelContextFloatAttrAt(ctx, index);
+}
+
+template <>
+inline double PD_AttrAt<double>(PD_KernelContext *ctx, size_t index) {
+  return PD_KernelContextDoubleAttrAt(ctx, index);
+}
+
+template <>
+inline std::string PD_AttrAt<std::string>(PD_KernelContext *ctx,
+                                          size_t index) {
+  return PD_KernelContextStringAttrAt(ctx, index);
+}
+
+template <>
+inline PD_DataType PD_AttrAt<PD_DataType>(PD_KernelContext *ctx,
+                                          size_t index) {
+  return PD_KernelContextDataTypeAttrAt(ctx, index);
+}
+
+template <>
+inline PD_DataLayout PD_AttrAt<PD_DataLayout>(PD_KernelContext *ctx,
+                                              size_t index) {
+  return PD_KernelContextDataLayoutAttrAt(ctx, index);
+}
+
+template <>
+inline std::vector<int32_t> PD_AttrAt<std::vector<int32_t>>(
+    PD_KernelContext *ctx, size_t index) {
+  auto list = PD_KernelContextListInt32AttrAt(ctx, index);
+  auto data = reinterpret_cast<int32_t *>(list.data);
+  std::vector<int32_t> cc_list(data, data + list.size);
+  return cc_list;
+}
+
+template <>
+inline std::vector<int64_t> PD_AttrAt<std::vector<int64_t>>(
+    PD_KernelContext *ctx, size_t index) {
+  auto list = PD_KernelContextListInt64AttrAt(ctx, index);
+  auto data = reinterpret_cast<int64_t *>(list.data);
+  std::vector<int64_t> cc_list(data, data + list.size);
+  return cc_list;
+}
+
+template <>
+inline std::vector<float> PD_AttrAt<std::vector<float>>(PD_KernelContext *ctx,
+                                                        size_t index) {
+  auto list = PD_KernelContextListFloatAttrAt(ctx, index);
+  auto data = reinterpret_cast<float *>(list.data);
+  std::vector<float> cc_list(data, data + list.size);
+  return cc_list;
+}
+
+template <>
+inline std::vector<double> PD_AttrAt<std::vector<double>>(
+    PD_KernelContext *ctx, size_t index) {
+  auto list = PD_KernelContextListDoubleAttrAt(ctx, index);
+  auto data = reinterpret_cast<double *>(list.data);
+  std::vector<double> cc_list(data, data + list.size);
+  return cc_list;
+}
+
+template <>
+inline phi::capi::Scalar PD_AttrAt<phi::capi::Scalar>(PD_KernelContext *ctx,
+                                                      size_t index) {
+  auto scalar = PD_KernelContextScalarAttrAt(ctx, index);
+  return phi::capi::Scalar(scalar);
+}
+
+template <>
+inline phi::capi::IntArray PD_AttrAt<phi::capi::IntArray>(
+    PD_KernelContext *ctx, size_t index) {
+  auto int_array = PD_KernelContextIntArrayAttrAt(ctx, index);
+  return phi::capi::IntArray(int_array);
+}
+
+template <>
+inline phi::capi::Place PD_AttrAt<phi::capi::Place>(PD_KernelContext *ctx,
+                                                    size_t index) {
+  auto place = PD_KernelContextPlaceAttrAt(ctx, index);
+  return phi::capi::Place(place);
+}
+
+template <>
+inline std::vector<phi::capi::Scalar> PD_AttrAt<std::vector<phi::capi::Scalar>>(
+    PD_KernelContext *ctx, size_t index) {
+  auto c_list = PD_KernelContextListScalarAttrAt(ctx, index);
+  auto data = reinterpret_cast<PD_Scalar **>(c_list.data);
+  std::vector<phi::capi::Scalar> list;
+  for (size_t i = 0; i < c_list.size; ++i) {
+    list.emplace_back(data[i]);
+  }
+  PD_DeletePointerList(c_list);
+  return list;
+}
+
+template <>
+inline std::vector<bool> PD_AttrAt<std::vector<bool>>(PD_KernelContext *ctx,
+                                                      size_t index) {
+  auto c_list = PD_KernelContextListBoolAttrAt(ctx, index);
+  std::vector<bool> list;
+  list.reserve(c_list.size);
+  auto data = reinterpret_cast<uint8_t *>(c_list.data);
+  for (size_t i = 0; i < c_list.size; ++i) {
+    list.emplace_back(static_cast<bool>(data[i]));
+  }
+  PD_DeleteUInt8List(c_list);
+  return list;
+}
+
+#define CPP_TYPE_TO_PD_ARG_TYPE_REGISTER(_)                                   \
+  _(phi::capi::DenseTensor, ::PD_KernelArgumentType::PD_ARG_TYPE_TENSOR)      \
+  _(phi::capi::DeviceContext, ::PD_KernelArgumentType::PD_ARG_TYPE_CONTEXT)   \
+  _(bool, ::PD_KernelArgumentType::PD_ARG_TYPE_BOOL)                          \
+  _(float, ::PD_KernelArgumentType::PD_ARG_TYPE_FLOAT32)                      \
+  _(double, ::PD_KernelArgumentType::PD_ARG_TYPE_FLOAT64)                     \
+  _(int32_t, ::PD_KernelArgumentType::PD_ARG_TYPE_INT32)                      \
+  _(int64_t, ::PD_KernelArgumentType::PD_ARG_TYPE_INT64)                      \
+  _(PD_DataType, ::PD_KernelArgumentType::PD_ARG_TYPE_DATA_TYPE)              \
+  _(PD_DataLayout, ::PD_KernelArgumentType::PD_ARG_TYPE_DATA_LAYOUT)          \
+  _(std::vector<int32_t>, ::PD_KernelArgumentType::PD_ARG_TYPE_LIST_INT32)    \
+  _(std::vector<int64_t>, ::PD_KernelArgumentType::PD_ARG_TYPE_LIST_INT64)    \
+  _(std::vector<float>, ::PD_KernelArgumentType::PD_ARG_TYPE_LIST_FLOAT32)    \
+  _(std::vector<double>, ::PD_KernelArgumentType::PD_ARG_TYPE_LIST_FLOAT64)   \
+  _(std::vector<bool>, ::PD_KernelArgumentType::PD_ARG_TYPE_LIST_BOOL)        \
+  _(std::string, ::PD_KernelArgumentType::PD_ARG_TYPE_STRING)                 \
+  _(phi::capi::Scalar, ::PD_KernelArgumentType::PD_ARG_TYPE_SCALAR)           \
+  _(phi::capi::IntArray, ::PD_KernelArgumentType::PD_ARG_TYPE_INT_ARRAY)      \
+  _(phi::capi::Place, ::PD_KernelArgumentType::PD_ARG_TYPE_PLACE)             \
+  _(std::vector<std::string>,                                                 \
+    ::PD_KernelArgumentType::PD_ARG_TYPE_LIST_STRING)                         \
+  _(std::vector<phi::capi::Scalar>,                                           \
+    ::PD_KernelArgumentType::PD_ARG_TYPE_LIST_SCALAR)
+
+template <typename T>
+struct CppTypeToPDArgumentType;
+
+#define CPP_TYPE_TO_PD_ARG_TYPE(x, y)                               \
+  template <>                                                       \
+  struct CppTypeToPDArgumentType<x> {                               \
+    constexpr static ::PD_KernelArgumentType Type() { return y; }   \
+  };
+
+template <::PD_KernelArgumentType T>
+struct PDArgumentTypeToCppType;
+
+#define PD_ARG_TYPE_TO_CPP_TYPE(x, y) \
+  template <>                         \
+  struct PDArgumentTypeToCppType<y> { \
+    using type = x;                   \
+  };
+
+CPP_TYPE_TO_PD_ARG_TYPE_REGISTER(CPP_TYPE_TO_PD_ARG_TYPE)
+CPP_TYPE_TO_PD_ARG_TYPE_REGISTER(PD_ARG_TYPE_TO_CPP_TYPE)
+
+}  // namespace capi
+
+using LoD = capi::LoD;
+using Context = capi::DeviceContext;
+using DenseTensor = capi::DenseTensor;
+using Scalar = capi::Scalar;
+using IntArray = capi::IntArray;
+using Place = capi::Place;
+using DataType = ::PD_DataType;
+using DataLayout = ::PD_DataLayout;
+
+}  // namespace phi
+
+#include "paddle/phi/capi/include/kernel_utils.h"
+
+// clang-format
off + +#define PD_BUILD_PHI_KERNEL(kernel_name, \ + backend, \ + layout, \ + meta_kernel_fn, \ + ...) \ + static void \ + __CUSTOM_adefs_CFN_##kernel_name##_##backend##_##layout( \ + const PD_KernelKey* kernel_key, PD_Kernel* kernel); \ + template \ + struct __##kernel_name##_##backend##_##layout##__ { \ + __##kernel_name##_##backend##_##layout##__() { \ + ::phi::capi::CustomKernelArgsParseFunctor)> \ + parser; \ + PD_RegisterPhiKernel( \ + #kernel_name, \ + #backend, \ + ::phi::capi::CppTypeToPDType::Type(), \ + PD_DATALAYOUT(layout), \ + parser.in_args_type.size(), \ + parser.in_args_type.data(), \ + parser.attr_args_type.size(), \ + parser.attr_args_type.data(), \ + parser.out_args_type.size(), \ + parser.out_args_type.data(), \ + __CUSTOM_adefs_CFN_##kernel_name##_##backend##_##layout, \ + CUSTOM_PHI_KERNEL(meta_kernel_fn), \ + CUSTOM_PHI_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ + } \ + static void Touch() {} \ + }; \ + PD_CUSTOM_PHI_KERNEL_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + CUSTOM_tp_ns_check_##kernel_name##_##backend##_##layout, \ + "PD_BUILD_KERNEL must be called in global namespace."); \ + static void \ + __CUSTOM_adefs_FN_##kernel_name##_##backend##_##layout( \ + const ::phi::capi::KernelKey &kernel_key, \ + ::phi::capi::Kernel* kernel); \ + _PD_BUILD_PHI_KERNEL(__##kernel_name##_##backend##_##layout##__, \ + kernel_name, \ + backend, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__) \ + void \ + __CUSTOM_adefs_CFN_##kernel_name##_##backend##_##layout( \ + const PD_KernelKey* kernel_key, PD_Kernel* kernel) { \ + auto cc_kernel = ::phi::capi::Kernel(kernel); \ + __CUSTOM_adefs_FN_##kernel_name##_##backend##_##layout( \ + ::phi::capi::KernelKey( \ + const_cast(kernel_key)), \ + &cc_kernel); \ + } \ + void \ + __CUSTOM_adefs_FN_##kernel_name##_##backend##_##layout( \ + const ::phi::capi::KernelKey &kernel_key, \ + ::phi::capi::Kernel* kernel) + +// clang-format on + +#endif diff --git a/paddle/phi/capi/include/kernel_utils.h b/paddle/phi/capi/include/kernel_utils.h new file mode 100644 index 0000000000000..7302e6f4677b3 --- /dev/null +++ b/paddle/phi/capi/include/kernel_utils.h @@ -0,0 +1,812 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/capi/include/common.h" + +#if !defined(_WIN32) && !defined(__APPLE__) + +namespace phi { +namespace capi { + +#define CUSTOM_PHI_KERNEL(...) \ + ::phi::capi::CustomKernelImpl::Compute + +#define CUSTOM_PHI_VARIADIC_KERNEL(...) \ + reinterpret_cast( \ + &::phi::capi::CustomKernelImpl::VariadicCompute) + +#define PD_CUSTOM_NARGS(...) \ + _PD_CUSTOM_NARGS((__VA_ARGS__, _PD_CUSTOM_RESQ_N())) +#define _PD_CUSTOM_NARGS(...) _PD_CUSTOM_ARG_N(__VA_ARGS__) +#define _PD_CUSTOM_ARG_N_EXPAND( \ + _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, N, ...) 
\ + N +#define _PD_CUSTOM_ARG_N(args) _PD_CUSTOM_ARG_N_EXPAND args +#define _PD_CUSTOM_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +#define PD_DATALAYOUT(arg__) PD_DataLayout::arg__ + +#ifdef __COUNTER__ +#define PD_CUSTOM_PHI_KERNEL_ID __COUNTER__ +#else +#define PD_CUSTOM_PHI_KERNEL_ID __LINE__ +#endif + +#define PD_CUSTOM_PHI_KERNEL_CONCATENATE(arg1, arg2) \ + PD_CUSTOM_PHI_KERNEL_CONCATENATE1(arg1, arg2) +#define PD_CUSTOM_PHI_KERNEL_CONCATENATE1(arg1, arg2) \ + PD_CUSTOM_PHI_KERNEL_CONCATENATE2(arg1, arg2) +#define PD_CUSTOM_PHI_KERNEL_CONCATENATE2(arg1, arg2) arg1##arg2 +#define PD_CUSTOM_PHI_KERNEL_EXPAND(x) x + +#define _PD_BUILD_KERNEL_INSTANTIATION(N, meta_kernel_fn, backend, ...) \ + PD_CUSTOM_PHI_KERNEL_CONCATENATE(_PD_BUILD_KERNEL_INSTANTIATION_, N) \ + (meta_kernel_fn, backend, __VA_ARGS__) + +#define _PD_BUILD_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, cpp_dtype) \ + template decltype(meta_kernel_fn) meta_kernel_fn +#define _PD_BUILD_KERNEL_INSTANTIATION_2( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_3( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_4( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_5( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_6( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_7( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_8( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_9( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_10( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_11( \ + meta_kernel_fn, backend, cpp_dtype, ...) 
\ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_12( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_13( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_14( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PD_BUILD_KERNEL_INSTANTIATION_15( \ + meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) meta_kernel_fn; \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_1(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + int TouchCustomKernelSymbolFor_##kernel_name##_##backend##_##layout() { \ + return 0; \ + } + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_2(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_1(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_3(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_2(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_4(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_3(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_5(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) 
\ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_4(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_6(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_5(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_7(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_6(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_8(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_7(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_9(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_8(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_10(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_9(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_11(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_10(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_12(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) 
\ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_11(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_13(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_12(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_14(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_13(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT_15(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + registrar_id, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const registrar_class PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id); \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_14(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define _PD_BUILD_KERNEL_REGISTRAR_INIT( \ + N, registrar_class, kernel_name, backend, layout, meta_kernel_fn, ...) \ + PD_CUSTOM_PHI_KERNEL_EXPAND(PD_CUSTOM_PHI_KERNEL_CONCATENATE( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT_, N)(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + PD_CUSTOM_PHI_KERNEL_ID, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define PD_BUILD_KERNEL_REGISTRAR_INIT( \ + registrar_class, kernel_name, backend, layout, meta_kernel_fn, ...) \ + PD_CUSTOM_PHI_KERNEL_EXPAND( \ + _PD_BUILD_KERNEL_REGISTRAR_INIT(PD_CUSTOM_NARGS(__VA_ARGS__), \ + registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define PD_BUILD_KERNEL_INSTANTIATION(meta_kernel_fn, backend, ...) \ + _PD_BUILD_KERNEL_INSTANTIATION( \ + PD_CUSTOM_NARGS(__VA_ARGS__), meta_kernel_fn, backend, __VA_ARGS__) + +#define _PD_BUILD_2TA_KERNEL( \ + registrar_class, kernel_name, backend, layout, meta_kernel_fn, ...) \ + PD_BUILD_KERNEL_INSTANTIATION(meta_kernel_fn, backend, __VA_ARGS__); \ + PD_BUILD_KERNEL_REGISTRAR_INIT(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__); + +#define _PD_BUILD_PHI_KERNEL( \ + registrar_class, kernel_name, backend, layout, meta_kernel_fn, ...) 
\ + PD_CUSTOM_PHI_KERNEL_EXPAND(_PD_BUILD_2TA_KERNEL(registrar_class, \ + kernel_name, \ + backend, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__)) + +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_DEVICE_CONTEXT(dev_ctx) \ + template \ + struct CustomKernelCallHelper { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + static_assert(in_idx == 0, \ + "Kernel's DeviceContext should appear before Inputs."); \ + static_assert( \ + attr_idx == 0, \ + "Kernel's DeviceContext should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's DeviceContext should appear before Outputs."); \ + dev_ctx arg = PD_GetDeviceContext(ctx); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ + } + +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_INPUT(tensor_type) \ + template \ + struct CustomKernelCallHelper { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + static_assert(attr_idx == 0, \ + "Kernel's Input should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + const tensor_type arg = PD_InputAt(ctx, in_idx); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ + } + +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_OPTIONAL_INPUT(tensor_type) \ + template \ + struct CustomKernelCallHelper &, \ + Tail...> { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + static_assert(attr_idx == 0, \ + "Kernel's Input should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + auto arg = PD_OptionalInputAt(ctx, in_idx); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ + } + +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_MULTI_INPUT(tensor_type) \ + template \ + struct CustomKernelCallHelper &, \ + Tail...> { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + static_assert(attr_idx == 0, \ + "Kernel's Input should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + auto arg = PD_MultiInputAt(ctx, in_idx); \ + auto arg_wrapper = PD_GetPointerVector(&arg); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg_wrapper); \ + } \ + } + +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(attr_type) \ + template \ + struct CustomKernelCallHelper { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + static_assert(out_idx == 0, \ + "Kernel's Attributes should appear before Outputs."); \ + attr_type arg = PD_AttrAt(ctx, attr_idx); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ + } + +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( \ + attr_type) \ + template \ + struct CustomKernelCallHelper { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + static_assert(out_idx == 0, \ + "Kernel's Attributes should appear before Outputs."); \ + attr_type arg = PD_AttrAt(ctx, attr_idx); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ + } + +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_OUTPUT(tensor_type) \ + template \ + struct CustomKernelCallHelper { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + auto arg = PD_OutputAt(ctx, 
out_idx); \ + tensor_type *ptr = (arg.raw_data() ? &arg : nullptr); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., ptr); \ + } \ + } + +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_MULTI_OUTPUT(tensor_type) \ + template \ + struct CustomKernelCallHelper, Tail...> { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + auto arg = PD_MultiOutputAt(ctx, out_idx); \ + auto arg_wrapper = PD_GetPointerVector(&arg); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg_wrapper); \ + } \ + } + +template +struct CustomTypeTag {}; + +template +struct CustomKernelImpl; + +template +struct CustomKernelImpl { + static void Compute(PD_KernelContext *ctx) { + CustomKernelCallHelper>:: + template Compute<0, 0, 0, 0>(ctx); + } + + static void VariadicCompute(const phi::capi::DeviceContext &dev_ctx, + Args... args) { + return kernel_fn(static_cast(dev_ctx), std::forward(args)...); + } + + private: + template + struct CustomKernelCallHelper; + + /* DeviceContext Helpers */ + + PD_SPECIALIZE_CustomKernelCallHelper_FOR_DEVICE_CONTEXT( + phi::capi::DeviceContext); + + /* Input Helpers */ + + PD_SPECIALIZE_CustomKernelCallHelper_FOR_INPUT(phi::capi::DenseTensor); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_OPTIONAL_INPUT( + phi::capi::DenseTensor); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_MULTI_INPUT(phi::capi::DenseTensor); + + /* Attribute Helpers */ + + PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(bool); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(int32_t); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(int64_t); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(float); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(double); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(PD_DataType); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(PD_DataLayout); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(phi::capi::Place); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( + std::vector); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( + std::vector); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( + std::vector); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( + std::vector); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( + std::vector); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF(std::string); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( + std::vector); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( + phi::capi::Scalar); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( + phi::capi::IntArray); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( + std::vector); + + /* Output Helpers */ + + PD_SPECIALIZE_CustomKernelCallHelper_FOR_OUTPUT(phi::capi::DenseTensor); + PD_SPECIALIZE_CustomKernelCallHelper_FOR_MULTI_OUTPUT(phi::capi::DenseTensor); + + /* End case */ + template + struct CustomKernelCallHelper> { + template + static void Compute(PD_KernelContext *ctx, DevCtx dev_ctx, Args &...args) { + static_assert(dev_ctx_idx > 0, + "Kernel should pass DeviceContext as argument."); + static_assert(out_idx > 0, "Kernel should have output argument."); + return kernel_fn(dev_ctx, args...); + } + }; +}; + +template +struct CustomKernelArgsParseFunctor; + +template +struct CustomKernelArgsParseFunctor { + using Args = std::tuple; + enum : std::size_t { Arity = sizeof...(Args_) }; + using Indices = std::make_index_sequence; + template + using Arg 
= typename std::tuple_element::type; + + CustomKernelArgsParseFunctor() { + auto args_type = ParseArgType(Indices{}); + + for (auto arg_type : args_type) { + if (arg_type == + std::type_index(typeid(const phi::capi::DeviceContext *))) { + } else if (arg_type == + std::type_index(typeid(const phi::capi::DenseTensor &))) { + in_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_TENSOR); + } else if (arg_type == + std::type_index(typeid( + const paddle::optional &))) { + in_args_type.push_back( + PD_KernelArgumentType::PD_ARG_TYPE_OPTIONAL_TENSOR); + } else if (arg_type == + std::type_index(typeid( + const std::vector &))) { + in_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_LIST_TENSOR); + } else if (arg_type == + std::type_index( + typeid(const paddle::optional< + std::vector> &))) { + in_args_type.push_back( + PD_KernelArgumentType::PD_ARG_TYPE_OPTIONAL_MULTI_TENSOR); + } else if (arg_type == std::type_index(typeid(bool))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_BOOL); + } else if (arg_type == std::type_index(typeid(float))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_FLOAT32); + } else if (arg_type == std::type_index(typeid(double))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_FLOAT64); + } else if (arg_type == std::type_index(typeid(int32_t))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_INT32); + } else if (arg_type == std::type_index(typeid(int64_t))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_INT64); + } else if (arg_type == + std::type_index(typeid(const phi::capi::Place &))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_PLACE); + } else if (arg_type == std::type_index(typeid(const std::string &))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_STRING); + } else if (arg_type == + std::type_index(typeid(const std::vector &))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_LIST_BOOL); + } else if (arg_type == + std::type_index(typeid(const std::vector &))) { + attr_args_type.push_back( + PD_KernelArgumentType::PD_ARG_TYPE_LIST_FLOAT32); + } else if (arg_type == + std::type_index(typeid(const std::vector &))) { + attr_args_type.push_back( + PD_KernelArgumentType::PD_ARG_TYPE_LIST_FLOAT64); + } else if (arg_type == + std::type_index(typeid(const std::vector &))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_LIST_INT32); + } else if (arg_type == + std::type_index(typeid(const std::vector &))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_LIST_INT64); + } else if (arg_type == + std::type_index(typeid(const std::vector &))) { + attr_args_type.push_back( + PD_KernelArgumentType::PD_ARG_TYPE_LIST_STRING); + } else if (arg_type == std::type_index(typeid( + const std::vector &))) { + attr_args_type.push_back( + PD_KernelArgumentType::PD_ARG_TYPE_LIST_SCALAR); + } else if (arg_type == + std::type_index(typeid(const phi::capi::Scalar &))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_SCALAR); + } else if (arg_type == + std::type_index(typeid(const phi::capi::IntArray &))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_INT_ARRAY); + } else if (arg_type == std::type_index(typeid(PD_DataType))) { + attr_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_DATA_TYPE); + } else if (arg_type == std::type_index(typeid(PD_DataLayout))) { + attr_args_type.push_back( + PD_KernelArgumentType::PD_ARG_TYPE_DATA_LAYOUT); + } else if (arg_type == 
+                 std::type_index(typeid(phi::capi::DenseTensor *))) {
+        out_args_type.push_back(PD_KernelArgumentType::PD_ARG_TYPE_TENSOR);
+      } else if (arg_type == std::type_index(typeid(
+                                 std::vector<phi::capi::DenseTensor *>))) {
+        out_args_type.push_back(
+            PD_KernelArgumentType::PD_ARG_TYPE_LIST_TENSOR);
+      }
+    }
+  }
+
+  std::vector<PD_KernelArgumentType> in_args_type;
+  std::vector<PD_KernelArgumentType> attr_args_type;
+  std::vector<PD_KernelArgumentType> out_args_type;
+
+ private:
+  template <std::size_t... INDEX>
+  static std::vector<std::type_index> ParseArgType(
+      std::index_sequence<INDEX...>) {
+    return {std::type_index(typeid(Arg<INDEX>))...};
+  }
+};
+
+}  // namespace capi
+}  // namespace phi
+
+#endif
diff --git a/paddle/phi/capi/include/type_utils.h b/paddle/phi/capi/include/type_utils.h
new file mode 100644
index 0000000000000..ed892c881d715
--- /dev/null
+++ b/paddle/phi/capi/include/type_utils.h
@@ -0,0 +1,123 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#if !defined(_WIN32) && !defined(__APPLE__)
+
+#include "paddle/phi/capi/include/c_data_type.h"
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/common/layout.h"
+#include "paddle/phi/core/enforce.h"
+
+namespace phi {
+namespace capi {
+
+inline PD_DataType ToPDDataType(::paddle::experimental::DataType dtype) {
+#define return_result(in, ret)               \
+  case ::paddle::experimental::DataType::in: \
+    return PD_DataType::ret
+  switch (dtype) {
+    return_result(UNDEFINED, UNDEFINED);
+    return_result(FLOAT64, FLOAT64);
+    return_result(FLOAT32, FLOAT32);
+    return_result(FLOAT16, FLOAT16);
+    return_result(BFLOAT16, BFLOAT16);
+    return_result(INT64, INT64);
+    return_result(INT32, INT32);
+    return_result(INT16, INT16);
+    return_result(INT8, INT8);
+    return_result(UINT64, UINT64);
+    return_result(UINT32, UINT32);
+    return_result(UINT16, UINT16);
+    return_result(UINT8, UINT8);
+    return_result(BOOL, BOOL);
+    default: {
+      PADDLE_THROW(
+          ::phi::errors::Unavailable("DataType %d is not supported.", dtype));
+    }
+  }
+#undef return_result
+}
+
+inline ::paddle::experimental::DataType ToPhiDataType(PD_DataType dtype) {
+#define return_result(in, ret) \
+  case PD_DataType::in:        \
+    return ::paddle::experimental::DataType::ret
+  switch (dtype) {
+    return_result(UNDEFINED, UNDEFINED);
+    return_result(FLOAT64, FLOAT64);
+    return_result(FLOAT32, FLOAT32);
+    return_result(FLOAT16, FLOAT16);
+    return_result(BFLOAT16, BFLOAT16);
+    return_result(INT64, INT64);
+    return_result(INT32, INT32);
+    return_result(INT16, INT16);
+    return_result(INT8, INT8);
+    return_result(UINT64, UINT64);
+    return_result(UINT32, UINT32);
+    return_result(UINT16, UINT16);
+    return_result(UINT8, UINT8);
+    return_result(BOOL, BOOL);
+    default: {
+      PADDLE_THROW(
+          ::phi::errors::Unavailable("DataType %d is not supported.", dtype));
+      return ::paddle::experimental::DataType::UNDEFINED;
+    }
+  }
+#undef return_result
+}
+
+inline PD_DataLayout ToPDDataLayout(::paddle::experimental::DataLayout layout) {
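+  // `return_result` is a local X-macro: each use expands to a `case` label
+  // mapping one enumerator to its counterpart in the other enum; it is
+  // #undef'd at the end of the function so the name does not leak.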
+#define return_result(in, ret) \ + case ::paddle::experimental::DataLayout::in: \ + return PD_DataLayout::ret + switch (layout) { + return_result(ANY, ANY); + return_result(NHWC, NHWC); + return_result(NCHW, NCHW); + return_result(NCDHW, NCDHW); + return_result(NDHWC, NDHWC); + default: { + PADDLE_THROW(::phi::errors::Unavailable("DataLayout %d is not supported.", + layout)); + return PD_DataLayout::ANY; + } + } +#undef return_result +} + +inline ::paddle::experimental::DataLayout ToPhiDataLayout( + PD_DataLayout layout) { +#define return_result(in, ret) \ + case PD_DataLayout::in: \ + return ::paddle::experimental::DataLayout::ret + switch (layout) { + return_result(ANY, ANY); + return_result(NHWC, NHWC); + return_result(NCHW, NCHW); + return_result(NCDHW, NCDHW); + return_result(NDHWC, NDHWC); + default: { + PADDLE_THROW(::phi::errors::Unavailable("DataLayout %d is not supported.", + layout)); + return ::paddle::experimental::DataLayout::ANY; + } + } +#undef return_result +} + +} // namespace capi +} // namespace phi + +#endif diff --git a/paddle/phi/capi/include/wrapper_base.h b/paddle/phi/capi/include/wrapper_base.h new file mode 100644 index 0000000000000..2b5421bc266cf --- /dev/null +++ b/paddle/phi/capi/include/wrapper_base.h @@ -0,0 +1,497 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
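The four converters above are what the capi/lib implementation files use to bridge enum values across the C boundary. A minimal sketch of the intended round-trip:

// Round-trip through the C enum (values follow the mappings above):
PD_DataType c_dtype =
    phi::capi::ToPDDataType(::paddle::experimental::DataType::FLOAT32);
auto phi_dtype = phi::capi::ToPhiDataType(c_dtype);  // back to FLOAT32
// Enumerators outside the mapped set hit the default branch and raise
// phi::errors::Unavailable through PADDLE_THROW.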
+ +#pragma once + +#if !defined(_WIN32) && !defined(__APPLE__) + +#include +#include +#include +#include +#include +#include + +#include "paddle/phi/api/ext/exception.h" +#include "paddle/phi/capi/include/c_device_context.h" +#include "paddle/phi/capi/include/c_int_array.h" +#include "paddle/phi/capi/include/c_kernel_context.h" +#include "paddle/phi/capi/include/c_kernel_factory.h" +#include "paddle/phi/capi/include/c_kernel_registry.h" +#include "paddle/phi/capi/include/c_place.h" +#include "paddle/phi/capi/include/c_scalar.h" +#include "paddle/phi/capi/include/c_tensor.h" +#include "paddle/phi/capi/include/data_type.h" +#include "paddle/utils/optional.h" + +#define PD_CHECK_STATUS(status) PD_CHECK(status == C_SUCCESS) + +namespace phi { + +namespace capi { + +using LoD = std::vector>; + +template +static inline PD_List PDListFromVector(std::vector* vec) { + PD_List list; + list.data = reinterpret_cast(vec->data()); + list.size = vec->size(); + return list; +} + +template +static inline std::vector PDListToVector(PD_List list) { + return std::vector(static_cast(list.data), + static_cast(list.data) + list.size); +} + +inline std::vector PD_TensorGetDims(PD_Tensor* tensor, + PD_Status* status) { + int64_t ndims = PD_TensorGetNumDims(tensor, status); + if (ndims > 0) { + std::vector shape(ndims); + for (int64_t i = 0; i < ndims; ++i) { + shape[i] = PD_TensorGetDim(tensor, i, status); + } + return shape; + } + return std::vector(); +} + +template +class WrapperBase { + public: + explicit WrapperBase(T* ptr, bool own = false) : data_(ptr), own_(own) {} + + inline T* raw_data() const { return data_; } + + inline bool own_data() const { return own_; } + + inline void reset(const T* ptr) { data_ = ptr; } + + private: + T* data_; + bool own_; +}; + +class DenseTensor : public WrapperBase { + public: + DenseTensor() : WrapperBase(PD_NewTensor(), true) {} + + explicit DenseTensor(PD_Tensor* tensor) : WrapperBase(tensor) {} + + ~DenseTensor() { + if (own_data()) { + PD_DeleteTensor(raw_data()); + } + } + + bool valid() const { + C_Status status; + auto ret = PD_TensorIsValid(raw_data(), &status); + PD_CHECK_STATUS(status); + return ret; + } + + bool initialized() const { + C_Status status; + auto ret = PD_TensorIsInitialized(raw_data(), &status); + PD_CHECK_STATUS(status); + return ret; + } + + void* Holder() const { + C_Status status; + auto holder = PD_TensorGetHolder(raw_data(), &status); + PD_CHECK_STATUS(status); + return holder; + } + + std::vector dims() const { + C_Status status; + auto dimension = PD_TensorGetDims(raw_data(), &status); + PD_CHECK_STATUS(status); + return dimension; + } + + PD_DataType dtype() const { + C_Status status; + auto data_type = PD_TensorGetDataType(raw_data(), &status); + PD_CHECK_STATUS(status); + return data_type; + } + + PD_DataLayout layout() const { + C_Status status; + auto data_layout = PD_TensorGetDataLayout(raw_data(), &status); + PD_CHECK_STATUS(status); + return data_layout; + } + + int64_t numel() const { + C_Status status; + auto element_count = PD_TensorGetElementCount(raw_data(), &status); + PD_CHECK_STATUS(status); + return element_count; + } + + int64_t memory_size() const { + C_Status status; + auto byte_size = PD_TensorGetByteSize(raw_data(), &status); + PD_CHECK_STATUS(status); + return byte_size; + } + + LoD lod() const { + PD_List data, offset; + C_Status status; + PD_TensorGetLoD(raw_data(), &data, &offset, &status); + PD_CHECK_STATUS(status); + LoD lod_; + auto ptr = static_cast(data.data); + auto offset_ptr = static_cast(offset.data); + for 
(size_t i = 0; i < offset.size - 1; ++i) { + lod_.emplace_back(ptr + offset_ptr[i], ptr + offset_ptr[i + 1]); + } + delete[] ptr; + delete[] offset_ptr; + return lod_; + } + + void ResetLoD(const LoD& lod) { + std::vector data, offset; + offset.push_back(0); + for (const auto& item : lod) { + data.insert(data.cend(), item.cbegin(), item.cend()); + offset.push_back(item.size()); + } + PD_List data_list, offset_list; + data_list = PDListFromVector(&data); + offset_list = PDListFromVector(&offset); + + C_Status status; + PD_TensorResetLoD(raw_data(), data_list, offset_list, &status); + PD_CHECK_STATUS(status); + } + + void Resize(const std::vector& dims) { + C_Status status; + PD_TensorSetDims(raw_data(), dims.size(), dims.data(), &status); + PD_CHECK_STATUS(status); + } + + void set_dtype(PD_DataType data_type) { + C_Status status; + PD_TensorSetDataType(raw_data(), data_type, &status); + PD_CHECK_STATUS(status); + } + + void set_layout(PD_DataLayout data_layout) { + C_Status status; + PD_TensorSetDataLayout(raw_data(), data_layout, &status); + PD_CHECK_STATUS(status); + } + + template + T* data() const { + C_Status status; + auto ptr = PD_TensorGetDataPointer(raw_data(), &status); + PD_CHECK_STATUS(status); + return static_cast(ptr); + } + + // template + // T* mutable_data(int64_t size = 0, const PD_DeviceContext* ctx = nullptr) { + // C_Status status; + // auto ptr = PD_DeviceContextAllocateTensor( + // ctx, raw_data(), size, phi::capi::CppTypeToPDType::Type(), + // &status); + // PD_CHECK_STATUS(status); + // return static_cast(ptr); + // } + + // void* mutable_data(PD_DataType data_type, + // int64_t size = 0, + // const PD_DeviceContext* ctx = nullptr) { + // C_Status status; + // auto ptr = PD_DeviceContextAllocateTensor( + // ctx, raw_data(), size, data_type, &status); + // PD_CHECK_STATUS(status); + // return static_cast(ptr); + // } + + DenseTensor& ShareDataWith(const DenseTensor& src) { + C_Status status; + PD_TensorShareDataWith(raw_data(), src.raw_data(), &status); + PD_CHECK_STATUS(status); + return *this; + } + + void share_lod(const DenseTensor& src) { + C_Status status; + PD_TensorShareLoDWith(raw_data(), src.raw_data(), &status); + PD_CHECK_STATUS(status); + } +}; + +class DeviceContext : public WrapperBase { + public: + explicit DeviceContext(PD_DeviceContext* context) + : WrapperBase(context) {} + + void* stream() const { + C_Status status; + auto stream_ = PD_DeviceContextGetStream(raw_data(), &status); + PD_CHECK_STATUS(status); + return stream_; + } + + void* Alloc(DenseTensor* tensor, + PD_DataType dtype, + int64_t requested_size = 0) const { + C_Status status; + auto ptr = PD_DeviceContextAllocateTensor( + raw_data(), tensor->raw_data(), requested_size, dtype, &status); + PD_CHECK_STATUS(status); + return static_cast(ptr); + } + + template + T* Alloc(DenseTensor* tensor, int64_t requested_size = 0) const { + C_Status status; + auto ptr = + PD_DeviceContextAllocateTensor(raw_data(), + tensor->raw_data(), + requested_size, + phi::capi::CppTypeToPDType::Type(), + &status); + PD_CHECK_STATUS(status); + return static_cast(ptr); + } + + void* HostAlloc(DenseTensor* tensor, + PD_DataType dtype, + int64_t requested_size = 0) const { + C_Status status; + auto ptr = PD_DeviceContextAllocateTensor( + nullptr, tensor->raw_data(), requested_size, dtype, &status); + PD_CHECK_STATUS(status); + return static_cast(ptr); + } + + template + T* HostAlloc(DenseTensor* tensor, int64_t requested_size = 0) const { + C_Status status; + auto ptr = + PD_DeviceContextAllocateTensor(nullptr, + 
tensor->raw_data(), + requested_size, + phi::capi::CppTypeToPDType::Type(), + &status); + PD_CHECK_STATUS(status); + return static_cast(ptr); + } +}; + +class Scalar : public WrapperBase { + public: + explicit Scalar(PD_Scalar* scalar) : WrapperBase(scalar) {} + + PD_DataType dtype() const { return PD_ScalarGetDataType(raw_data()); } + + template + inline T to() const; +}; + +template <> +inline bool Scalar::to() const { + return PD_ScalarGetBoolData(raw_data()); +} + +template <> +inline float Scalar::to() const { + return PD_ScalarGetFloat32Data(raw_data()); +} + +template <> +inline double Scalar::to() const { + return PD_ScalarGetFloat64Data(raw_data()); +} + +template <> +inline uint8_t Scalar::to() const { + return PD_ScalarGetUInt8Data(raw_data()); +} + +template <> +inline uint16_t Scalar::to() const { + return PD_ScalarGetUInt16Data(raw_data()); +} + +template <> +inline uint32_t Scalar::to() const { + return PD_ScalarGetUInt32Data(raw_data()); +} + +template <> +inline uint64_t Scalar::to() const { + return PD_ScalarGetUInt64Data(raw_data()); +} + +template <> +inline int8_t Scalar::to() const { + return PD_ScalarGetInt8Data(raw_data()); +} + +template <> +inline int16_t Scalar::to() const { + return PD_ScalarGetInt16Data(raw_data()); +} + +template <> +inline int32_t Scalar::to() const { + return PD_ScalarGetInt32Data(raw_data()); +} + +template <> +inline int64_t Scalar::to() const { + return PD_ScalarGetInt64Data(raw_data()); +} + +class IntArray : WrapperBase { + public: + explicit IntArray(PD_IntArray* int_array) + : WrapperBase(int_array) {} + + size_t size() const { return PD_IntArrayGetElementCount(raw_data()); } + + std::vector GetData() const { + auto list = PD_IntArrayGetDataPointer(raw_data()); + auto data = reinterpret_cast(list.data); + std::vector ret(data, data + list.size); + return ret; + } +}; + +class Place : WrapperBase { + public: + explicit Place(PD_Place* place) : WrapperBase(place) {} + + bool is_host() { return PD_PlaceIsHost(raw_data()); } + + int8_t GetDeviceID() { return PD_PlaceGetDeviceId(raw_data()); } +}; + +class TensorArgDef : WrapperBase { + public: + explicit TensorArgDef(PD_TensorArgDef* tensor_arg_def) + : WrapperBase(tensor_arg_def) {} + + // TensorArgDef& SetBackend() { + // return *this; + // } + + TensorArgDef& SetDataLayout(PD_DataLayout in_layout) { + C_Status status; + PD_TensorArgDefSetDataLayout(raw_data(), in_layout, &status); + PD_CHECK_STATUS(status); + return *this; + } + + TensorArgDef& SetDataType(PD_DataType in_dtype) { + C_Status status; + PD_TensorArgDefSetDataType(raw_data(), in_dtype, &status); + PD_CHECK_STATUS(status); + return *this; + } +}; + +class KernelArgsDef : WrapperBase { + public: + explicit KernelArgsDef(PD_KernelArgsDef* kernel_args_def) + : WrapperBase(kernel_args_def) {} + + std::vector input_defs() { + C_Status status; + auto list = PD_KernelArgsDefGetInputArgDefs(raw_data(), &status); + PD_CHECK_STATUS(status); + auto ptr = reinterpret_cast(list.data); + std::vector ret; + for (size_t i = 0; i < list.size; ++i) { + ret.emplace_back(ptr[i]); + } + PD_DeletePointerList(list); + return ret; + } + + std::vector output_defs() { + C_Status status; + auto list = PD_KernelArgsDefGetOutputArgDefs(raw_data(), &status); + PD_CHECK_STATUS(status); + auto ptr = reinterpret_cast(list.data); + std::vector ret; + for (size_t i = 0; i < list.size; ++i) { + ret.emplace_back(ptr[i]); + } + PD_DeletePointerList(list); + return ret; + } + + // std::vector + // attribute_defs() { + // } +}; + +class KernelKey : WrapperBase { 
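+  // Read-only view over the dispatch key of a registered kernel: exposes the
+  // data type and data layout it was registered for (the backend is carried
+  // by the registration itself and is not surfaced here, cf. the commented
+  // backend() accessor below).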
+ public:
+  explicit KernelKey(PD_KernelKey* kernel_key)
+      : WrapperBase<PD_KernelKey>(kernel_key) {}
+
+  // Backend backend() const { return backend_; }
+  PD_DataLayout layout() const {
+    PD_Status status;
+    auto layout_ = PD_KernelKeyGetLayout(raw_data(), &status);
+    PD_CHECK_STATUS(status);
+    return layout_;
+  }
+
+  PD_DataType dtype() const {
+    PD_Status status;
+    auto dtype_ = PD_KernelKeyGetDataType(raw_data(), &status);
+    PD_CHECK_STATUS(status);
+    return dtype_;
+  }
+};
+
+class Kernel : WrapperBase<PD_Kernel> {
+ public:
+  explicit Kernel(PD_Kernel* kernel) : WrapperBase<PD_Kernel>(kernel) {}
+
+  KernelArgsDef args_def() const {
+    C_Status status;
+    auto ptr = PD_KernelGetArgsDef(raw_data(), &status);
+    PD_CHECK_STATUS(status);
+    return KernelArgsDef(ptr);
+  }
+
+  TensorArgDef InputAt(size_t idx) { return args_def().input_defs()[idx]; }
+
+  TensorArgDef OutputAt(size_t idx) { return args_def().output_defs()[idx]; }
+};
+
+}  // namespace capi
+}  // namespace phi
+
+#endif
diff --git a/paddle/phi/capi/lib/CMakeLists.txt b/paddle/phi/capi/lib/CMakeLists.txt
new file mode 100644
index 0000000000000..de335bb668bdf
--- /dev/null
+++ b/paddle/phi/capi/lib/CMakeLists.txt
@@ -0,0 +1,44 @@
+cc_library(
+  phi_c_data_type
+  SRCS c_data_type.cc
+  DEPS dense_tensor)
+
+cc_library(
+  phi_c_device_context
+  SRCS c_device_context.cc
+  DEPS phi_context)
+
+cc_library(
+  phi_c_int_array
+  SRCS c_int_array.cc
+  DEPS int_array)
+
+cc_library(
+  phi_c_kernel_context
+  SRCS c_kernel_context.cc
+  DEPS kernel_context)
+
+cc_library(
+  phi_c_kernel_factory
+  SRCS c_kernel_factory.cc
+  DEPS kernel_factory)
+
+cc_library(
+  phi_c_kernel_registry
+  SRCS c_kernel_registry.cc
+  DEPS dense_tensor)
+
+cc_library(
+  phi_c_place
+  SRCS c_place.cc
+  DEPS phi_place)
+
+cc_library(
+  phi_c_scalar
+  SRCS c_scalar.cc
+  DEPS scalar)
+
+cc_library(
+  phi_c_tensor
+  SRCS c_tensor.cc
+  DEPS dense_tensor)
diff --git a/paddle/phi/capi/lib/c_data_type.cc b/paddle/phi/capi/lib/c_data_type.cc
new file mode 100644
index 0000000000000..547df06338f0f
--- /dev/null
+++ b/paddle/phi/capi/lib/c_data_type.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
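For orientation, a hedged sketch of how a device plugin would consume the headers above. The kernel body is illustrative, and the `CustomCPU` backend and `ANY` layout tokens are placeholders for whatever the plugin's build registers, not part of this diff:

#include "paddle/phi/capi/all.h"  // umbrella header assumed; adjust to the real include

template <typename T>
void AddKernel(const phi::Context& dev_ctx,
               const phi::DenseTensor& x,
               const phi::DenseTensor& y,
               phi::DenseTensor* out) {
  // phi::Context / phi::DenseTensor are the capi aliases from kernel_registry.h.
  T* out_data = dev_ctx.Alloc<T>(out);
  const T* x_data = x.data<T>();
  const T* y_data = y.data<T>();
  for (int64_t i = 0; i < x.numel(); ++i) {
    out_data[i] = x_data[i] + y_data[i];
  }
}

// Registers AddKernel for float and double. The macro ends with a function
// signature, so the trailing braces are the body of the per-kernel hook that
// receives (kernel_key, kernel) and may tweak arg defs, e.g.
// kernel->OutputAt(0).SetDataType(...).
PD_BUILD_PHI_KERNEL(add, CustomCPU, ANY, AddKernel, float, double) {}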
+ +#include "paddle/phi/capi/include/c_data_type.h" + +#include "paddle/phi/capi/include/common.h" + +void PD_DeletePointerList(PD_List list) { + auto data = reinterpret_cast(list.data); + if (data) delete[] data; +} + +void PD_DeleteUInt8List(PD_List list) { + auto data = reinterpret_cast(list.data); + if (data) delete[] data; +} + +void PD_DeleteInt64List(PD_List list) { + auto data = reinterpret_cast(list.data); + if (data) delete[] data; +} + +void PD_DeleteInt32List(PD_List list) { + auto data = reinterpret_cast(list.data); + delete[] data; +} + +void PD_DeleteFloat64List(PD_List list) { + auto data = reinterpret_cast(list.data); + if (data) delete[] data; +} + +void PD_DeleteFloat32List(PD_List list) { + auto data = reinterpret_cast(list.data); + if (data) delete[] data; +} + +PD_REGISTER_CAPI(data_type); diff --git a/paddle/phi/capi/lib/c_device_context.cc b/paddle/phi/capi/lib/c_device_context.cc new file mode 100644 index 0000000000000..96b46fbc0d4ff --- /dev/null +++ b/paddle/phi/capi/lib/c_device_context.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/capi/include/c_device_context.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/capi/include/common.h" +#include "paddle/phi/capi/include/type_utils.h" +#include "paddle/phi/core/dense_tensor.h" + +PD_Stream PD_DeviceContextGetStream(const PD_DeviceContext* ctx, + PD_Status* status) { + if (status) { + if (!ctx) { + *status = C_FAILED; + return nullptr; + } + *status = C_SUCCESS; + } + auto dev_ctx_type = + reinterpret_cast(ctx)->GetPlace().GetType(); + if (dev_ctx_type == phi::AllocationType::CUSTOM) { + return reinterpret_cast( + reinterpret_cast(ctx)->stream()); + } else if (dev_ctx_type == phi::AllocationType::CPU) { + return nullptr; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + } else if (dev_ctx_type == phi::AllocationType::GPU) { + return reinterpret_cast( + reinterpret_cast(ctx)->stream()); +#endif +#ifdef PADDLE_WITH_XPU + } else if (dev_ctx_type == phi::AllocationType::XPU) { + return nullptr; +#endif + } else { + PADDLE_THROW(phi::errors::Unavailable( + "Only support Custom/CPU/GPU/XPU DeviceContext")); + } +} + +void* PD_DeviceContextAllocateTensor(const PD_DeviceContext* ctx, + PD_Tensor* tensor, + size_t size, + PD_DataType dtype, + PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return nullptr; + } + *status = C_SUCCESS; + } + + auto dev_ctx = reinterpret_cast(ctx); + auto cc_tensor = reinterpret_cast(tensor); + auto phi_dtype = phi::capi::ToPhiDataType(dtype); + if (ctx) { + return dev_ctx->Alloc(cc_tensor, phi_dtype, size); + } else { + auto place = phi::CPUPlace(); + return cc_tensor->mutable_data(place, phi_dtype, size); + } +} + +PD_REGISTER_CAPI(device_context); diff --git a/paddle/phi/capi/lib/c_int_array.cc b/paddle/phi/capi/lib/c_int_array.cc new file mode 100644 index 0000000000000..7562700372c3b --- /dev/null +++ 
b/paddle/phi/capi/lib/c_int_array.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/capi/include/c_int_array.h" + +#include "paddle/phi/capi/include/common.h" +#include "paddle/phi/common/int_array.h" + +PD_List PD_IntArrayGetDataPointer(PD_IntArray* int_array) { + auto cc_int_array = reinterpret_cast(int_array); + const auto& data = cc_int_array->GetData(); + PD_List list; + list.size = data.size(); + list.data = const_cast(data.data()); + return list; +} + +size_t PD_IntArrayGetSize(PD_IntArray* int_array) { + auto cc_int_array = reinterpret_cast(int_array); + return cc_int_array->size(); +} + +PD_REGISTER_CAPI(int_array); diff --git a/paddle/phi/capi/lib/c_kernel_context.cc b/paddle/phi/capi/lib/c_kernel_context.cc new file mode 100644 index 0000000000000..2e14b019c19ff --- /dev/null +++ b/paddle/phi/capi/lib/c_kernel_context.cc @@ -0,0 +1,223 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
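A note on the PD_List ownership contract that these implementation files establish (the `ctx` handle below is hypothetical):

// Owned: pointer lists returned by the list-attribute accessors are heap
// copies, so the header-side wrappers free them after use (see PD_AttrAt in
// kernel_registry.h):
PD_List scalars = PD_KernelContextListScalarAttrAt(ctx, /*index=*/0);
// ... read scalars.data as PD_Scalar ** ...
PD_DeletePointerList(scalars);

// Borrowed: PD_IntArrayGetDataPointer const_casts the IntArray's own storage,
// so its .data must not be freed by the caller.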
+ +#include "paddle/phi/capi/include/c_kernel_context.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/capi/include/common.h" +#include "paddle/phi/capi/include/type_utils.h" +#include "paddle/phi/core/kernel_context.h" + +PD_DeviceContext* PD_KernelContextGetDeviceContext(PD_KernelContext* ctx) { + auto kernel_context = reinterpret_cast(ctx); + auto dev_ctx_type = kernel_context->GetDeviceContext() + .GetPlace() + .GetType(); + if (dev_ctx_type == phi::AllocationType::CUSTOM) { + return reinterpret_cast(const_cast( + &kernel_context->GetDeviceContext())); + } else if (dev_ctx_type == phi::AllocationType::CPU) { + return reinterpret_cast(const_cast( + &kernel_context->GetDeviceContext())); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + } else if (dev_ctx_type == phi::AllocationType::GPU) { + return reinterpret_cast(const_cast( + &kernel_context->GetDeviceContext())); +#endif +#ifdef PADDLE_WITH_XPU + } else if (dev_ctx_type == phi::AllocationType::XPU) { + return reinterpret_cast(const_cast( + &kernel_context->GetDeviceContext())); +#endif + } else { + PADDLE_THROW(phi::errors::Unavailable( + "Only support Custom/CPU/GPU/XPU DeviceContext")); + } +} + +PD_Tensor* PD_KernelContextInputAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + const std::pair& range = kernel_context->InputRangeAt(index); + return reinterpret_cast(const_cast( + &kernel_context->InputAt(range.first))); +} + +PD_List PD_KernelContextMultiInputAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + const std::pair& range = kernel_context->InputRangeAt(index); + auto tensor_vec = kernel_context->InputsBetween( + range.first, range.second); + PD_List list; + list.size = tensor_vec.size(); + list.data = tensor_vec.data(); + return list; +} + +PD_Tensor* PD_KernelContextOutputAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + const std::pair& range = kernel_context->OutputRangeAt(index); + return reinterpret_cast( + kernel_context->MutableOutputAt(range.first)); +} + +PD_List PD_KernelContextMultiOutputAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + const std::pair& range = kernel_context->OutputRangeAt(index); + auto tensor_vec = kernel_context->MutableOutputBetween( + range.first, range.second); + PD_List list; + list.size = tensor_vec.size(); + list.data = tensor_vec.data(); + return list; +} + +bool PD_KernelContextBoolAttrAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return kernel_context->AttrAt(index); +} + +int32_t PD_KernelContextInt32AttrAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return kernel_context->AttrAt(index); +} + +int64_t PD_KernelContextInt64AttrAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return kernel_context->AttrAt(index); +} + +float PD_KernelContextFloatAttrAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return kernel_context->AttrAt(index); +} + +double PD_KernelContextDoubleAttrAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return kernel_context->AttrAt(index); +} + +PD_Scalar* PD_KernelContextScalarAttrAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return reinterpret_cast( + const_cast(&kernel_context->AttrAt(index))); +} + +PD_IntArray* 
PD_KernelContextIntArrayAttrAt(PD_KernelContext* ctx, + size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return reinterpret_cast(const_cast( + &kernel_context->AttrAt(index))); +} + +PD_List PD_KernelContextListBoolAttrAt(PD_KernelContext* ctx, size_t index) { + PD_List list; + auto kernel_context = reinterpret_cast(ctx); + const auto& cc_list = kernel_context->AttrAt>(index); + list.size = cc_list.size(); + auto data = reinterpret_cast(new uint8_t[cc_list.size()]); + for (size_t i = 0; i < cc_list.size(); ++i) { + data[i] = static_cast(cc_list[i]); + } + list.data = data; + return list; +} + +PD_List PD_KernelContextListInt32AttrAt(PD_KernelContext* ctx, size_t index) { + PD_List list; + auto kernel_context = reinterpret_cast(ctx); + const auto& cc_list = kernel_context->AttrAt>(index); + list.size = cc_list.size(); + list.data = const_cast(cc_list.data()); + return list; +} + +PD_List PD_KernelContextListInt64AttrAt(PD_KernelContext* ctx, size_t index) { + PD_List list; + auto kernel_context = reinterpret_cast(ctx); + const auto& cc_list = kernel_context->AttrAt>(index); + list.size = cc_list.size(); + list.data = const_cast(cc_list.data()); + return list; +} + +PD_List PD_KernelContextListFloatAttrAt(PD_KernelContext* ctx, size_t index) { + PD_List list; + auto kernel_context = reinterpret_cast(ctx); + const auto& cc_list = kernel_context->AttrAt>(index); + list.size = cc_list.size(); + list.data = const_cast(cc_list.data()); + return list; +} + +PD_List PD_KernelContextListDoubleAttrAt(PD_KernelContext* ctx, size_t index) { + PD_List list; + auto kernel_context = reinterpret_cast(ctx); + const auto& cc_list = kernel_context->AttrAt>(index); + list.size = cc_list.size(); + list.data = const_cast(cc_list.data()); + return list; +} + +char* PD_KernelContextStringAttrAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return const_cast(kernel_context->AttrAt(index).data()); +} + +PD_List PD_KernelContextListStringAttrAt(PD_KernelContext* ctx, size_t index) { + PD_List list; + auto kernel_context = reinterpret_cast(ctx); + const auto& cc_list = kernel_context->AttrAt>(index); + list.size = cc_list.size(); + auto data = new char*[list.size]; + for (size_t i = 0; i < list.size; ++i) { + data[i] = const_cast(cc_list[i].data()); + } + list.data = reinterpret_cast(data); + return list; +} + +PD_List PD_KernelContextListScalarAttrAt(PD_KernelContext* ctx, size_t index) { + PD_List list; + auto kernel_context = reinterpret_cast(ctx); + const auto& cc_list = kernel_context->AttrAt>(index); + list.size = cc_list.size(); + auto data = new PD_Scalar*[list.size]; + for (size_t i = 0; i < list.size; ++i) { + data[i] = + const_cast(reinterpret_cast(&cc_list[i])); + } + list.data = data; + return list; +} + +PD_Place* PD_KernelContextPlaceAttrAt(PD_KernelContext* ctx, size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return reinterpret_cast( + const_cast(&kernel_context->AttrAt(index))); +} + +PD_DataType PD_KernelContextDataTypeAttrAt(PD_KernelContext* ctx, + size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return phi::capi::ToPDDataType(kernel_context->AttrAt(index)); +} + +PD_DataLayout PD_KernelContextDataLayoutAttrAt(PD_KernelContext* ctx, + size_t index) { + auto kernel_context = reinterpret_cast(ctx); + return phi::capi::ToPDDataLayout( + kernel_context->AttrAt(index)); +} + +PD_REGISTER_CAPI(kernel_context); diff --git a/paddle/phi/capi/lib/c_kernel_factory.cc b/paddle/phi/capi/lib/c_kernel_factory.cc new file 
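Note: attribute accessors take the attribute's position in the registration order, and only the bool-list accessor copies into a fresh heap buffer that the caller must release. A sketch under those assumptions (indices and names illustrative):

```cpp
// Hypothetical sketch: read a few attributes inside a plugin kernel.
#include "paddle/phi/capi/include/c_data_type.h"
#include "paddle/phi/capi/include/c_kernel_context.h"

void ReadAttrs(PD_KernelContext* ctx) {
  bool flag = PD_KernelContextBoolAttrAt(ctx, 0);   // attr #0: bool
  float eps = PD_KernelContextFloatAttrAt(ctx, 1);  // attr #1: float
  // attr #2: list<bool>, copied into a new uint8_t[] by the C API above,
  // so it must be released with the matching deleter.
  PD_List flags = PD_KernelContextListBoolAttrAt(ctx, 2);
  PD_DeleteUInt8List(flags);
  (void)flag;
  (void)eps;
}
```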
diff --git a/paddle/phi/capi/lib/c_kernel_factory.cc b/paddle/phi/capi/lib/c_kernel_factory.cc
new file mode 100644
index 0000000000000..8bf94467b472a
--- /dev/null
+++ b/paddle/phi/capi/lib/c_kernel_factory.cc
@@ -0,0 +1,150 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/capi/include/c_kernel_factory.h"
+
+#include "paddle/phi/capi/include/common.h"
+#include "paddle/phi/capi/include/type_utils.h"
+#include "paddle/phi/core/kernel_factory.h"
+
+/**
+ * TensorArgDef
+ */
+
+void PD_TensorArgDefSetDataLayout(PD_TensorArgDef* def,
+                                  PD_DataLayout layout,
+                                  PD_Status* status) {
+  if (status) {
+    if (!def) {
+      *status = C_FAILED;
+      return;
+    }
+    *status = C_SUCCESS;
+  }
+
+  auto cc_def = reinterpret_cast<phi::TensorArgDef*>(def);
+  cc_def->SetDataLayout(phi::capi::ToPhiDataLayout(layout));
+}
+
+void PD_TensorArgDefSetDataType(PD_TensorArgDef* def,
+                                PD_DataType dtype,
+                                PD_Status* status) {
+  if (status) {
+    if (!def) {
+      *status = C_FAILED;
+      return;
+    }
+    *status = C_SUCCESS;
+  }
+
+  auto cc_def = reinterpret_cast<phi::TensorArgDef*>(def);
+  cc_def->SetDataType(phi::capi::ToPhiDataType(dtype));
+}
+
+/**
+ * KernelArgsDef
+ */
+
+PD_List PD_KernelArgsDefGetInputArgDefs(PD_KernelArgsDef* def,
+                                        PD_Status* status) {
+  PD_List list;
+  if (status) {
+    if (!def) {
+      *status = C_FAILED;
+      list.size = 0;
+      list.data = nullptr;
+      return list;
+    }
+    *status = C_SUCCESS;
+  }
+  auto cc_def = reinterpret_cast<phi::KernelArgsDef*>(def);
+  auto& arg_defs = cc_def->input_defs();
+  list.size = arg_defs.size();
+  auto ptr = new PD_TensorArgDef*[list.size];
+  list.data = ptr;
+  for (size_t i = 0; i < list.size; ++i) {
+    ptr[i] = reinterpret_cast<PD_TensorArgDef*>(&arg_defs[i]);
+  }
+  return list;
+}
+
+PD_List PD_KernelArgsDefGetOutputArgDefs(PD_KernelArgsDef* def,
+                                         PD_Status* status) {
+  PD_List list;
+  if (status) {
+    if (!def) {
+      *status = C_FAILED;
+      list.size = 0;
+      list.data = nullptr;
+      return list;
+    }
+    *status = C_SUCCESS;
+  }
+  auto cc_def = reinterpret_cast<phi::KernelArgsDef*>(def);
+  auto& arg_defs = cc_def->output_defs();
+  list.size = arg_defs.size();
+  auto ptr = new PD_TensorArgDef*[list.size];
+  list.data = ptr;
+  for (size_t i = 0; i < list.size; ++i) {
+    ptr[i] = reinterpret_cast<PD_TensorArgDef*>(&arg_defs[i]);
+  }
+  return list;
+}
+
+/**
+ * KernelKey
+ */
+
+PD_DataLayout PD_KernelKeyGetLayout(PD_KernelKey* key, PD_Status* status) {
+  if (status) {
+    if (!key) {
+      *status = C_FAILED;
+      return PD_DataLayout::ALL_LAYOUT;
+    }
+    *status = C_SUCCESS;
+  }
+  auto cc_key = reinterpret_cast<phi::KernelKey*>(key);
+  return phi::capi::ToPDDataLayout(cc_key->layout());
+}
+
+PD_DataType PD_KernelKeyGetDataType(PD_KernelKey* key, PD_Status* status) {
+  if (status) {
+    if (!key) {
+      *status = C_FAILED;
+      return PD_DataType::UNDEFINED;
+    }
+    *status = C_SUCCESS;
+  }
+  auto cc_key = reinterpret_cast<phi::KernelKey*>(key);
+  return phi::capi::ToPDDataType(cc_key->dtype());
+}
+
+/**
+ * Kernel
+ */
+
+PD_KernelArgsDef* PD_KernelGetArgsDef(PD_Kernel* kernel, PD_Status* status) {
+  if (status) {
+    if (!kernel) {
+      *status = C_FAILED;
+      return nullptr;
+    }
+    *status = C_SUCCESS;
+  }
+  auto cc_kernel = reinterpret_cast<phi::Kernel*>(kernel);
+  return reinterpret_cast<PD_KernelArgsDef*>(
+      const_cast<phi::KernelArgsDef*>(&cc_kernel->args_def()));
+}
+
+PD_REGISTER_CAPI(kernel_factory);
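Note: the arg-def lists returned here are freshly allocated arrays of pointers, so the list itself (not the defs it points at) must be freed with `PD_DeletePointerList`. A sketch of an args-def callback with the signature expected by `PD_RegisterPhiKernel` in the next file; the forced dtype and the `PD_DataType::FLOAT32` enumerator are assumptions:

```cpp
// Hypothetical sketch: an args-def callback that pins input 0 to float32.
#include "paddle/phi/capi/include/c_data_type.h"
#include "paddle/phi/capi/include/c_kernel_factory.h"

void MyArgsDef(const PD_KernelKey* key, PD_Kernel* kernel) {
  PD_Status status = C_SUCCESS;
  PD_KernelArgsDef* def = PD_KernelGetArgsDef(kernel, &status);
  if (status != C_SUCCESS) return;
  PD_List inputs = PD_KernelArgsDefGetInputArgDefs(def, &status);
  auto defs = static_cast<PD_TensorArgDef**>(inputs.data);
  if (status == C_SUCCESS && inputs.size > 0) {
    PD_TensorArgDefSetDataType(defs[0], PD_DataType::FLOAT32, &status);
  }
  PD_DeletePointerList(inputs);  // only the pointer array is owned here
}
```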
diff --git a/paddle/phi/capi/lib/c_kernel_registry.cc b/paddle/phi/capi/lib/c_kernel_registry.cc
new file mode 100644
index 0000000000000..6cf6208856bfa
--- /dev/null
+++ b/paddle/phi/capi/lib/c_kernel_registry.cc
@@ -0,0 +1,174 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/capi/include/c_kernel_registry.h"
+
+#include "paddle/phi/capi/include/common.h"
+#include "paddle/phi/capi/include/type_utils.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+void PD_KernelArgsParseFn(const phi::KernelKey& default_key,
+                          phi::KernelArgsDef* args_def,
+                          size_t in_nargs,
+                          PD_KernelArgumentType* in_args_type,
+                          size_t attr_nargs,
+                          PD_KernelArgumentType* attr_args_type,
+                          size_t out_nargs,
+                          PD_KernelArgumentType* out_args_type) {
+  auto default_tensor_layout = phi::DataLayout::NCHW;
+  if (default_key.layout() != phi::DataLayout::ANY) {
+    default_tensor_layout = default_key.layout();
+  }
+  // inputs
+  for (size_t i = 0; i < in_nargs; ++i) {
+    auto arg_type = in_args_type[i];
+    if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_CONTEXT) {
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_TENSOR) {
+      args_def->AppendInput(default_key.backend(),
+                            default_tensor_layout,
+                            default_key.dtype(),
+                            std::type_index(typeid(const phi::DenseTensor&)));
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_OPTIONAL_TENSOR) {
+      args_def->AppendInput(
+          default_key.backend(),
+          default_tensor_layout,
+          default_key.dtype(),
+          std::type_index(typeid(const paddle::optional<phi::DenseTensor>&)));
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_LIST_TENSOR) {
+      args_def->AppendInput(
+          default_key.backend(),
+          default_tensor_layout,
+          default_key.dtype(),
+          std::type_index(typeid(const std::vector<const phi::DenseTensor*>&)));
+    } else if (arg_type ==
+               PD_KernelArgumentType::PD_ARG_TYPE_OPTIONAL_MULTI_TENSOR) {
+      args_def->AppendInput(
+          default_key.backend(),
+          default_tensor_layout,
+          default_key.dtype(),
+          std::type_index(typeid(
+              const paddle::optional<std::vector<const phi::DenseTensor*>>&)));
+    } else {
+      PADDLE_THROW(phi::errors::Unavailable(
+          "PD_KernelArgumentType %d is not supported.", arg_type));
+    }
+  }
+  // attributes
+  for (size_t i = 0; i < attr_nargs; ++i) {
+    auto arg_type = attr_args_type[i];
+    if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_BOOL) {
+      args_def->AppendAttribute(phi::AttributeType::BOOL);
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_FLOAT32) {
+      args_def->AppendAttribute(phi::AttributeType::FLOAT32);
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_FLOAT64) {
+      args_def->AppendAttribute(phi::AttributeType::FLOAT64);
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_INT32) {
+      args_def->AppendAttribute(phi::AttributeType::INT32);
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_INT64) {
+      args_def->AppendAttribute(phi::AttributeType::INT64);
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_STRING) {
+      args_def->AppendAttribute(phi::AttributeType::STRING);
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_SCALAR) {
+      args_def->AppendAttribute(phi::AttributeType::SCALAR);
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_INT_ARRAY) {
+      args_def->AppendAttribute(phi::AttributeType::INT_ARRAY);
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_DATA_TYPE) {
+      args_def->AppendAttribute(phi::AttributeType::DATA_TYPE);
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_DATA_LAYOUT) {
+      args_def->AppendAttribute(phi::AttributeType::DATA_LAYOUT);
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_PLACE) {
+      args_def->AppendAttribute(phi::AttributeType::PLACE);
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_LIST_BOOL) {
+      args_def->AppendAttribute(phi::AttributeType::BOOLS);
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_LIST_INT32) {
+      args_def->AppendAttribute(phi::AttributeType::INT32S);
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_LIST_INT64) {
+      args_def->AppendAttribute(phi::AttributeType::INT64S);
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_LIST_FLOAT32) {
+      args_def->AppendAttribute(phi::AttributeType::FLOAT32S);
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_LIST_FLOAT64) {
+      args_def->AppendAttribute(phi::AttributeType::FLOAT64S);
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_LIST_STRING) {
+      args_def->AppendAttribute(phi::AttributeType::STRINGS);
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_LIST_SCALAR) {
+      args_def->AppendAttribute(phi::AttributeType::SCALARS);
+    } else {
+      PADDLE_THROW(phi::errors::Unavailable(
+          "PD_KernelArgumentType %d is not supported.", arg_type));
+    }
+  }
+  // outputs
+  for (size_t i = 0; i < out_nargs; ++i) {
+    auto arg_type = out_args_type[i];
+    if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_TENSOR) {
+      args_def->AppendOutput(default_key.backend(),
+                             default_tensor_layout,
+                             default_key.dtype(),
+                             std::type_index(typeid(phi::DenseTensor*)));
+    } else if (arg_type == PD_KernelArgumentType::PD_ARG_TYPE_LIST_TENSOR) {
+      args_def->AppendOutput(
+          default_key.backend(),
+          default_tensor_layout,
+          default_key.dtype(),
+          std::type_index(typeid(std::vector<phi::DenseTensor*>)));
+    } else {
+      PADDLE_THROW(phi::errors::Unavailable(
+          "PD_KernelArgumentType %d is not supported.", arg_type));
+    }
+  }
+}
+
+void PD_RegisterPhiKernel(const char* kernel_name_cstr,
+                          const char* backend_cstr,
+                          PD_DataType pd_dtype,
+                          PD_DataLayout pd_layout,
+                          size_t in_nargs,
+                          PD_KernelArgumentType* in_args_type,
+                          size_t attr_nargs,
+                          PD_KernelArgumentType* attr_args_type,
+                          size_t out_nargs,
+                          PD_KernelArgumentType* out_args_type,
+                          void (*args_def_fn)(const PD_KernelKey*, PD_Kernel*),
+                          void (*fn)(PD_KernelContext*),
+                          void* variadic_kernel_fn) {
+  auto args_def_fn_wrapper = [args_def_fn](const phi::KernelKey& kernel_key,
+                                           phi::Kernel* kernel) {
+    args_def_fn(reinterpret_cast<const PD_KernelKey*>(&kernel_key),
+                reinterpret_cast<PD_Kernel*>(kernel));
+  };
+  phi::KernelFn kernel_fn = [fn](phi::KernelContext* ctx) {
+    fn(reinterpret_cast<PD_KernelContext*>(ctx));
+  };
+  std::string kernel_name(kernel_name_cstr);
+
+  auto dtype = phi::capi::ToPhiDataType(pd_dtype);
+  auto layout = phi::capi::ToPhiDataLayout(pd_layout);
+  phi::KernelKey kernel_key(
+      paddle::experimental::StringToBackend(backend_cstr), layout, dtype);
+
+  phi::Kernel kernel(kernel_fn, variadic_kernel_fn);
+  PD_KernelArgsParseFn(kernel_key,
+                       kernel.mutable_args_def(),
+                       in_nargs,
+                       in_args_type,
+                       attr_nargs,
+                       attr_args_type,
+                       out_nargs,
+                       out_args_type);
+
+  args_def_fn_wrapper(kernel_key, &kernel);
+  phi::KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel;
+}
+
+PD_REGISTER_CAPI(kernel_registry);
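Note: plugins normally reach `PD_RegisterPhiKernel` through a registration macro rather than calling it directly; the sketch below shows roughly what such a call reduces to for a one-in/one-out kernel. The backend string, enumerator names (`PD_DataType::FLOAT32`, `PD_DataLayout::ALL_LAYOUT`), and function names are assumptions for illustration:

```cpp
// Hypothetical sketch: manual registration of a single-tensor kernel.
#include "paddle/phi/capi/include/c_kernel_registry.h"

void MyKernel(PD_KernelContext* ctx);                    // kernel body
void MyArgsDef(const PD_KernelKey* key, PD_Kernel* k);   // see sketch above

void RegisterMyKernel() {
  PD_KernelArgumentType ins[] = {PD_KernelArgumentType::PD_ARG_TYPE_TENSOR};
  PD_KernelArgumentType outs[] = {PD_KernelArgumentType::PD_ARG_TYPE_TENSOR};
  PD_RegisterPhiKernel("my_op", "CPU",
                       PD_DataType::FLOAT32, PD_DataLayout::ALL_LAYOUT,
                       /*in_nargs=*/1, ins,
                       /*attr_nargs=*/0, nullptr,
                       /*out_nargs=*/1, outs,
                       MyArgsDef, MyKernel,
                       /*variadic_kernel_fn=*/nullptr);
}
```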
diff --git a/paddle/phi/capi/lib/c_place.cc b/paddle/phi/capi/lib/c_place.cc
new file mode 100644
index 0000000000000..cccccbbb259f3
--- /dev/null
+++ b/paddle/phi/capi/lib/c_place.cc
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/capi/include/c_place.h"
+
+#include "paddle/phi/capi/include/common.h"
+#include "paddle/phi/common/place.h"
+
+bool PD_PlaceIsHost(PD_Place* place) {
+  auto cc_place = reinterpret_cast<phi::Place*>(place);
+  return cc_place->GetType() == phi::AllocationType::CPU;
+}
+
+int8_t PD_PlaceGetDeviceId(PD_Place* place) {
+  auto cc_place = reinterpret_cast<phi::Place*>(place);
+  return cc_place->GetDeviceId();
+}
+
+PD_REGISTER_CAPI(place);
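Note: a tiny sketch of how these two helpers compose with the kernel-context Place accessor above (function name illustrative):

```cpp
// Hypothetical sketch: branch on a Place attribute inside a plugin kernel.
#include "paddle/phi/capi/include/c_kernel_context.h"
#include "paddle/phi/capi/include/c_place.h"

bool AttrPlaceIsHost(PD_KernelContext* ctx, size_t attr_index) {
  PD_Place* place = PD_KernelContextPlaceAttrAt(ctx, attr_index);
  return PD_PlaceIsHost(place);  // CPU allocation type counts as host
}
```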
+ +#include "paddle/phi/capi/include/c_scalar.h" + +#include "paddle/phi/capi/include/common.h" +#include "paddle/phi/capi/include/type_utils.h" +#include "paddle/phi/common/scalar.h" + +PD_DataType PD_ScalarGetType(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return phi::capi::ToPDDataType(cc_scalar->dtype()); +} + +bool PD_ScalarGetBoolData(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +int8_t PD_ScalarGetInt8Data(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +int16_t PD_ScalarGetInt16Data(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +int32_t PD_ScalarGetInt32Data(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +int64_t PD_ScalarGetInt64Data(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +uint8_t PD_ScalarGetUInt8Data(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +uint16_t PD_ScalarGetUInt16Data(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +uint32_t PD_ScalarGetUInt32Data(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +uint64_t PD_ScalarGetUInt64Data(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +float PD_ScalarGetFloat32Data(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +double PD_ScalarGetFloat64Data(PD_Scalar* scalar) { + auto cc_scalar = reinterpret_cast(scalar); + return cc_scalar->to(); +} + +PD_REGISTER_CAPI(scalar); diff --git a/paddle/phi/capi/lib/c_tensor.cc b/paddle/phi/capi/lib/c_tensor.cc new file mode 100644 index 0000000000000..cd0bbd62d88a0 --- /dev/null +++ b/paddle/phi/capi/lib/c_tensor.cc @@ -0,0 +1,302 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/capi/include/c_tensor.h" + +#include "paddle/phi/capi/include/common.h" +#include "paddle/phi/capi/include/type_utils.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/meta_tensor.h" + +PD_DataType PD_TensorGetDataType(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return PD_DataType::UNDEFINED; + } + *status = C_SUCCESS; + } + auto cc_tensor = reinterpret_cast(tensor); + return phi::capi::ToPDDataType(cc_tensor->dtype()); +} + +PD_DataLayout PD_TensorGetDataLayout(const PD_Tensor* tensor, + PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return PD_DataLayout::ALL_LAYOUT; + } + *status = C_SUCCESS; + } + auto cc_tensor = reinterpret_cast(tensor); + return phi::capi::ToPDDataLayout(cc_tensor->layout()); +} + +int64_t PD_TensorGetByteSize(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->memory_size(); +} + +void* PD_TensorGetDataPointer(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return nullptr; + } + *status = C_SUCCESS; + } + auto cc_tensor = reinterpret_cast(tensor); + return const_cast(cc_tensor->data()); +} + +int64_t PD_TensorGetElementCount(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->numel(); +} + +int64_t PD_TensorGetNumDims(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->dims().size(); +} + +int64_t PD_TensorGetDim(const PD_Tensor* tensor, + size_t index, + PD_Status* status) { + auto cc_tensor = reinterpret_cast(tensor); + + if (status) { + if (!tensor || index >= static_cast(cc_tensor->dims().size())) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + return cc_tensor->dims()[index]; +} + +void PD_TensorGetLoD(const PD_Tensor* tensor, + PD_List* data, + PD_List* offset, + PD_Status* status) { + auto cc_tensor = reinterpret_cast(tensor); + + if (status) { + if (!tensor || !data || !offset) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + + auto lod = cc_tensor->lod(); + offset->size = lod.size() + 1; + auto offset_data = new size_t[offset->size]; + offset->data = offset_data; + offset_data[0] = 0; + + size_t sz = 0; + for (size_t i = 0; i < lod.size(); ++i) { + offset_data[i + 1] = lod[i].size() + offset_data[i]; + sz += lod[i].size(); + } + + auto data_ptr = new size_t[sz]; + data->data = data_ptr; + data->size = sz; + for (size_t i = 0; i < lod.size(); ++i) { + memcpy(data_ptr, lod[i].data(), lod[i].size() * sizeof(size_t)); + data_ptr += lod[i].size(); + } +} + +bool PD_TensorIsInitialized(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return false; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->initialized(); +} + +bool PD_TensorIsValid(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return false; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->valid(); +} + 
+void* PD_TensorGetHolder(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return nullptr; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->Holder().get(); +} + +void PD_TensorSetDims(PD_Tensor* tensor, + int64_t ndims, + const int64_t* dims, + PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + auto cc_tensor = reinterpret_cast(tensor); + std::vector shape(dims, dims + ndims); + cc_tensor->Resize(phi::make_ddim(shape)); +} + +void PD_TensorSetDataType(PD_Tensor* tensor, + PD_DataType dtype, + PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + cc_tensor->set_type(phi::capi::ToPhiDataType(dtype)); +} + +void PD_TensorSetDataLayout(PD_Tensor* tensor, + PD_DataLayout layout, + PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + cc_tensor->set_layout(phi::capi::ToPhiDataLayout(layout)); +} + +void PD_TensorResetLoD(PD_Tensor* tensor, + PD_List data, + PD_List offset, + PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + + phi::LoD lod; + auto offset_ptr = static_cast(offset.data); + auto data_ptr = static_cast(data.data); + + for (size_t i = 0; i < offset.size - 1; ++i) { + lod.emplace_back(data_ptr + offset_ptr[i], data_ptr + offset_ptr[i + 1]); + } + auto cc_tensor = reinterpret_cast(tensor); + cc_tensor->ResetLoD(lod); +} + +PD_Tensor* PD_NewTensor() { + return reinterpret_cast(new phi::DenseTensor()); +} + +void PD_DeleteTensor(PD_Tensor* tensor) { + auto cc_tensor = reinterpret_cast(tensor); + delete cc_tensor; +} + +void PD_TensorShareDataWith(PD_Tensor* dst, + const PD_Tensor* src, + PD_Status* status) { + if (status) { + if (!dst || !src) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + + auto cc_dst_tensor = reinterpret_cast(dst); + auto cc_src_tensor = reinterpret_cast(src); + cc_dst_tensor->ShareDataWith(*cc_src_tensor); +} + +void PD_TensorShareLoDWith(PD_Tensor* dst, + const PD_Tensor* src, + PD_Status* status) { + if (status) { + if (!dst || !src) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + + auto cc_dst_tensor = reinterpret_cast(dst); + auto cc_src_tensor = const_cast( + reinterpret_cast(src)); + + phi::MetaTensor meta_dst(cc_dst_tensor); + const phi::MetaTensor meta_src(cc_src_tensor); + meta_dst.share_lod(meta_src); +} + +PD_REGISTER_CAPI(tensor); diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index daaf88a23950f..3e68462c88a5c 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -768,7 +768,7 @@ __global__ void VectorizedElementwiseKernel( ins, outs, data_offset, read_lens * BLOCK_NUM_X, read_lens, func); } - int remain = numel - data_offset; + kps::IndexType remain = numel - data_offset; if (remain > 0) { VectorizedElementwiseKernelImpl( - ins, outs, data_offset, remain, read_lens, func); + ins, outs, data_offset, static_cast(remain), read_lens, func); } } diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc index 25f222546656f..033c50e537da6 100644 --- a/paddle/phi/kernels/funcs/math_function.cc +++ 
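Note: the `elementwise_base.h` change above widens `remain` because `numel - data_offset` can exceed `INT_MAX` on large tensors, and a 32-bit intermediate silently wraps before the `remain > 0` check. A standalone illustration of the failure mode (not Paddle code):

```cpp
// Why the widened index type matters: with > 2^31 elements, narrowing the
// difference to 32 bits wraps to a wrong (possibly negative) value.
#include <cstdint>
#include <iostream>

int main() {
  int64_t numel = 3'000'000'000;  // more elements than INT32_MAX
  int64_t data_offset = 0;
  int32_t remain32 = static_cast<int32_t>(numel - data_offset);  // wraps
  int64_t remain64 = numel - data_offset;                        // correct
  std::cout << remain32 << " vs " << remain64 << "\n";
  return 0;
}
```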
diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc
index 25f222546656f..033c50e537da6 100644
--- a/paddle/phi/kernels/funcs/math_function.cc
+++ b/paddle/phi/kernels/funcs/math_function.cc
@@ -277,6 +277,12 @@ void set_constant(const paddle::platform::DeviceContext& context,
                   paddle::framework::Tensor* tensor,
                   float value) {
   TensorSetConstantWithPlace func(context, tensor, value);
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+  if (paddle::platform::is_custom_place(context.GetPlace())) {
+    func(phi::CPUPlace());
+    return;
+  }
+#endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // tensor->place().apply_visitor(func);
   paddle::platform::VisitPlace(tensor->place(), func);
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 9e4aac55f5d2d..181e9d92f0166 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -3150,11 +3150,11 @@ function collect_ccache_hits() {
 
 function test_op_benchmark() {
     # The PR will pass quickly when get approval from specific person.
-    # Xreki 12538138, luotao1 6836917, ZzSean 32410583
+    # Xreki 12538138, luotao1 6836917, ZzSean 32410583, JamesLim-sy 61349199
    set +x
    approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000)
    if [ "${approval_line}" != "" ]; then
-        APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32410583 12538138 6836917)
+        APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32410583 12538138 6836917 61349199)
        echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
        if [ "${APPROVALS}" == "TRUE" ]; then
            echo "==================================="
diff --git a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py
index 028fd57229e56..60dfde6b45c37 100644
--- a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py
+++ b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py
@@ -24,6 +24,7 @@
 import os
 import copy
 import numpy as np
+import tempfile
 from paddle.static.amp import decorate
 
 paddle.enable_static()
@@ -272,18 +273,25 @@ def infer(use_cuda, save_dirname=None):
                           clip_extra=True)
 
 
-def main(net_type, use_cuda, is_local=True):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
+class TestImageClassification(unittest.TestCase):
 
-    # Directory for saving the trained model
-    save_dirname = "image_classification_" + net_type + ".inference.model"
+    def setUp(self):
+        self.temp_dir = tempfile.TemporaryDirectory()
 
-    train(net_type, use_cuda, save_dirname, is_local)
-    #infer(use_cuda, save_dirname)
+    def tearDown(self):
+        self.temp_dir.cleanup()
 
+    def main(self, net_type, use_cuda, is_local=True):
+        if use_cuda and not fluid.core.is_compiled_with_cuda():
+            return
 
-class TestImageClassification(unittest.TestCase):
+        # Directory for saving the trained model
+        save_dirname = os.path.join(
+            self.temp_dir.name,
+            "image_classification_" + net_type + ".inference.model")
+
+        train(net_type, use_cuda, save_dirname, is_local)
+        #infer(use_cuda, save_dirname)
 
     def test_amp_lists(self):
         white_list = copy.copy(
@@ -413,11 +421,11 @@ def test_amp_lists_7(self):
 
     def test_vgg_cuda(self):
         with self.scope_prog_guard():
-            main('vgg', use_cuda=True)
+            self.main('vgg', use_cuda=True)
 
     def test_resnet_cuda(self):
         with self.scope_prog_guard():
-            main('resnet', use_cuda=True)
+            self.main('resnet', use_cuda=True)
 
     @contextlib.contextmanager
     def scope_prog_guard(self):
diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py
index 668373838c0b0..71ba7f0c79ec9 100644
--- a/python/paddle/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/test_fit_a_line.py
@@ -25,6 +25,7 @@
 import sys
 import os
 import struct
+import tempfile
 
 paddle.enable_static()
 
@@ -192,11 +193,13 @@ def main(use_cuda, is_local=True, use_bf16=False, pure_bf16=False):
     if use_bf16 and not fluid.core.is_compiled_with_mkldnn():
         return
 
+    temp_dir = tempfile.TemporaryDirectory()
     # Directory for saving the trained model
-    save_dirname = "fit_a_line.inference.model"
+    save_dirname = os.path.join(temp_dir.name, "fit_a_line.inference.model")
 
     train(use_cuda, save_dirname, is_local, use_bf16, pure_bf16)
     infer(use_cuda, save_dirname, use_bf16)
+    temp_dir.cleanup()
 
 
 class TestFitALineBase(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py
index 7096a16d89faf..e2f78a0f36f7b 100644
--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -22,6 +22,7 @@
 import numpy
 import unittest
 import os
+import tempfile
 import numpy as np
 
 paddle.enable_static()
@@ -244,10 +245,13 @@ def main(net_type, use_cuda, is_local=True):
         return
 
     # Directory for saving the trained model
-    save_dirname = "image_classification_" + net_type + ".inference.model"
+    temp_dir = tempfile.TemporaryDirectory()
+    save_dirname = os.path.join(
+        temp_dir.name, "image_classification_" + net_type + ".inference.model")
 
     train(net_type, use_cuda, save_dirname, is_local)
     infer(use_cuda, save_dirname)
+    temp_dir.cleanup()
 
 
 class TestImageClassification(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
index eee1d7959eef7..cb962493e7ac8 100644
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -20,6 +20,7 @@
 import os
 import time
 import unittest
+import tempfile
 
 import paddle
 import paddle.dataset.conll05 as conll05
@@ -354,12 +355,16 @@ def main(use_cuda, is_local=True):
     if use_cuda and not fluid.core.is_compiled_with_cuda():
         return
 
+    temp_dir = tempfile.TemporaryDirectory()
     # Directory for saving the trained model
-    save_dirname = "label_semantic_roles.inference.model"
+    save_dirname = os.path.join(temp_dir.name,
+                                "label_semantic_roles.inference.model")
 
     train(use_cuda, save_dirname, is_local)
     infer(use_cuda, save_dirname)
 
+    temp_dir.cleanup()
+
 
 class TestLabelSemanticRoles(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py
index 8a4b4c2683747..0a26a03eb878b 100644
--- a/python/paddle/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/fluid/tests/book/test_recommender_system.py
@@ -23,6 +23,7 @@
 import paddle.fluid.framework as framework
 import paddle.fluid.layers as layers
 import paddle.fluid.nets as nets
+import tempfile
 from paddle.fluid.executor import Executor
 from paddle.fluid.optimizer import SGDOptimizer
 
@@ -318,10 +319,13 @@ def main(use_cuda):
         return
 
     # Directory for saving the inference model
-    save_dirname = "recommender_system.inference.model"
+    temp_dir = tempfile.TemporaryDirectory()
+    save_dirname = os.path.join(temp_dir.name,
+                                "recommender_system.inference.model")
 
     train(use_cuda, save_dirname)
     infer(use_cuda, save_dirname)
+    temp_dir.cleanup()
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
index 7a31035d2fb22..9499583c07bae 100644
--- a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
+++ b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
@@ -23,7 +23,9 @@
 import contextlib
 import math
 import sys
+import os
 import unittest
+import tempfile
 from paddle.fluid.executor import Executor
 import paddle
@@ -266,10 +268,13 @@ def main(use_cuda):
         return
 
     # Directory for saving the trained model
-    save_dirname = "rnn_encoder_decoder.inference.model"
+    temp_dir = tempfile.TemporaryDirectory()
+    save_dirname = os.path.join(temp_dir.name,
+                                "rnn_encoder_decoder.inference.model")
 
     train(use_cuda, save_dirname)
     infer(use_cuda, save_dirname)
+    temp_dir.cleanup()
 
 
 class TestRnnEncoderDecoder(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/book/test_word2vec_book.py b/python/paddle/fluid/tests/book/test_word2vec_book.py
index 37d5106e8502d..9e79fd3f523f8 100644
--- a/python/paddle/fluid/tests/book/test_word2vec_book.py
+++ b/python/paddle/fluid/tests/book/test_word2vec_book.py
@@ -22,6 +22,7 @@
 import numpy as np
 import math
 import sys
+import tempfile
 
 paddle.enable_static()
 
@@ -247,7 +248,7 @@ def to_infer_tensor(lod_tensor):
     infer_inputs = [to_infer_tensor(t) for t in infer_inputs]
 
     infer_config = fluid.core.NativeConfig()
-    infer_config.model_dir = 'word2vec.inference.model'
+    infer_config.model_dir = save_dirname
     if target == "cuda":
         infer_config.use_gpu = True
         infer_config.device = 0
@@ -273,8 +274,9 @@ def main(target, is_sparse, is_parallel, use_bf16, pure_bf16):
     if use_bf16 and not fluid.core.is_compiled_with_mkldnn():
         return
 
+    temp_dir = tempfile.TemporaryDirectory()
     if not is_parallel:
-        save_dirname = "word2vec.inference.model"
+        save_dirname = os.path.join(temp_dir.name, "word2vec.inference.model")
     else:
         save_dirname = None
 
@@ -290,6 +292,7 @@ def main(target, is_sparse, is_parallel, use_bf16, pure_bf16):
         use_bf16=use_bf16,
         pure_bf16=pure_bf16)
     infer(target, save_dirname)
+    temp_dir.cleanup()
 
 
 FULL_TEST = os.getenv('FULL_TEST',
+ +#include "paddle/phi/core/custom_phi_kernel.h" + +namespace paddle { + +namespace custom_kernel { + +// Here we use dot for test +// This test will fail when this kernel is supported in framework +template +void DotKernel(const phi::Context& dev_ctx, + const phi::DenseTensor& x, + const phi::DenseTensor& y, + phi::DenseTensor* out) { + auto const *x_ptr = x.data(), *x_ptr_ = &x_ptr[0]; + auto const *y_ptr = y.data(), *y_ptr_ = &y_ptr[0]; + T* z = dev_ctx.template Alloc(out); + + // Loop over the total N elements of both operands while sum-reducing every + // B pairs along the way where B is the dimension of the least ordered axis + auto&& d = x.dims(); + auto const N = x.numel(); + auto const B = d[d.size() - 1]; + + for (int j = 0; j < N / B; j++) { + T ss = 0; + for (int i = 0; i < B; i++) ss += (*x_ptr_++) * (*y_ptr_++); + z[j] = ss; + } +} + +} // namespace custom_kernel +} // namespace paddle + +PD_BUILD_PHI_KERNEL( + dot, CPU, ALL_LAYOUT, paddle::custom_kernel::DotKernel, int8_t) {} diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py new file mode 100644 index 0000000000000..a94307161d431 --- /dev/null +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py @@ -0,0 +1,81 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py
new file mode 100644
index 0000000000000..a94307161d431
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from paddle.fluid import core
+from distutils.sysconfig import get_python_lib
+from distutils.core import setup, Extension
+from setuptools.command.build_ext import build_ext
+
+
+# refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes
+# Avoid a gcc warning below:
+# cc1plus: warning: command line option ‘-Wstrict-prototypes’ is valid
+# for C/ObjC but not for C++
+class BuildExt(build_ext):
+
+    def build_extensions(self):
+        if '-Wstrict-prototypes' in self.compiler.compiler_so:
+            self.compiler.compiler_so.remove('-Wstrict-prototypes')
+        super(BuildExt, self).build_extensions()
+
+
+# cc flags
+paddle_extra_compile_args = [
+    '-std=c++14',
+    '-shared',
+    '-fPIC',
+    '-Wno-parentheses',
+    '-DPADDLE_WITH_CUSTOM_KERNEL',
+]
+if core.is_compiled_with_npu():
+    paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0']
+
+# include path
+site_packages_path = get_python_lib()
+paddle_custom_kernel_include = [
+    os.path.join(site_packages_path, 'paddle', 'include'),
+]
+# include path third_party
+compile_third_party_path = os.path.join(os.environ['PADDLE_ROOT'],
+                                        'build/third_party')
+paddle_custom_kernel_include += [
+    os.path.join(compile_third_party_path, 'boost/src/extern_boost'),  # boost
+    os.path.join(compile_third_party_path, 'install/gflags/include'),  # gflags
+    os.path.join(compile_third_party_path, 'install/glog/include'),  # glog
+]
+
+# libs path
+paddle_custom_kernel_library_dir = [
+    os.path.join(site_packages_path, 'paddle', 'fluid'),
+]
+
+# libs
+libs = [':core_avx.so']
+if not core.has_avx_core and core.has_noavx_core:
+    libs = [':core_noavx.so']
+
+custom_kernel_dot_module = Extension(
+    'custom_kernel_dot',
+    sources=['custom_kernel_dot_c.cc'],
+    include_dirs=paddle_custom_kernel_include,
+    library_dirs=paddle_custom_kernel_library_dir,
+    libraries=libs,
+    extra_compile_args=paddle_extra_compile_args)
+
+setup(name='custom_kernel_dot_c',
+      version='1.0',
+      description='custom kernel for compiling',
+      cmdclass={'build_ext': BuildExt},
+      ext_modules=[custom_kernel_dot_module])
diff --git a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py
index d1929fef5cc54..e28bfe00e7c4f 100644
--- a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py
+++ b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py
@@ -56,6 +56,42 @@ def tearDown(self):
         del os.environ['CUSTOM_DEVICE_ROOT']
 
 
+class TestCustomKernelDotC(unittest.TestCase):
+
+    def setUp(self):
+        # compile so and set to current path
+        cur_dir = os.path.dirname(os.path.abspath(__file__))
+
+        # --inplace to place output so file to current dir
+        cmd = 'cd {} && {} custom_kernel_dot_c_setup.py build_ext --inplace'.format(
+            cur_dir, sys.executable)
+        os.system(cmd)
+
+        # set environment for loading and registering compiled custom kernels
+        # only valid in current process
+        os.environ['CUSTOM_DEVICE_ROOT'] = cur_dir
+
+    def test_custom_kernel_dot_run(self):
+        # test dot run
+        x_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8)
+        y_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8)
+        result = np.sum(x_data * y_data, axis=1).reshape([2, 1])
+
+        import paddle
+        paddle.set_device('cpu')
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        out = paddle.dot(x, y)
+
+        self.assertTrue(
+            np.array_equal(out.numpy(), result),
+            "custom kernel dot out: {},\n numpy dot out: {}".format(
+                out.numpy(), result))
+
+    def tearDown(self):
+        del os.environ['CUSTOM_DEVICE_ROOT']
+
+
 if __name__ == '__main__':
     if os.name == 'nt' or sys.platform.startswith('darwin'):
         # only support Linux now
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py
index 78078963a7dea..ff0b11128a4f0 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py
@@ -15,6 +15,7 @@
 import os
 import unittest
 import numpy as np
+import tempfile
 
 import paddle
 from paddle import nn
@@ -73,6 +74,9 @@ def forward(self, x):
 
 class TestDygraphModel(unittest.TestCase):
 
+    def tearDown(self):
+        self.temp_dir.cleanup()
+
     def setUp(self):
         self.seed = 2021
 
@@ -93,8 +97,12 @@ def setUp(self):
         self.devices = ['cpu', 'gpu'] if not IS_MAC else ['cpu']
 
         # for saving model
-        self.model_path_template = "infer_model/custom_relu_dygaph_model_{}.pdparams"
-        self.model_dy2stat_path = "infer_model/custom_relu_model_dy2sta"
+        self.temp_dir = tempfile.TemporaryDirectory()
+        self.model_save_dir = os.path.join(self.temp_dir.name, 'infer_model')
+        self.model_path_template = os.path.join(
+            self.model_save_dir, 'custom_relu_dygaph_model_{}.pdparams')
+        self.model_dy2stat_path = os.path.join(
+            self.model_save_dir, 'infer_model/custom_relu_model_dy2sta')
 
         # for dy2stat
         self.x_spec = paddle.static.InputSpec(shape=[None, self.in_dim],
@@ -210,12 +218,16 @@ def setUp(self):
         self.devices = ['cpu', 'gpu'] if not IS_MAC else ['cpu']
 
         # for saving model
-        self.model_path_template = "infer_model/custom_relu_static_model_{}_{}"
+        self.temp_dir = tempfile.TemporaryDirectory()
+        self.model_save_dir = os.path.join(self.temp_dir.name, 'infer_model')
+        self.model_path_template = os.path.join(
+            self.model_save_dir, 'custom_relu_static_model_{}_{}')
 
         paddle.enable_static()
 
     def tearDown(self):
         paddle.disable_static()
+        self.temp_dir.cleanup()
 
     def test_train_eval(self):
         for device in self.devices:
diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py
index 29433b17153f5..1a53bf3354f36 100644
--- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py
+++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py
@@ -18,6 +18,7 @@
 import unittest
 import paddle
 import paddle.static as static
+import tempfile
 import subprocess
 import numpy as np
 from paddle.vision.transforms import Compose, Normalize
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py
index a7532ff3e7376..82b73609b2e11 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py
@@ -30,43 +30,61 @@ def sample_program_configs(self):
 
         def generate_input1(dims, batch, attrs: List[Dict[str, Any]]):
             if dims == 1:
-                return np.ones([32]).astype(np.float32)
+                return np.random.random([32]).astype(np.float32)
             elif dims == 2:
-                return np.ones([3, 32]).astype(np.float32)
+                return np.random.random([3, 32]).astype(np.float32)
             elif dims == 3:
-                return np.ones([3, 32, 32]).astype(np.float32)
+                return np.random.random([3, 32, 32]).astype(np.float32)
             else:
-                return np.ones([batch, 3, 32, 32]).astype(np.float32)
+                return np.random.random([batch, 3, 32, 32]).astype(np.float32)
 
         for dims in [1, 2, 3, 4]:
             for batch in [1, 4]:
-                for op_type in ["relu", "sigmoid", "tanh", "relu6"]:
-                    self.dims = dims
-                    dics = [{}]
-
-                    ops_config = [{
-                        "op_type": op_type,
-                        "op_inputs": {
-                            "X": ["input_data"]
-                        },
-                        "op_outputs": {
-                            "Out": ["output_data"]
-                        },
-                        "op_attrs": dics[0]
-                    }]
-                    ops = self.generate_op_config(ops_config)
-
-                    program_config = ProgramConfig(
-                        ops=ops,
-                        weights={},
-                        inputs={
-                            "input_data":
-                            TensorConfig(data_gen=partial(
-                                generate_input1, dims, batch, dics))
-                        },
-                        outputs=["output_data"])
-
-                    yield program_config
+                for op_type in [
+                        "relu", "sigmoid", "tanh", "relu6", "elu", "selu",
+                        "softsign", "stanh", "thresholded_relu", "softplus"
+                ]:
+                    # few samples to reduce time
+                    #for beta in [-0.2, 0.5, 0.67, 3]:
+                    #    for alpha in [-0.2, 0.5, 0.67, 3]:
+                    for beta in [0.67]:
+                        for alpha in [0.67]:
+                            self.dims = dims
+                            dics = [{}]
+                            if op_type == "elu":
+                                dics = [{"alpha": alpha}]
+                            if op_type == "selu":
+                                dics = [{"alpha": beta, "scale": alpha}]
+                            if op_type == "stanh":
+                                dics = [{"scale_a": beta, "scale_b": alpha}]
+                            if op_type == "thresholded_relu":
+                                dics = [{"threshold": alpha}]
+                            if op_type == "softplus":
+                                dics = [{"beta": beta}]
+
+                            ops_config = [{
+                                "op_type": op_type,
+                                "op_inputs": {
+                                    "X": ["input_data"]
+                                },
+                                "op_outputs": {
+                                    "Out": ["output_data"]
+                                },
+                                "op_attrs": dics[0]
+                            }]
+                            ops = self.generate_op_config(ops_config)
+
+                            program_config = ProgramConfig(
+                                ops=ops,
+                                weights={},
+                                inputs={
+                                    "input_data":
+                                    TensorConfig(data_gen=partial(
+                                        generate_input1, dims, batch, dics))
+                                },
+                                outputs=["output_data"])
+
+                            yield program_config
 
     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_bilinear_interp_v2.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_bilinear_interp_v2.py
new file mode 100644
index 0000000000000..3fe041db9333e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_bilinear_interp_v2.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons
+from program_config import TensorConfig, ProgramConfig
+import numpy as np
+import paddle.inference as paddle_infer
+from functools import partial
+from typing import Optional, List, Callable, Dict, Any, Set
+import unittest
+
+
+class TrtConvertBilinearInterpV2Test(TrtLayerAutoScanTest):
+
+    def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        inputs = program_config.inputs
+        weights = program_config.weights
+        attrs = [
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
+        ]
+
+        return True
+
+    def sample_program_configs(self):
+
+        def generate_input1(attrs: List[Dict[str, Any]]):
+            return np.ones([1, 3, 64, 64]).astype(np.float32)
+
+        def generate_input2(attrs: List[Dict[str, Any]]):
+            return np.random.uniform(low=0.5, high=6.0,
+                                     size=(2)).astype("float32")
+
+        for data_layout in ["NCHW", "NHWC"]:
+            for scale_y in [2.0, -1.0, 0.0]:
+                for scale_x in [2.0, -1.0, 0.0]:
+                    scale = [scale_y, scale_x]
+                    for out_h in [32, 64, 128, 192]:
+                        for out_w in [32, 64]:
+                            dics = [{
+                                "data_layout": data_layout,
+                                "interp_method": "bilinear",
+                                "align_corners": False,
+                                "align_mode": 0,
+                                "scale": scale,
+                                "out_h": out_h,
+                                "out_w": out_w
+                            }]
+
+                            ops_config = [{
+                                "op_type": "bilinear_interp_v2",
+                                "op_inputs": {
+                                    "X": ["input_data"],
+                                    "Scale": ["input_scale"]
+                                },
+                                "op_outputs": {
+                                    "Out": ["bilinear_interp_v2_output_data"]
+                                },
+                                "op_attrs": dics[0]
+                            }]
+                            ops = self.generate_op_config(ops_config)
+
+                            program_config = ProgramConfig(
+                                ops=ops,
+                                weights={
+                                    "input_scale":
+                                    TensorConfig(
+                                        data_gen=partial(generate_input2, dics))
+                                },
+                                inputs={
+                                    "input_data":
+                                    TensorConfig(
+                                        data_gen=partial(generate_input1, dics))
+                                },
+                                outputs=["bilinear_interp_v2_output_data"])
+
+                            yield program_config
+
+    def sample_predictor_configs(
+            self, program_config) -> (paddle_infer.Config, List[int], float):
+
+        def generate_dynamic_shape(attrs):
+            self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 64, 64]}
+            self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]}
+            self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]}
+
+        def clear_dynamic_shape():
+            self.dynamic_shape.min_input_shape = {}
+            self.dynamic_shape.max_input_shape = {}
+            self.dynamic_shape.opt_input_shape = {}
+
+        def generate_trt_nodes_num(attrs, dynamic_shape):
+            return 1, 2
+
+        attrs = [
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
+        ]
+
+        # for static_shape
+        clear_dynamic_shape()
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, False), 1e-5
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, False), 1e-2
+
+        # for dynamic_shape
+        generate_dynamic_shape(attrs)
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-5
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, True), 1e-2
+
+    def test(self):
+        self.run_test()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py
index 9948b29321dc0..a53b61a00727b 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py
@@ -77,7 +77,9 @@ def clear_dynamic_shape():
             self.dynamic_shape.opt_input_shape = {}
 
         def generate_trt_nodes_num(attrs, dynamic_shape):
-            if dynamic_shape == True:
+            ver = paddle_infer.get_trt_compile_version()
+            if ver[0] * 1000 + ver[1] * 100 + ver[
+                    2] * 10 < 8000 and dynamic_shape == True:
                 return 0, 3
             else:
                 return 1, 2
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
index 83a50c2a4472d..bf33d5532014f 100755
--- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
@@ -17,8 +17,10 @@
 paddle.set_default_dtype("float64")
 from paddle.fluid.layers import sequence_mask
 
+import os
 import numpy as np
 import unittest
+import tempfile
 
 from convert import convert_params_for_net
 from rnn_numpy import SimpleRNN, LSTM, GRU
@@ -336,16 +338,18 @@ def forward(self, input):
     rnn = paddle.jit.to_static(
         rnn, [paddle.static.InputSpec(shape=[None, None, 16], dtype=x.dtype)])
-    paddle.jit.save(rnn, "./inference/%s_infer" % mode)
+    temp_dir = tempfile.TemporaryDirectory()
+    save_dirname = os.path.join(temp_dir.name, "./inference/%s_infer" % mode)
+
+    paddle.jit.save(rnn, save_dirname)
 
     paddle.enable_static()
 
     new_scope = paddle.static.Scope()
     with paddle.static.scope_guard(new_scope):
         exe = paddle.static.Executor(place)
-        [inference_program, feed_target_names, fetch_targets
-         ] = paddle.static.load_inference_model("./inference/%s_infer" % mode,
-                                                exe)
+        [inference_program, feed_target_names,
+         fetch_targets] = paddle.static.load_inference_model(save_dirname, exe)
         results = exe.run(inference_program,
                           feed={feed_target_names[0]: x.numpy()},
                           fetch_list=fetch_targets)
@@ -353,6 +357,8 @@ def forward(self, input):
             y.numpy(), results[0])  # eval results equal predict results
     paddle.disable_static()
 
+    temp_dir.cleanup()
+
 
 def load_tests(loader, tests, pattern):
     suite = unittest.TestSuite()
diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py
index e31baf9fe2e70..86e23c79d07a4 100644
--- a/python/paddle/fluid/tests/unittests/test_dataset.py
+++ b/python/paddle/fluid/tests/unittests/test_dataset.py
@@ -172,8 +172,12 @@ def test_set_download_cmd(self):
         """
         Testcase for InMemoryDataset from create to run.
         """
-        filename1 = "afs:test_in_memory_dataset_run_a.txt"
-        filename2 = "afs:test_in_memory_dataset_run_b.txt"
+        temp_dir = tempfile.TemporaryDirectory()
+        filename1 = os.path.join(temp_dir.name,
+                                 "afs:test_in_memory_dataset_run_a.txt")
+        filename2 = os.path.join(temp_dir.name,
+                                 "afs:test_in_memory_dataset_run_b.txt")
+
         with open(filename1, "w") as f:
             data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
             data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
             data += "1 3 2 3 5 4 7 7 7 7 1 3\n"
             f.write(data)
@@ -223,19 +227,24 @@ def test_set_download_cmd(self):
         except Exception as e:
             self.assertTrue(False)
 
-        os.remove(filename1)
-        os.remove(filename2)
+        temp_dir.cleanup()
 
     def test_in_memory_dataset_run(self):
         """
         Testcase for InMemoryDataset from create to run.
""" - with open("test_in_memory_dataset_run_a.txt", "w") as f: + temp_dir = tempfile.TemporaryDirectory() + filename1 = os.path.join(temp_dir.name, + "test_in_memory_dataset_run_a.txt") + filename2 = os.path.join(temp_dir.name, + "test_in_memory_dataset_run_b.txt") + + with open(filename1, "w") as f: data = "1 1 2 3 3 4 5 5 5 5 1 1\n" data += "1 2 2 3 4 4 6 6 6 6 1 2\n" data += "1 3 2 3 5 4 7 7 7 7 1 3\n" f.write(data) - with open("test_in_memory_dataset_run_b.txt", "w") as f: + with open(filename2, "w") as f: data = "1 4 2 3 3 4 5 5 5 5 1 4\n" data += "1 5 2 3 4 4 6 6 6 6 1 5\n" data += "1 6 2 3 5 4 7 7 7 7 1 6\n" @@ -257,10 +266,7 @@ def test_in_memory_dataset_run(self): pipe_command="cat", use_var=slots_vars) dataset._init_distributed_settings(fea_eval=True, candidate_size=1) - dataset.set_filelist([ - "test_in_memory_dataset_run_a.txt", - "test_in_memory_dataset_run_b.txt" - ]) + dataset.set_filelist([filename1, filename2]) dataset.load_into_memory() dataset.slots_shuffle(["slot1"]) dataset.local_shuffle() @@ -282,14 +288,19 @@ def test_in_memory_dataset_run(self): except Exception as e: self.assertTrue(False) - os.remove("./test_in_memory_dataset_run_a.txt") - os.remove("./test_in_memory_dataset_run_b.txt") + temp_dir.cleanup() def test_in_memory_dataset_masterpatch(self): """ Testcase for InMemoryDataset from create to run. """ - with open("test_in_memory_dataset_masterpatch_a.txt", "w") as f: + temp_dir = tempfile.TemporaryDirectory() + filename1 = os.path.join(temp_dir.name, + "test_in_memory_dataset_masterpatch_a.txt") + filename2 = os.path.join(temp_dir.name, + "test_in_memory_dataset_masterpatch_b.txt") + + with open(filename1, "w") as f: data = "1 id1 1 1 2 3 3 4 5 5 5 5 1 1\n" data += "1 id1 1 2 2 3 4 4 6 6 6 6 1 2\n" data += "1 id2 1 1 1 1 1 0 1 0\n" @@ -300,7 +311,7 @@ def test_in_memory_dataset_masterpatch(self): data += "1 id5 1 1 1 1 1 0 1 0\n" data += "1 id5 1 1 1 1 1 0 1 0\n" f.write(data) - with open("test_in_memory_dataset_masterpatch_b.txt", "w") as f: + with open(filename2, "w") as f: data = "1 id6 1 4 2 3 3 4 5 5 5 5 1 4\n" data += "1 id6 1 1 2 3 4 4 6 6 6 6 1 5\n" data += "1 id6 1 6 2 3 5 4 7 7 7 7 1 6\n" @@ -353,14 +364,19 @@ def test_in_memory_dataset_masterpatch(self): dataset.update_settings(merge_size=2) dataset.dataset.merge_by_lineid() - os.remove("./test_in_memory_dataset_masterpatch_a.txt") - os.remove("./test_in_memory_dataset_masterpatch_b.txt") + temp_dir.cleanup() def test_in_memory_dataset_masterpatch1(self): """ Testcase for InMemoryDataset from create to run. 
""" - with open("test_in_memory_dataset_masterpatch1_a.txt", "w") as f: + temp_dir = tempfile.TemporaryDirectory() + filename1 = os.path.join(temp_dir.name, + "test_in_memory_dataset_masterpatch1_a.txt") + filename2 = os.path.join(temp_dir.name, + "test_in_memory_dataset_masterpatch1_b.txt") + + with open(filename1, "w") as f: data = "1 id1 1 1 2 3 3 4 5 5 5 5 1 1\n" data += "1 id1 1 2 2 3 4 4 6 6 6 6 1 2\n" data += "1 id2 1 1 1 1 1 0 1 0\n" @@ -371,7 +387,7 @@ def test_in_memory_dataset_masterpatch1(self): data += "1 id5 1 1 1 1 1 0 1 0\n" data += "1 id5 1 1 1 1 1 0 1 0\n" f.write(data) - with open("test_in_memory_dataset_masterpatch1_b.txt", "w") as f: + with open(filename2, "w") as f: data = "1 id6 1 4 2 3 3 4 5 5 5 5 1 4\n" data += "1 id6 1 1 2 3 4 4 6 6 6 6 1 5\n" data += "1 id6 1 6 2 3 5 4 7 7 7 7 1 6\n" @@ -427,8 +443,7 @@ def test_in_memory_dataset_masterpatch1(self): dataset._set_merge_by_lineid(2) dataset.dataset.merge_by_lineid() - os.remove("./test_in_memory_dataset_masterpatch1_a.txt") - os.remove("./test_in_memory_dataset_masterpatch1_b.txt") + temp_dir.cleanup() def test_in_memory_dataset_run_2(self): """ @@ -436,12 +451,18 @@ def test_in_memory_dataset_run_2(self): Use CUDAPlace Use float type id """ - with open("test_in_memory_dataset_run_a.txt", "w") as f: + temp_dir = tempfile.TemporaryDirectory() + filename1 = os.path.join(temp_dir.name, + "test_in_memory_dataset_run_a.txt") + filename2 = os.path.join(temp_dir.name, + "test_in_memory_dataset_run_b.txt") + + with open(filename1, "w") as f: data = "1 1 2 3 3 4 5 5 5 5 1 1\n" data += "1 2 2 3 4 4 6 6 6 6 1 2\n" data += "1 3 2 3 5 4 7 7 7 7 1 3\n" f.write(data) - with open("test_in_memory_dataset_run_b.txt", "w") as f: + with open(filename2, "w") as f: data = "1 4 2 3 3 4 5 5 5 5 1 4\n" data += "1 5 2 3 4 4 6 6 6 6 1 5\n" data += "1 6 2 3 5 4 7 7 7 7 1 6\n" @@ -462,10 +483,7 @@ def test_in_memory_dataset_run_2(self): thread_num=3, pipe_command="cat", use_var=slots_vars) - dataset.set_filelist([ - "test_in_memory_dataset_run_a.txt", - "test_in_memory_dataset_run_b.txt" - ]) + dataset.set_filelist([filename1, filename2]) dataset.load_into_memory() dataset.local_shuffle() @@ -540,19 +558,22 @@ def test_in_memory_dataset_run_2(self): fleet_ptr.set_client2client_config(1, 1, 1) fleet_ptr.get_cache_threshold(0) - os.remove("./test_in_memory_dataset_run_a.txt") - os.remove("./test_in_memory_dataset_run_b.txt") + temp_dir.cleanup() def test_queue_dataset_run(self): """ Testcase for QueueDataset from create to run. 
""" - with open("test_queue_dataset_run_a.txt", "w") as f: + temp_dir = tempfile.TemporaryDirectory() + filename1 = os.path.join(temp_dir.name, "test_queue_dataset_run_a.txt") + filename2 = os.path.join(temp_dir.name, "test_queue_dataset_run_b.txt") + + with open(filename1, "w") as f: data = "1 1 2 3 3 4 5 5 5 5 1 1\n" data += "1 2 2 3 4 4 6 6 6 6 1 2\n" data += "1 3 2 3 5 4 7 7 7 7 1 3\n" f.write(data) - with open("test_queue_dataset_run_b.txt", "w") as f: + with open(filename2, "w") as f: data = "1 4 2 3 3 4 5 5 5 5 1 4\n" data += "1 5 2 3 4 4 6 6 6 6 1 5\n" data += "1 6 2 3 5 4 7 7 7 7 1 6\n" @@ -573,8 +594,7 @@ def test_queue_dataset_run(self): thread_num=3, pipe_command="cat", use_var=slots_vars) - dataset.set_filelist( - ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"]) + dataset.set_filelist([filename1, filename2]) exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_startup_program()) @@ -605,10 +625,7 @@ def test_queue_dataset_run(self): except Exception as e: self.assertTrue(False) - if os.path.exists("./test_queue_dataset_run_a.txt"): - os.remove("./test_queue_dataset_run_a.txt") - if os.path.exists("./test_queue_dataset_run_b.txt"): - os.remove("./test_queue_dataset_run_b.txt") + temp_dir.cleanup() def test_queue_dataset_run_2(self): """ @@ -616,12 +633,16 @@ def test_queue_dataset_run_2(self): Use CUDAPlace Use float type id """ - with open("test_queue_dataset_run_a.txt", "w") as f: + temp_dir = tempfile.TemporaryDirectory() + filename1 = os.path.join(temp_dir.name, "test_queue_dataset_run_a.txt") + filename2 = os.path.join(temp_dir.name, "test_queue_dataset_run_b.txt") + + with open(filename1, "w") as f: data = "1 1 2 3 3 4 5 5 5 5 1 1\n" data += "1 2 2 3 4 4 6 6 6 6 1 2\n" data += "1 3 2 3 5 4 7 7 7 7 1 3\n" f.write(data) - with open("test_queue_dataset_run_b.txt", "w") as f: + with open(filename2, "w") as f: data = "1 4 2 3 3 4 5 5 5 5 1 4\n" data += "1 5 2 3 4 4 6 6 6 6 1 5\n" data += "1 6 2 3 5 4 7 7 7 7 1 6\n" @@ -642,8 +663,7 @@ def test_queue_dataset_run_2(self): thread_num=3, pipe_command="cat", use_var=slots_vars) - dataset.set_filelist( - ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"]) + dataset.set_filelist([filename1, filename2]) exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) @@ -662,10 +682,7 @@ def test_queue_dataset_run_2(self): except Exception as e: self.assertTrue(False) - if os.path.exists("./test_queue_dataset_run_a.txt"): - os.remove("./test_queue_dataset_run_a.txt") - if os.path.exists("./test_queue_dataset_run_b.txt"): - os.remove("./test_queue_dataset_run_b.txt") + temp_dir.cleanup() def test_queue_dataset_run_3(self): """ @@ -673,13 +690,17 @@ def test_queue_dataset_run_3(self): Use CUDAPlace Use float type id """ - with open("test_queue_dataset_run_a.txt", "w") as f: + temp_dir = tempfile.TemporaryDirectory() + filename1 = os.path.join(temp_dir.name, "test_queue_dataset_run_a.txt") + filename2 = os.path.join(temp_dir.name, "test_queue_dataset_run_b.txt") + + with open(filename1, "w") as f: data = "2 1 2 2 5 4 2 2 7 2 1 3\n" data += "2 6 2 2 1 4 2 2 4 2 2 3\n" data += "2 5 2 2 9 9 2 2 7 2 1 3\n" data += "2 7 2 2 1 9 2 3 7 2 5 3\n" f.write(data) - with open("test_queue_dataset_run_b.txt", "w") as f: + with open(filename2, "w") as f: data = "2 1 2 2 5 4 2 2 7 2 1 3\n" data += "2 6 2 2 1 4 2 2 4 2 2 3\n" data += "2 5 2 2 9 9 2 2 7 2 1 3\n" @@ -701,8 +722,7 @@ def test_queue_dataset_run_3(self): input_type=1, pipe_command="cat", use_var=slots_vars) - 
dataset.set_filelist( - ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"]) + dataset.set_filelist([filename1, filename2]) dataset.load_into_memory() exe = fluid.Executor(fluid.CPUPlace( @@ -722,10 +742,7 @@ def test_queue_dataset_run_3(self): except Exception as e: self.assertTrue(False) - if os.path.exists("./test_queue_dataset_run_a.txt"): - os.remove("./test_queue_dataset_run_a.txt") - if os.path.exists("./test_queue_dataset_run_b.txt"): - os.remove("./test_queue_dataset_run_b.txt") + temp_dir.cleanup() class TestDatasetWithDataLoader(TestDataset): @@ -789,12 +806,18 @@ def setUp(self): """ Test Dataset With Fetch Handler. TestCases. """ - with open("test_queue_dataset_run_a.txt", "w") as f: + self.temp_dir = tempfile.TemporaryDirectory() + self.filename1 = os.path.join(self.temp_dir.name, + "test_queue_dataset_run_a.txt") + self.filename2 = os.path.join(self.temp_dir.name, + "test_queue_dataset_run_b.txt") + + with open(self.filename1, "w") as f: data = "1 1 2 3 3 4 5 5 5 5 1 1\n" data += "1 2 2 3 4 4 6 6 6 6 1 2\n" data += "1 3 2 3 5 4 7 7 7 7 1 3\n" f.write(data) - with open("test_queue_dataset_run_b.txt", "w") as f: + with open(self.filename2, "w") as f: data = "1 4 2 3 3 4 5 5 5 5 1 4\n" data += "1 5 2 3 4 4 6 6 6 6 1 5\n" data += "1 6 2 3 5 4 7 7 7 7 1 6\n" @@ -805,15 +828,14 @@ def tearDown(self): """ Test Dataset With Fetch Handler. TestCases. """ - os.remove("./test_queue_dataset_run_a.txt") - os.remove("./test_queue_dataset_run_b.txt") + self.temp_dir.cleanup() def test_dataset_none(self): """ Test Dataset With Fetch Handler. TestCases. """ slots_vars, out = self.net() - files = ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"] + files = [self.filename1, self.filename2] dataset = self.get_dataset(slots_vars, files) exe = fluid.Executor(fluid.CPUPlace()) @@ -835,7 +857,7 @@ def test_infer_from_dataset(self): Test Dataset With Fetch Handler. TestCases. """ slots_vars, out = self.net() - files = ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"] + files = [self.filename1, self.filename2] dataset = self.get_dataset(slots_vars, files) exe = fluid.Executor(fluid.CPUPlace()) @@ -853,7 +875,7 @@ def test_fetch_handler(self): Test Dataset With Fetch Handler. TestCases. """ slots_vars, out = self.net() - files = ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"] + files = [self.filename1, self.filename2] dataset = self.get_dataset(slots_vars, files) exe = fluid.Executor(fluid.CPUPlace()) @@ -888,15 +910,20 @@ def test_dataset_fleet(self): """ Testcase for InMemoryDataset from create to run. 
""" + temp_dir = tempfile.TemporaryDirectory() + filename1 = os.path.join(temp_dir.name, + "test_in_memory_dataset2_run_a.txt") + filename2 = os.path.join(temp_dir.name, + "test_in_memory_dataset2_run_b.txt") self.skipTest("parameter server will add pslib UT later") - with open("test_in_memory_dataset2_run_a.txt", "w") as f: + with open(filename1, "w") as f: data = "1 1 2 3 3 4 5 5 5 5 1 1\n" data += "1 2 2 3 4 4 6 6 6 6 1 2\n" data += "1 3 2 3 5 4 7 7 7 7 1 3\n" f.write(data) - with open("test_in_memory_dataset2_run_b.txt", "w") as f: + with open(filename2, "w") as f: data = "1 4 2 3 3 4 5 5 5 5 1 4\n" data += "1 5 2 3 4 4 6 6 6 6 1 5\n" data += "1 6 2 3 5 4 7 7 7 7 1 6\n" @@ -939,27 +966,29 @@ def test_dataset_fleet(self): thread_num=3, pipe_command="cat", use_var=slots_vars) - dataset.set_filelist([ - "test_in_memory_dataset2_run_a.txt", - "test_in_memory_dataset2_run_b.txt" - ]) + dataset.set_filelist([filename1, filename2]) dataset.load_into_memory() fleet._opt_info = None fleet._fleet_ptr = None - os.remove("./test_in_memory_dataset2_run_a.txt") - os.remove("./test_in_memory_dataset2_run_b.txt") + temp_dir.cleanup() def test_dataset_fleet2(self): """ Testcase for InMemoryDataset from create to run. """ - with open("test_in_memory_dataset2_run2_a.txt", "w") as f: + temp_dir = tempfile.TemporaryDirectory() + filename1 = os.path.join(temp_dir.name, + "test_in_memory_dataset2_run2_a.txt") + filename2 = os.path.join(temp_dir.name, + "test_in_memory_dataset2_run2_b.txt") + + with open(filename1, "w") as f: data = "1 1 2 3 3 4 5 5 5 5 1 1\n" data += "1 2 2 3 4 4 6 6 6 6 1 2\n" data += "1 3 2 3 5 4 7 7 7 7 1 3\n" f.write(data) - with open("test_in_memory_dataset2_run2_b.txt", "w") as f: + with open(filename2, "w") as f: data = "1 4 2 3 3 4 5 5 5 5 1 4\n" data += "1 5 2 3 4 4 6 6 6 6 1 5\n" data += "1 6 2 3 5 4 7 7 7 7 1 6\n" @@ -1011,10 +1040,7 @@ def test_dataset_fleet2(self): thread_num=3, pipe_command="cat", use_var=slots_vars) - dataset.set_filelist([ - "test_in_memory_dataset2_run2_a.txt", - "test_in_memory_dataset2_run2_b.txt" - ]) + dataset.set_filelist([filename1, filename2]) dataset.load_into_memory() try: dataset.global_shuffle(fleet) @@ -1073,19 +1099,24 @@ def test_dataset_fleet2(self): except: print("warning: catch expected error") - os.remove("./test_in_memory_dataset2_run2_a.txt") - os.remove("./test_in_memory_dataset2_run2_b.txt") + temp_dir.cleanup() def test_bosps_dataset_fleet2(self): """ Testcase for InMemoryDataset from create to run. 
""" - with open("test_in_memory_dataset2_run2_a.txt", "w") as f: + temp_dir = tempfile.TemporaryDirectory() + filename1 = os.path.join(temp_dir.name, + "test_in_memory_dataset2_run2_a.txt") + filename2 = os.path.join(temp_dir.name, + "test_in_memory_dataset2_run2_b.txt") + + with open(filename1, "w") as f: data = "1 1 2 3 3 4 5 5 5 5 1 1\n" data += "1 2 2 3 4 4 6 6 6 6 1 2\n" data += "1 3 2 3 5 4 7 7 7 7 1 3\n" f.write(data) - with open("test_in_memory_dataset2_run2_b.txt", "w") as f: + with open(filename2, "w") as f: data = "1 4 2 3 3 4 5 5 5 5 1 4\n" data += "1 5 2 3 4 4 6 6 6 6 1 5\n" data += "1 6 2 3 5 4 7 7 7 7 1 6\n" @@ -1137,10 +1168,7 @@ def test_bosps_dataset_fleet2(self): thread_num=3, pipe_command="cat", use_var=slots_vars) - dataset.set_filelist([ - "test_in_memory_dataset2_run2_a.txt", - "test_in_memory_dataset2_run2_b.txt" - ]) + dataset.set_filelist([filename1, filename2]) dataset.load_into_memory() try: dataset.global_shuffle(fleet) @@ -1190,6 +1218,7 @@ def test_bosps_dataset_fleet2(self): #dataset.get_pv_data_size() dataset.get_memory_data_size() dataset.get_shuffle_data_size() + temp_dir.cleanup() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py b/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py index 8d949bf51a7da..8c9be45707f29 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py +++ b/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py @@ -18,6 +18,7 @@ import six import os import unittest +import tempfile from simple_nets import simple_fc_net_with_inputs BATCH_SIZE = 32 @@ -27,8 +28,6 @@ IMAGE_SHAPE = [2, 3] LABEL_SHAPE = [1] -ALL_WRITTEN_FILES = set() - def get_place_string(p): if isinstance(p, (fluid.CPUPlace or fluid.CUDAPlace)): @@ -42,13 +41,7 @@ def get_place_string(p): return 'CUDAPlace()' -def remove_all_written_files(): - for filename in ALL_WRITTEN_FILES: - os.remove(filename) - - def write_reader_data_to_file(filename, reader): - ALL_WRITTEN_FILES.add(filename) with open(filename, 'w') as fid: for instance_list in reader(): for i, instance in enumerate(instance_list): @@ -81,10 +74,10 @@ class DatasetLoaderTestBase(unittest.TestCase): def setUp(self): self.dataset_name = "QueueDataset" self.drop_last = False + self.temp_dir = tempfile.TemporaryDirectory() def tearDown(self): - return - remove_all_written_files() + self.temp_dir.cleanup() def build_network(self): main_prog = fluid.Program() @@ -129,7 +122,8 @@ def check_batch_number(self, place, randomize_batch_num=False): random_delta_batch_size = np.zeros(shape=[file_num]) for i in six.moves.range(file_num): - filename = 'dataset_test_{}.txt'.format(i) + filename = os.path.join(self.temp_dir.name, + 'dataset_test_{}.txt'.format(i)) filelist.append(filename) write_reader_data_to_file( filename, @@ -214,6 +208,7 @@ class QueueDatasetTestWithoutDropLast(DatasetLoaderTestBase): def setUp(self): self.dataset_name = "QueueDataset" self.drop_last = True + self.temp_dir = tempfile.TemporaryDirectory() class InMemoryDatasetTestWithoutDropLast(DatasetLoaderTestBase): @@ -221,6 +216,7 @@ class InMemoryDatasetTestWithoutDropLast(DatasetLoaderTestBase): def setUp(self): self.dataset_name = "InMemoryDataset" self.drop_last = False + self.temp_dir = tempfile.TemporaryDirectory() class InMemoryDatasetTestWithDropLast(DatasetLoaderTestBase): @@ -228,6 +224,7 @@ class InMemoryDatasetTestWithDropLast(DatasetLoaderTestBase): def setUp(self): self.dataset_name = "InMemoryDataset" self.drop_last = True + self.temp_dir = 
tempfile.TemporaryDirectory() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_gpu_package_without_gpu_device.py b/python/paddle/fluid/tests/unittests/test_gpu_package_without_gpu_device.py index d4dc21e7646d6..30a86d02f3142 100644 --- a/python/paddle/fluid/tests/unittests/test_gpu_package_without_gpu_device.py +++ b/python/paddle/fluid/tests/unittests/test_gpu_package_without_gpu_device.py @@ -19,19 +19,27 @@ import subprocess import unittest import paddle +import tempfile import paddle.fluid as fluid from paddle.fluid import core class TestGPUPackagePaddle(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def test_import_paddle(self): if core.is_compiled_with_cuda(): if core.is_compiled_with_rocm(): os.environ['HIP_VISIBLE_DEVICES'] = '' else: os.environ['CUDA_VISIBLE_DEVICES'] = '' - test_file = 'test_no_gpu_run_rand.py' + test_file = os.path.join(self.temp_dir.name, + 'test_no_gpu_run_rand.py') with open(test_file, 'w') as wb: cmd_test = """ import paddle diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index e9266a4643292..7310d19a522ff 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -17,6 +17,9 @@ import paddle.fluid as fluid import numpy as np import six +import cv2 +import os +import tempfile from test_imperative_resnet import ResNet, BottleneckBlock, ConvBNLayer, train_parameters, optimizer_setting import paddle.nn as nn from paddle.static import InputSpec @@ -737,6 +740,12 @@ def func_isinstance(): class TestPureFp16SaveLoad(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def test_save_dtype_exception(self): def func(): @@ -848,7 +857,7 @@ def train_resnet(self, 'opt': optimizer.state_dict(), 'scaler': scaler.state_dict() } - path = 'model.pdparams' + path = os.path.join(self.temp_dir.name, 'model.pdparams') paddle.save(obj, path) # paddle.load obj_load = paddle.load(path) @@ -888,6 +897,12 @@ def func_isinstance(): class TestPureFp16InferenceSaveLoad(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def inference_save_load(self): BATCH_SIZE = 16 BATCH_NUM = 4 @@ -951,7 +966,7 @@ def train(layer, loader, loss_fn, opt): train(layer, loader, loss_fn, adam) # save - path = "example_model/linear" + path = os.path.join(self.temp_dir.name, 'example_model/linear') paddle.jit.save(layer, path, input_spec=[InputSpec(shape=[IMAGE_SIZE], name='x')]) diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler.py b/python/paddle/fluid/tests/unittests/test_newprofiler.py index 0143bdb53242c..99097aaf0048e 100755 --- a/python/paddle/fluid/tests/unittests/test_newprofiler.py +++ b/python/paddle/fluid/tests/unittests/test_newprofiler.py @@ -17,7 +17,7 @@ import unittest import numpy as np import tempfile - +import os import paddle import paddle.profiler as profiler import paddle.profiler.utils as utils @@ -28,13 +28,19 @@ class TestProfiler(unittest.TestCase): + def tearDown(self): + self.temp_dir.cleanup() + def test_profiler(self): def my_trace_back(prof): - profiler.export_chrome_tracing('./test_profiler_chrometracing/')( prof) -
profiler.export_protobuf('./test_profiler_pb/')(prof) + path = os.path.join(self.temp_dir.name, + './test_profiler_chrometracing') + profiler.export_chrome_tracing(path)(prof) + path = os.path.join(self.temp_dir.name, './test_profiler_pb') + profiler.export_protobuf(path)(prof) + self.temp_dir = tempfile.TemporaryDirectory() x_value = np.random.randn(2, 3, 3) x = paddle.to_tensor(x_value, stop_gradient=False, @@ -135,9 +141,10 @@ def my_sheduler1(num_step): paddle.grad(outputs=y, inputs=[x], grad_outputs=ones_like_y) prof.step() - prof.export(path='./test_profiler_pb.pb', format='pb') + path = os.path.join(self.temp_dir.name, './test_profiler_pb.pb') + prof.export(path=path, format='pb') prof.summary() - result = profiler.utils.load_profiler_result('./test_profiler_pb.pb') + result = profiler.utils.load_profiler_result(path) prof = None dataset = RandomDataset(10 * 4) simple_net = SimpleNet() diff --git a/python/paddle/fluid/tests/unittests/test_ops_nms.py b/python/paddle/fluid/tests/unittests/test_ops_nms.py index 54ea804cdbd9b..c775a47bd2472 100644 --- a/python/paddle/fluid/tests/unittests/test_ops_nms.py +++ b/python/paddle/fluid/tests/unittests/test_ops_nms.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import unittest import numpy as np import paddle from test_nms_op import nms +import tempfile def _find(condition): @@ -79,6 +81,11 @@ def setUp(self): self.devices = ['cpu'] if paddle.is_compiled_with_cuda(): self.devices.append('gpu') + self.temp_dir = tempfile.TemporaryDirectory() + self.path = os.path.join(self.temp_dir.name, './net') + + def tearDown(self): + self.temp_dir.cleanup() def test_nms(self): for device in self.devices: @@ -169,7 +176,6 @@ def fun(x): categories, 10) return out - path = "./net" boxes = np.random.rand(64, 4).astype('float32') boxes[:, 2] = boxes[:, 0] + boxes[:, 2] boxes[:, 3] = boxes[:, 1] + boxes[:, 3] @@ -177,14 +183,14 @@ def fun(x): origin = fun(paddle.to_tensor(boxes)) paddle.jit.save( fun, - path, + self.path, input_spec=[ paddle.static.InputSpec(shape=[None, 4], dtype='float32', name='x') ], ) - load_func = paddle.jit.load(path) + load_func = paddle.jit.load(self.path) res = load_func(paddle.to_tensor(boxes)) self.assertTrue( np.array_equal(origin, res), diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py index 0eec7633a2ec1..4f5cfba0c1ab3 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler.py +++ b/python/paddle/fluid/tests/unittests/test_profiler.py @@ -18,6 +18,7 @@ import os import tempfile import numpy as np +import paddle import paddle.utils as utils import paddle.fluid as fluid import paddle.fluid.profiler as profiler @@ -205,4 +206,5 @@ def test_errors(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_translated_layer.py b/python/paddle/fluid/tests/unittests/test_translated_layer.py index 4b0be989efe48..ba44c78f2c74d 100644 --- a/python/paddle/fluid/tests/unittests/test_translated_layer.py +++ b/python/paddle/fluid/tests/unittests/test_translated_layer.py @@ -16,6 +16,8 @@ import unittest import numpy as np +import tempfile +import os import paddle import paddle.nn as nn import paddle.optimizer as opt @@ -76,6 +78,9 @@ def train(layer, loader, loss_fn, opt): class TestTranslatedLayer(unittest.TestCase): + def tearDown(self): + self.temp_dir.cleanup() + def setUp(self): # enable dygraph mode place 
= paddle.CPUPlace() @@ -100,11 +105,14 @@ def setUp(self): drop_last=True, num_workers=0) + self.temp_dir = tempfile.TemporaryDirectory() + # train train(self.layer, self.loader, self.loss_fn, self.sgd) # save - self.model_path = "linear.example.model" + self.model_path = os.path.join(self.temp_dir.name, + './linear.example.model') paddle.jit.save(self.layer, self.model_path) def test_inference_and_fine_tuning(self): diff --git a/python/paddle/fluid/tests/unittests/test_triplet_margin_loss.py b/python/paddle/fluid/tests/unittests/test_triplet_margin_loss.py new file mode 100644 index 0000000000000..745cb6a178032 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_triplet_margin_loss.py @@ -0,0 +1,395 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import numpy as np +import unittest + + +def call_TripletMarginLoss_layer( + input, + positive, + negative, + p=2, + margin=0.3, + swap=False, + eps=1e-6, + reduction='mean', +): + triplet_margin_loss = paddle.nn.TripletMarginLoss(p=p, + epsilon=eps, + margin=margin, + swap=swap, + reduction=reduction) + res = triplet_margin_loss( + input=input, + positive=positive, + negative=negative, + ) + return res + + +def call_TripletMarginLoss_functional( + input, + positive, + negative, + p=2, + margin=0.3, + swap=False, + eps=1e-6, + reduction='mean', +): + res = paddle.nn.functional.triplet_margin_loss(input=input, + positive=positive, + negative=negative, + p=p, + epsilon=eps, + margin=margin, + swap=swap, + reduction=reduction) + return res + + +def test_static(place, + input_np, + positive_np, + negative_np, + p=2, + margin=0.3, + swap=False, + eps=1e-6, + reduction='mean', + functional=False): + prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(prog, startup_prog): + input = paddle.static.data(name='input', + shape=input_np.shape, + dtype='float64') + positive = paddle.static.data(name='positive', + shape=positive_np.shape, + dtype='float64') + negative = paddle.static.data(name='negative', + shape=negative_np.shape, + dtype='float64') + feed_dict = { + "input": input_np, + "positive": positive_np, + "negative": negative_np + } + + if functional: + res = call_TripletMarginLoss_functional(input=input, + positive=positive, + negative=negative, + p=p, + eps=eps, + margin=margin, + swap=swap, + reduction=reduction) + else: + res = call_TripletMarginLoss_layer(input=input, + positive=positive, + negative=negative, + p=p, + eps=eps, + margin=margin, + swap=swap, + reduction=reduction) + + exe = paddle.static.Executor(place) + static_result = exe.run(prog, feed=feed_dict, fetch_list=[res]) + return static_result + + +def test_dygraph(place, + input, + positive, + negative, + p=2, + margin=0.3, + swap=False, + eps=1e-6, + reduction='mean', + functional=False): + paddle.disable_static() + input = paddle.to_tensor(input) + positive = paddle.to_tensor(positive) + negative = paddle.to_tensor(negative) + + if 
functional: + dy_res = call_TripletMarginLoss_functional(input=input, + positive=positive, + negative=negative, + p=p, + eps=eps, + margin=margin, + swap=swap, + reduction=reduction) + else: + dy_res = call_TripletMarginLoss_layer(input=input, + positive=positive, + negative=negative, + p=p, + eps=eps, + margin=margin, + swap=swap, + reduction=reduction) + dy_result = dy_res.numpy() + paddle.enable_static() + return dy_result + + +def calc_triplet_margin_loss( + input, + positive, + negative, + p=2, + margin=0.3, + swap=False, + reduction='mean', +): + positive_dist = np.linalg.norm((input - positive), p, axis=1) + negative_dist = np.linalg.norm((input - negative), p, axis=1) + + if swap: + swap_dist = np.linalg.norm((positive - negative), p, axis=1) + negative_dist = np.minimum(negative_dist, swap_dist) + expected = np.maximum(positive_dist - negative_dist + margin, 0) + + if reduction == 'mean': + expected = np.mean(expected) + elif reduction == 'sum': + expected = np.sum(expected) + else: + expected = expected + + return expected + + +class TestTripletMarginLoss(unittest.TestCase): + + def test_TripletMarginLoss(self): + shape = (2, 2) + input = np.random.uniform(0.1, 0.8, size=shape).astype(np.float64) + positive = np.random.uniform(0, 2, size=shape).astype(np.float64) + negative = np.random.uniform(0, 2, size=shape).astype(np.float64) + + places = [paddle.CPUPlace()] + if paddle.device.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + reductions = ['sum', 'mean', 'none'] + for place in places: + for reduction in reductions: + expected = calc_triplet_margin_loss(input=input, + positive=positive, + negative=negative, + reduction=reduction) + + dy_result = test_dygraph( + place=place, + input=input, + positive=positive, + negative=negative, + reduction=reduction, + ) + + static_result = test_static( + place=place, + input_np=input, + positive_np=positive, + negative_np=negative, + reduction=reduction, + ) + self.assertTrue(np.allclose(static_result, expected)) + self.assertTrue(np.allclose(static_result, dy_result)) + self.assertTrue(np.allclose(dy_result, expected)) + static_functional = test_static(place=place, + input_np=input, + positive_np=positive, + negative_np=negative, + reduction=reduction, + functional=True) + dy_functional = test_dygraph(place=place, + input=input, + positive=positive, + negative=negative, + reduction=reduction, + functional=True) + self.assertTrue(np.allclose(static_functional, expected)) + self.assertTrue(np.allclose(static_functional, dy_functional)) + self.assertTrue(np.allclose(dy_functional, expected)) + + def test_TripletMarginLoss_error(self): + paddle.disable_static() + self.assertRaises(ValueError, + paddle.nn.loss.TripletMarginLoss, + reduction="unsupport reduction") + input = paddle.to_tensor([[0.1, 0.3]], dtype='float32') + positive = paddle.to_tensor([[0.0, 1.0]], dtype='float32') + negative = paddle.to_tensor([[0.2, 0.1]], dtype='float32') + self.assertRaises(ValueError, + paddle.nn.functional.triplet_margin_loss, + input=input, + positive=positive, + negative=negative, + reduction="unsupport reduction") + paddle.enable_static() + + def test_TripletMarginLoss_dimension(self): + paddle.disable_static() + + input = paddle.to_tensor([[0.1, 0.3], [1, 2]], dtype='float32') + positive = paddle.to_tensor([[0.0, 1.0]], dtype='float32') + negative = paddle.to_tensor([[0.2, 0.1]], dtype='float32') + self.assertRaises( + ValueError, + paddle.nn.functional.triplet_margin_loss, + input=input, + positive=positive, + negative=negative, + ) + 
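# Reference for the expected values checked in this file (mirrors
# calc_triplet_margin_loss above): the per-sample loss is
#     L = max(||a - p||_p - ||a - n||_p + margin, 0)
# and with swap=True the negative distance is replaced by
#     min(||a - n||_p, ||p - n||_p).
# Worked example with p=2 and margin=0.3: a=(0, 0), positive=(0, 1),
# negative=(0, 2) gives d_ap=1.0, d_an=2.0,
# so L = max(1.0 - 2.0 + 0.3, 0) = 0.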
TMLoss = paddle.nn.loss.TripletMarginLoss() + self.assertRaises( + ValueError, + TMLoss, + input=input, + positive=positive, + negative=negative, + ) + paddle.enable_static() + + def test_TripletMarginLoss_swap(self): + reduction = 'mean' + place = paddle.CPUPlace() + shape = (2, 2) + input = np.random.uniform(0.1, 0.8, size=shape).astype(np.float64) + positive = np.random.uniform(0, 2, size=shape).astype(np.float64) + negative = np.random.uniform(0, 2, size=shape).astype(np.float64) + expected = calc_triplet_margin_loss(input=input, + swap=True, + positive=positive, + negative=negative, + reduction=reduction) + + dy_result = test_dygraph( + place=place, + swap=True, + input=input, + positive=positive, + negative=negative, + reduction=reduction, + ) + + static_result = test_static( + place=place, + swap=True, + input_np=input, + positive_np=positive, + negative_np=negative, + reduction=reduction, + ) + self.assertTrue(np.allclose(static_result, expected)) + self.assertTrue(np.allclose(static_result, dy_result)) + self.assertTrue(np.allclose(dy_result, expected)) + static_functional = test_static(place=place, + swap=True, + input_np=input, + positive_np=positive, + negative_np=negative, + reduction=reduction, + functional=True) + dy_functional = test_dygraph(place=place, + swap=True, + input=input, + positive=positive, + negative=negative, + reduction=reduction, + functional=True) + self.assertTrue(np.allclose(static_functional, expected)) + self.assertTrue(np.allclose(static_functional, dy_functional)) + self.assertTrue(np.allclose(dy_functional, expected)) + + def test_TripletMarginLoss_margin(self): + paddle.disable_static() + + input = paddle.to_tensor([[0.1, 0.3]], dtype='float32') + positive = paddle.to_tensor([[0.0, 1.0]], dtype='float32') + negative = paddle.to_tensor([[0.2, 0.1]], dtype='float32') + margin = -0.5 + self.assertRaises( + ValueError, + paddle.nn.functional.triplet_margin_loss, + margin=margin, + input=input, + positive=positive, + negative=negative, + ) + paddle.enable_static() + + def test_TripletMarginLoss_p(self): + p = 3 + shape = (2, 2) + reduction = 'mean' + place = paddle.CPUPlace() + input = np.random.uniform(0.1, 0.8, size=shape).astype(np.float64) + positive = np.random.uniform(0, 2, size=shape).astype(np.float64) + negative = np.random.uniform(0, 2, size=shape).astype(np.float64) + expected = calc_triplet_margin_loss(input=input, + p=p, + positive=positive, + negative=negative, + reduction=reduction) + + dy_result = test_dygraph( + place=place, + p=p, + input=input, + positive=positive, + negative=negative, + reduction=reduction, + ) + + static_result = test_static( + place=place, + p=p, + input_np=input, + positive_np=positive, + negative_np=negative, + reduction=reduction, + ) + self.assertTrue(np.allclose(static_result, expected)) + self.assertTrue(np.allclose(static_result, dy_result)) + self.assertTrue(np.allclose(dy_result, expected)) + static_functional = test_static(place=place, + p=p, + input_np=input, + positive_np=positive, + negative_np=negative, + reduction=reduction, + functional=True) + dy_functional = test_dygraph(place=place, + p=p, + input=input, + positive=positive, + negative=negative, + reduction=reduction, + functional=True) + self.assertTrue(np.allclose(static_functional, expected)) + self.assertTrue(np.allclose(static_functional, dy_functional)) + self.assertTrue(np.allclose(dy_functional, expected)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 
c78c89964c92e..b5662f9ecf4f9 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -934,89 +934,91 @@ class Model(object): Args: network (paddle.nn.Layer): The network is an instance of paddle.nn.Layer. - inputs (InputSpec|list|tuple|dict|None): `inputs`, entry points of network, + inputs (InputSpec|list|tuple|dict|None, optional): `inputs`, entry points of network, could be a InputSpec instance, or list/tuple of InputSpec instances, or dict ({name: InputSpec}), and it couldn't be None in static - graph. - labels (InputSpec|list|tuple|None): `labels`, entry points of network, + graph. Default: None. + labels (InputSpec|list|tuple|None, optional): `labels`, entry points of network, could be a InputSpec instnace or list/tuple of InputSpec instances, or None. For static graph, if labels is required in loss, - labels must be set. Otherwise, it could be None. + labels must be set. Otherwise, it could be None. Default: None. Examples: 1. A common example .. code-block:: python + :name: code-example1 - import paddle - import paddle.nn as nn - import paddle.vision.transforms as T - from paddle.static import InputSpec - - device = paddle.set_device('cpu') # or 'gpu' - - net = nn.Sequential( - nn.Flatten(1), - nn.Linear(784, 200), - nn.Tanh(), - nn.Linear(200, 10)) - - # inputs and labels are not required for dynamic graph. - input = InputSpec([None, 784], 'float32', 'x') - label = InputSpec([None, 1], 'int64', 'label') - - model = paddle.Model(net, input, label) - optim = paddle.optimizer.SGD(learning_rate=1e-3, - parameters=model.parameters()) - - model.prepare(optim, + import paddle + import paddle.nn as nn + import paddle.vision.transforms as T + from paddle.static import InputSpec + + device = paddle.set_device('cpu') # or 'gpu' + + net = nn.Sequential( + nn.Flatten(1), + nn.Linear(784, 200), + nn.Tanh(), + nn.Linear(200, 10)) + + # inputs and labels are not required for dynamic graph. + input = InputSpec([None, 784], 'float32', 'x') + label = InputSpec([None, 1], 'int64', 'label') + + model = paddle.Model(net, input, label) + optim = paddle.optimizer.SGD(learning_rate=1e-3, + parameters=model.parameters()) + + model.prepare(optim, paddle.nn.CrossEntropyLoss(), paddle.metric.Accuracy()) - - transform = T.Compose([ - T.Transpose(), - T.Normalize([127.5], [127.5]) - ]) - data = paddle.vision.datasets.MNIST(mode='train', transform=transform) - model.fit(data, epochs=2, batch_size=32, verbose=1) + + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + data = paddle.vision.datasets.MNIST(mode='train', transform=transform) + model.fit(data, epochs=2, batch_size=32, verbose=1) 2. An example using mixed precision training. .. 
code-block:: python - - # required: gpu - import paddle - import paddle.nn as nn - import paddle.vision.transforms as T + :name: code-example2 - def run_example_code(): - device = paddle.set_device('gpu') + # required: gpu + import paddle + import paddle.nn as nn + import paddle.vision.transforms as T - net = nn.Sequential(nn.Flatten(1), nn.Linear(784, 200), nn.Tanh(), - nn.Linear(200, 10)) + def run_example_code(): + device = paddle.set_device('gpu') - model = paddle.Model(net) - optim = paddle.optimizer.SGD(learning_rate=1e-3, parameters=model.parameters()) + net = nn.Sequential(nn.Flatten(1), nn.Linear(784, 200), nn.Tanh(), + nn.Linear(200, 10)) - amp_configs = { - "level": "O1", - "custom_white_list": {'conv2d'}, - "use_dynamic_loss_scaling": True - } - model.prepare(optim, - paddle.nn.CrossEntropyLoss(), - paddle.metric.Accuracy(), - amp_configs=amp_configs) + model = paddle.Model(net) + optim = paddle.optimizer.SGD(learning_rate=1e-3, parameters=model.parameters()) - transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])]) - data = paddle.vision.datasets.MNIST(mode='train', transform=transform) - model.fit(data, epochs=2, batch_size=32, verbose=1) + amp_configs = { + "level": "O1", + "custom_white_list": {'conv2d'}, + "use_dynamic_loss_scaling": True + } + model.prepare(optim, + paddle.nn.CrossEntropyLoss(), + paddle.metric.Accuracy(), + amp_configs=amp_configs) - # mixed precision training is only supported on GPU now. - if paddle.is_compiled_with_cuda(): - run_example_code() + transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])]) + data = paddle.vision.datasets.MNIST(mode='train', transform=transform) + model.fit(data, epochs=2, batch_size=32, verbose=1) + + # mixed precision training is only supported on GPU now. + if paddle.is_compiled_with_cuda(): + run_example_code() """ @@ -1059,12 +1061,12 @@ def train_batch(self, inputs, labels=None, update=True): inputs (numpy.ndarray|Tensor|list): Batch of input data. It could be a numpy array or paddle.Tensor, or a list of arrays or tensors (in case the model has multiple inputs). - labels (numpy.ndarray|Tensor|list): Batch of labels. It could be + labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be a numpy array or paddle.Tensor, or a list of arrays or tensors (in case the model has multiple labels). If has no labels, - set None. Default is None. - update (bool): Whether update parameters after loss.backward() computing. - Using it to accumulate gradients. Default is True. + set None. Default: None. + update (bool, optional): Whether update parameters after loss.backward() computing. + Set it to False to accumulate gradients. Default: True. Returns: A list of scalar training loss if the model has no metrics, @@ -1074,29 +1076,30 @@ def train_batch(self, inputs, labels=None, update=True): Examples: .. 
code-block:: python + :name: code-example-train-batch - import numpy as np - import paddle - import paddle.nn as nn - from paddle.static import InputSpec - - device = paddle.set_device('cpu') # or 'gpu' - - net = nn.Sequential( - nn.Linear(784, 200), - nn.Tanh(), - nn.Linear(200, 10)) - - input = InputSpec([None, 784], 'float32', 'x') - label = InputSpec([None, 1], 'int64', 'label') - model = paddle.Model(net, input, label) - optim = paddle.optimizer.SGD(learning_rate=1e-3, - parameters=model.parameters()) - model.prepare(optim, paddle.nn.CrossEntropyLoss()) - data = np.random.random(size=(4,784)).astype(np.float32) - label = np.random.randint(0, 10, size=(4, 1)).astype(np.int64) - loss = model.train_batch([data], [label]) - print(loss) + import paddle + import paddle.nn as nn + from paddle.static import InputSpec + + device = paddle.set_device('cpu') # or 'gpu' + + net = nn.Sequential( + nn.Linear(784, 200), + nn.Tanh(), + nn.Linear(200, 10)) + + input = InputSpec([None, 784], 'float32', 'x') + label = InputSpec([None, 1], 'int64', 'label') + model = paddle.Model(net, input, label) + optim = paddle.optimizer.SGD(learning_rate=1e-3, + parameters=model.parameters()) + model.prepare(optim, paddle.nn.CrossEntropyLoss()) + data = paddle.rand((4, 784), dtype="float32") + label = paddle.randint(0, 10, (4, 1), dtype="int64") + loss = model.train_batch([data], [label]) + print(loss) + # [array([2.192784], dtype=float32)] """ loss = self._adapter.train_batch(inputs, labels, update) if fluid._non_static_mode() and self._input_info is None: @@ -1112,10 +1115,10 @@ def eval_batch(self, inputs, labels=None): inputs (numpy.ndarray|Tensor|list): Batch of input data. It could be a numpy array or paddle.Tensor, or a list of arrays or tensors (in case the model has multiple inputs). - labels (numpy.ndarray|Tensor|list): Batch of labels. It could be + labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be a numpy array or paddle.Tensor, or a list of arrays or tensors (in case the model has multiple labels). If has no labels, - set None. Default is None. + set None. Default: None. Returns: A list of scalar testing loss if the model has no metrics, @@ -1125,30 +1128,31 @@ def eval_batch(self, inputs, labels=None): Examples: .. 
code-block:: python - - import numpy as np - import paddle - import paddle.nn as nn - from paddle.static import InputSpec - - device = paddle.set_device('cpu') # or 'gpu' - - net = nn.Sequential( - nn.Linear(784, 200), - nn.Tanh(), - nn.Linear(200, 10)) - - input = InputSpec([None, 784], 'float32', 'x') - label = InputSpec([None, 1], 'int64', 'label') - model = paddle.Model(net, input, label) - optim = paddle.optimizer.SGD(learning_rate=1e-3, - parameters=model.parameters()) - model.prepare(optim, - paddle.nn.CrossEntropyLoss()) - data = np.random.random(size=(4,784)).astype(np.float32) - label = np.random.randint(0, 10, size=(4, 1)).astype(np.int64) - loss = model.eval_batch([data], [label]) - print(loss) + :name: code-example-eval-batch + + import paddle + import paddle.nn as nn + from paddle.static import InputSpec + + device = paddle.set_device('cpu') # or 'gpu' + + net = nn.Sequential( + nn.Linear(784, 200), + nn.Tanh(), + nn.Linear(200, 10)) + + input = InputSpec([None, 784], 'float32', 'x') + label = InputSpec([None, 1], 'int64', 'label') + model = paddle.Model(net, input, label) + optim = paddle.optimizer.SGD(learning_rate=1e-3, + parameters=model.parameters()) + model.prepare(optim, + paddle.nn.CrossEntropyLoss(), metrics=paddle.metric.Accuracy()) + data = paddle.rand((4, 784), dtype="float32") + label = paddle.randint(0, 10, (4, 1), dtype="int64") + loss, acc = model.eval_batch([data], [label]) + print(loss, acc) + # [array([2.8825705], dtype=float32)] [0.0] """ loss = self._adapter.eval_batch(inputs, labels) if fluid._non_static_mode() and self._input_info is None: @@ -1172,28 +1176,31 @@ def predict_batch(self, inputs): Examples: .. code-block:: python - - import numpy as np - import paddle - import paddle.nn as nn - from paddle.static import InputSpec - - device = paddle.set_device('cpu') # or 'gpu' - - input = InputSpec([None, 784], 'float32', 'x') - label = InputSpec([None, 1], 'int64', 'label') - - net = nn.Sequential( - nn.Linear(784, 200), - nn.Tanh(), - nn.Linear(200, 10), - nn.Softmax()) - - model = paddle.Model(net, input, label) - model.prepare() - data = np.random.random(size=(4,784)).astype(np.float32) - out = model.predict_batch([data]) - print(out) + :name: code-example-predict-batch + + import paddle + import paddle.nn as nn + from paddle.static import InputSpec + + device = paddle.set_device('cpu') # or 'gpu' + + input = InputSpec([None, 784], 'float32', 'x') + label = InputSpec([None, 1], 'int64', 'label') + + net = nn.Sequential( + nn.Linear(784, 200), + nn.Tanh(), + nn.Linear(200, 10), + nn.Softmax()) + + model = paddle.Model(net, input, label) + model.prepare() + data = paddle.rand((1, 784), dtype="float32") + out = model.predict_batch([data]) + print(out) + # [array([[0.08189095, 0.16740078, 0.06889386, 0.05085445, 0.10729759, + # 0.02217775, 0.14518553, 0.1591538 , 0.01808308, 0.17906217]], + # dtype=float32)] """ loss = self._adapter.predict_batch(inputs) if fluid._non_static_mode() and self._input_info is None: @@ -1229,6 +1236,7 @@ def save(self, path, training=True): Examples: .. code-block:: python + :name: code-example-save import paddle import paddle.nn as nn @@ -1259,7 +1267,7 @@ def forward(self, x): optim = paddle.optimizer.SGD(learning_rate=1e-3, parameters=model.parameters()) model.prepare(optim, paddle.nn.CrossEntropyLoss()) - + transform = T.Compose([ T.Transpose(), T.Normalize([127.5], [127.5]) @@ -1294,14 +1302,14 @@ def load(self, path, skip_mismatch=False, reset_optimizer=False): optimizer states. 
The files would be `path.pdparams` and `path.pdopt` separately, and the latter is not necessary when no need to restore. - skip_mismatch (bool): Whether to skip the loading of mismatch + skip_mismatch (bool, optional): Whether to skip the loading of mismatch parameter or raise an error when mismatch happens (not found the parameter in file storing model states of or receives a - mismatch shape). - reset_optimizer (bool): If True, ignore the providing file storing + mismatch shape). Default: False. + reset_optimizer (bool, optional): If True, ignore the providing file storing optimizer states and initialize optimizer states from scratch. Otherwise, restore optimizer states from `path.pdopt` if - a optimizer has been set to the model. Default False. + a optimizer has been set to the model. Default: False. Returns: None @@ -1309,23 +1317,24 @@ def load(self, path, skip_mismatch=False, reset_optimizer=False): Examples: .. code-block:: python - - import paddle - import paddle.nn as nn - from paddle.static import InputSpec + :name: code-example-load + + import paddle + import paddle.nn as nn + from paddle.static import InputSpec - device = paddle.set_device('cpu') + device = paddle.set_device('cpu') - input = InputSpec([None, 784], 'float32', 'x') + input = InputSpec([None, 784], 'float32', 'x') - model = paddle.Model(nn.Sequential( - nn.Linear(784, 200), - nn.Tanh(), - nn.Linear(200, 10), - nn.Softmax()), input) + model = paddle.Model(nn.Sequential( + nn.Linear(784, 200), + nn.Tanh(), + nn.Linear(200, 10), + nn.Softmax()), input) - model.save('checkpoint/test') - model.load('checkpoint/test') + model.save('checkpoint/test') + model.load('checkpoint/test') """ def _load_state_from_path(path): @@ -1395,19 +1404,20 @@ def parameters(self, *args, **kwargs): Examples: .. code-block:: python + :name: code-example-parameters + + import paddle + import paddle.nn as nn + from paddle.static import InputSpec - import paddle - import paddle.nn as nn - from paddle.static import InputSpec - - input = InputSpec([None, 784], 'float32', 'x') - - model = paddle.Model(nn.Sequential( - nn.Linear(784, 200), - nn.Tanh(), - nn.Linear(200, 10)), input) + input = InputSpec([None, 784], 'float32', 'x') + + model = paddle.Model(nn.Sequential( + nn.Linear(784, 200), + nn.Tanh(), + nn.Linear(200, 10)), input) - params = model.parameters() + params = model.parameters() """ return self._adapter.parameters() @@ -1501,16 +1511,16 @@ def prepare(self, Configures the model before runing. Args: - optimizer (Optimizer|None): Optimizer must be set in training + optimizer (Optimizer|None, optional): Optimizer must be set in training and should be a Optimizer instance. It can be None in eval - and test mode. - loss (Loss|callable function|None): Loss function can + and test mode. Default: None. + loss (Loss|Callable|None, optional): Loss function can be a `paddle.nn.Layer` instance or any callable function taken the predicted values and ground truth values as input. - It can be None when there is no loss. - metrics (Metric|list of Metric|None): If metrics is set, all - metrics will be calculated and output in train/eval mode. - amp_configs (str|dict|None): AMP configurations. If AMP or pure + It can be None when there is no loss. Default: None. + metrics (Metric|list[Metric]|None, optional): If metrics is set, all + metrics will be calculated and output in train/eval mode. Default: None. + amp_configs (str|dict|None, optional): AMP configurations. 
If AMP or pure float16 training is used, the key 'level' of 'amp_configs' should be set to 'O1' or 'O2' respectively. Otherwise, the value of 'level' defaults to 'O0', which means float32 @@ -1526,6 +1536,7 @@ def prepare, for details. For convenience, 'amp_configs' could be set to 'O1' or 'O2' if no more parameters are needed. 'amp_configs' could be None in float32 training. Default: None. + Returns: None """ @@ -1587,133 +1598,133 @@ def fit(self, evaluation will be done at the end of each epoch. Args: - train_data (Dataset|DataLoader): An iterable data loader is used for + train_data (Dataset|DataLoader, optional): An iterable data loader is used for train. An instance of paddle paddle.io.Dataset or paddle.io.Dataloader is recomended. Default: None. - eval_data (Dataset|DataLoader): An iterable data loader is used for + eval_data (Dataset|DataLoader, optional): An iterable data loader is used for evaluation at the end of epoch. If None, will not do evaluation. An instance of paddle.io.Dataset or paddle.io.Dataloader is recomended. Default: None. - batch_size (int): Integer number. The batch size of train_data - and eval_data. When train_data and eval_data are both the - instance of Dataloader, this parameter will be ignored. - Default: 1. - epochs (int): Integer number. The number of epochs to train - the model. Default: 1. - eval_freq (int): The frequency, in number of epochs, an evalutation + batch_size (int, optional): The batch size of train_data and eval_data. When + train_data and eval_data are both the instance of Dataloader, this + parameter will be ignored. Default: 1. + epochs (int, optional): The number of epochs to train the model. Default: 1. + eval_freq (int, optional): The frequency, in number of epochs, an evaluation is performed. Default: 1. - log_freq (int): The frequency, in number of steps, the training logs + log_freq (int, optional): The frequency, in number of steps, the training logs are printed. Default: 10. - save_dir(str|None): The directory to save checkpoint during training. + save_dir(str|None, optional): The directory to save checkpoint during training. If None, will not save checkpoint. Default: None. - save_freq (int): The frequency, in number of epochs, to save + save_freq (int, optional): The frequency, in number of epochs, to save checkpoint. Default: 1. - verbose (int): The verbosity mode, should be 0, 1, or 2. 0 = silent, + verbose (int, optional): The verbosity mode, should be 0, 1, or 2. 0 = silent, 1 = progress bar, 2 = one line per epoch. Default: 2. - drop_last (bool): Whether drop the last incomplete batch of + drop_last (bool, optional): Whether to drop the last incomplete batch of train_data when dataset size is not divisible by the batch size. When train_data is an instance of Dataloader, this parameter will be ignored. Default: False. - shuffle (bool): Whther to shuffle train_data. When train_data is + shuffle (bool, optional): Whether to shuffle train_data. When train_data is an instance of Dataloader, this parameter will be ignored. Default: True. - num_workers (int): The number of subprocess to load data, 0 for no + num_workers (int, optional): The number of subprocess to load data, 0 for no subprocess used and loading data in main process. When train_data and eval_data are both the instance of Dataloader, this parameter will be ignored. Default: 0. - callbacks (Callback|None): A list of `Callback` instances to apply - during training. If None, `ProgBarLogger` and `ModelCheckpoint` - are automatically inserted. Default: None.
- accumulate_grad_batches (int): The number of batches to accumulate gradident + callbacks (Callback|None, optional): A list of `Callback` instances to apply + during training. If None, :ref:`api_paddle_callbacks_ProgBarLogger` and + :ref:`api_paddle_callbacks_ModelCheckpoint` are automatically inserted. Default: None. + accumulate_grad_batches (int, optional): The number of batches to accumulate gradient during training process before optimizer updates. It can mimic large batch size. Default: 1. - num_iters (int|None): Integer number. The number of iterations to train - the model. If None, follow `epochs` to train the model, otherwise, train - the model `num_iters` times. Default: None. - + num_iters (int|None, optional): The number of iterations to train the model. + If None, follow `epochs` to train the model; otherwise, train the model + `num_iters` times. Default: None. + Returns: None Examples: - 1. An example use Dataset and set btch size, shuffle in fit. + 1. An example using Dataset, with batch size and shuffle set in fit. How to make a batch is done internally. .. code-block:: python :name: code-example-fit-1 - import paddle - import paddle.vision.transforms as T - from paddle.vision.datasets import MNIST - from paddle.static import InputSpec - - dynamic = True - if not dynamic: - paddle.enable_static() - - transform = T.Compose([ - T.Transpose(), - T.Normalize([127.5], [127.5]) - ]) - train_dataset = MNIST(mode='train', transform=transform) - val_dataset = MNIST(mode='test', transform=transform) - - input = InputSpec([None, 1, 28, 28], 'float32', 'image') - label = InputSpec([None, 1], 'int64', 'label') - - model = paddle.Model( - paddle.vision.models.LeNet(), - input, label) - optim = paddle.optimizer.Adam( - learning_rate=0.001, parameters=model.parameters()) - model.prepare( - optim, - paddle.nn.CrossEntropyLoss(), - paddle.metric.Accuracy(topk=(1, 2))) - model.fit(train_dataset, - val_dataset, - epochs=2, - batch_size=64, - save_dir='mnist_checkpoint') + import paddle + import paddle.vision.transforms as T + from paddle.vision.datasets import MNIST + from paddle.static import InputSpec + + dynamic = True + if not dynamic: + paddle.enable_static() + + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + train_dataset = MNIST(mode='train', transform=transform) + val_dataset = MNIST(mode='test', transform=transform) + + input = InputSpec([None, 1, 28, 28], 'float32', 'image') + label = InputSpec([None, 1], 'int64', 'label') + + model = paddle.Model( + paddle.vision.models.LeNet(), + input, label) + optim = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters()) + model.prepare( + optim, + paddle.nn.CrossEntropyLoss(), + paddle.metric.Accuracy(topk=(1, 2))) + model.fit(train_dataset, + val_dataset, + epochs=2, + batch_size=64, + save_dir='mnist_checkpoint') 2. An example use DataLoader, batch size and shuffle is set in DataLoader. ..
code-block:: python + :name: code-example-fit-2 + + import paddle + import paddle.vision.transforms as T + from paddle.vision.datasets import MNIST + from paddle.static import InputSpec - import paddle - import paddle.vision.transforms as T - from paddle.vision.datasets import MNIST - from paddle.static import InputSpec + dynamic = True + if not dynamic: + paddle.enable_static() + + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + train_dataset = MNIST(mode='train', transform=transform) + train_loader = paddle.io.DataLoader(train_dataset, + batch_size=64) + val_dataset = MNIST(mode='test', transform=transform) + val_loader = paddle.io.DataLoader(val_dataset, + batch_size=64) + + input = InputSpec([None, 1, 28, 28], 'float32', 'image') + label = InputSpec([None, 1], 'int64', 'label') - dynamic = True - if not dynamic: - paddle.enable_static() - - transform = T.Compose([ - T.Transpose(), - T.Normalize([127.5], [127.5]) - ]) - train_dataset = MNIST(mode='train', transform=transform) - train_loader = paddle.io.DataLoader(train_dataset, - batch_size=64) - val_dataset = MNIST(mode='test', transform=transform) - val_loader = paddle.io.DataLoader(val_dataset, - batch_size=64) - - input = InputSpec([None, 1, 28, 28], 'float32', 'image') - label = InputSpec([None, 1], 'int64', 'label') - - model = paddle.Model( - paddle.vision.models.LeNet(), input, label) - optim = paddle.optimizer.Adam( - learning_rate=0.001, parameters=model.parameters()) - model.prepare( - optim, - paddle.nn.CrossEntropyLoss(), - paddle.metric.Accuracy(topk=(1, 2))) - model.fit(train_loader, - val_loader, - epochs=2, - save_dir='mnist_checkpoint') + model = paddle.Model( + paddle.vision.models.LeNet(), input, label) + optim = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters()) + model.prepare( + optim, + paddle.nn.CrossEntropyLoss(), + paddle.metric.Accuracy(topk=(1, 2))) + model.fit(train_loader, + val_loader, + epochs=2, + save_dir='mnist_checkpoint') """ assert train_data is not None, \ "train_data must be given!" @@ -1809,23 +1820,23 @@ def evaluate(self, eval_data (Dataset|DataLoader): An iterable data loader is used for evaluation. An instance of paddle.io.Dataset or paddle.io.Dataloader is recomended. - batch_size (int): Integer number. The batch size of train_data - and eval_data. When eval_data is the instance of Dataloader, - this argument will be ignored. Default: 1. - log_freq (int): The frequency, in number of steps, the eval logs + batch_size (int, optional): The batch size of train_data and eval_data. + When eval_data is the instance of Dataloader, this argument will be + ignored. Default: 1. + log_freq (int, optional): The frequency, in number of steps, the eval logs are printed. Default: 10. - verbose (int): The verbosity mode, should be 0, 1, or 2. 0 = silent, + verbose (int, optional): The verbosity mode, should be 0, 1, or 2. 0 = silent, 1 = progress bar, 2 = one line per epoch. Default: 2. - num_workers (int): The number of subprocess to load data, + num_workers (int, optional): The number of subprocess to load data, 0 for no subprocess used and loading data in main process. When train_data and eval_data are both the instance of Dataloader, this parameter will be ignored. Default: 0. - callbacks (Callback|None): A list of `Callback` instances to apply + callbacks (Callback|None, optional): A list of `Callback` instances to apply during training. If None, `ProgBarLogger` and `ModelCheckpoint` are automatically inserted. Default: None. 
- num_iters (int|None): Integer number. The number of iterations to - evaluate the model. If None, evaluate on whole input dataset, - otherwise, evaluate `num_iters` times. Default: None. + num_iters (int|None, optional): The number of iterations to evaluate the model. + If None, evaluate on whole input dataset, otherwise, evaluate `num_iters` times. + Default: None. Returns: dict: Result of metric. The key is the names of Metric, value is a scalar or numpy.array. @@ -1833,24 +1844,26 @@ def evaluate(self, Examples: .. code-block:: python + :name: code-example-evaluate - import paddle - import paddle.vision.transforms as T - from paddle.static import InputSpec + import paddle + import paddle.vision.transforms as T + from paddle.static import InputSpec - # declarative mode - transform = T.Compose([ - T.Transpose(), - T.Normalize([127.5], [127.5]) - ]) - val_dataset = paddle.vision.datasets.MNIST(mode='test', transform=transform) + # declarative mode + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + val_dataset = paddle.vision.datasets.MNIST(mode='test', transform=transform) - input = InputSpec([-1, 1, 28, 28], 'float32', 'image') - label = InputSpec([None, 1], 'int64', 'label') - model = paddle.Model(paddle.vision.models.LeNet(), input, label) - model.prepare(metrics=paddle.metric.Accuracy()) - result = model.evaluate(val_dataset, batch_size=64) - print(result) + input = InputSpec([-1, 1, 28, 28], 'float32', 'image') + label = InputSpec([None, 1], 'int64', 'label') + model = paddle.Model(paddle.vision.models.LeNet(), input, label) + model.prepare(metrics=paddle.metric.Accuracy()) + result = model.evaluate(val_dataset, batch_size=64) + print(result) + # {'acc': 0.0699} """ if eval_data is not None and isinstance(eval_data, Dataset): @@ -1912,21 +1925,20 @@ def predict(self, test_data (Dataset|DataLoader): An iterable data loader is used for predict. An instance of paddle.io.Dataset or paddle.io.Dataloader is recomended. - batch_size (int): Integer number. The batch size of train_data and eval_data. - When train_data and eval_data are both the instance of Dataloader, this - argument will be ignored. Default: 1. - num_workers (int): The number of subprocess to load data, 0 for no subprocess - used and loading data in main process. When train_data and eval_data are - both the instance of Dataloader, this argument will be ignored. Default: 0. - stack_outputs (bool): Whether stack output field like a batch, as for an output - filed of a sample is in shape [X, Y], test_data contains N samples, predict + batch_size (int, optional): The batch size of test_data. When test_data is the + instance of Dataloader, this argument will be ignored. Default: 1. + num_workers (int, optional): The number of subprocess to load data, 0 for no subprocess + used and loading data in main process. When test_data is the instance of Dataloader, + this argument will be ignored. Default: 0. + stack_outputs (bool, optional): Whether stack output field like a batch, as for an output + field of a sample is in shape [X, Y], test_data contains N samples, predict output field will be in shape [N, X, Y] if stack_output is True, and will - be a length N list in shape [[X, Y], [X, Y], ....[X, Y]] if stack_outputs + be a length N list in shape [[X, Y], [X, Y], ..., [X, Y]] if stack_outputs is False. stack_outputs as False is used for LoDTensor output situation, it is recommended set as True if outputs contains no LoDTensor. Default: False. - verbose (int): The verbosity mode, should be 0, 1, or 2. 
0 = silent, + verbose (int, optional): The verbosity mode, should be 0, 1, or 2. 0 = silent, 1 = progress bar, 2 = one line per batch. Default: 1. - callbacks(Callback): A Callback instance, default None. + callbacks(Callback, optional): A Callback instance, Default: None. Returns: list: output of models. @@ -1934,43 +1946,46 @@ def predict(self, Examples: .. code-block:: python + :name: code-example-predict - import numpy as np - import paddle - from paddle.static import InputSpec + import numpy as np + import paddle + from paddle.static import InputSpec - class MnistDataset(paddle.vision.datasets.MNIST): - def __init__(self, mode, return_label=True): - super(MnistDataset, self).__init__(mode=mode) - self.return_label = return_label - - def __getitem__(self, idx): - img = np.reshape(self.images[idx], [1, 28, 28]) - if self.return_label: - return img, np.array(self.labels[idx]).astype('int64') - return img, - - def __len__(self): - return len(self.images) - - test_dataset = MnistDataset(mode='test', return_label=False) - - # imperative mode - input = InputSpec([-1, 1, 28, 28], 'float32', 'image') - model = paddle.Model(paddle.vision.models.LeNet(), input) - model.prepare() - result = model.predict(test_dataset, batch_size=64) - print(len(result[0]), result[0][0].shape) - - # declarative mode - device = paddle.set_device('cpu') - paddle.enable_static() - input = InputSpec([-1, 1, 28, 28], 'float32', 'image') - model = paddle.Model(paddle.vision.models.LeNet(), input) - model.prepare() - - result = model.predict(test_dataset, batch_size=64) - print(len(result[0]), result[0][0].shape) + class MnistDataset(paddle.vision.datasets.MNIST): + def __init__(self, mode, return_label=True): + super(MnistDataset, self).__init__(mode=mode) + self.return_label = return_label + + def __getitem__(self, idx): + img = np.reshape(self.images[idx], [1, 28, 28]) + if self.return_label: + return img, np.array(self.labels[idx]).astype('int64') + return img, + + def __len__(self): + return len(self.images) + + test_dataset = MnistDataset(mode='test', return_label=False) + + # imperative mode + input = InputSpec([-1, 1, 28, 28], 'float32', 'image') + model = paddle.Model(paddle.vision.models.LeNet(), input) + model.prepare() + result = model.predict(test_dataset, batch_size=64) + print(len(result[0]), result[0][0].shape) + # 157 (64, 10) + + # declarative mode + device = paddle.set_device('cpu') + paddle.enable_static() + input = InputSpec([-1, 1, 28, 28], 'float32', 'image') + model = paddle.Model(paddle.vision.models.LeNet(), input) + model.prepare() + + result = model.predict(test_dataset, batch_size=64) + print(len(result[0]), result[0][0].shape) + # 157 (64, 10) """ if test_data is not None and isinstance(test_data, Dataset): @@ -2164,23 +2179,25 @@ def summary(self, input_size=None, dtype=None): Examples: .. 
            .. code-block:: python
+              :name: code-example-summary
+
+              import paddle
+              from paddle.static import InputSpec
+
+              input = InputSpec([None, 1, 28, 28], 'float32', 'image')
+              label = InputSpec([None, 1], 'int64', 'label')
-              import paddle
-              from paddle.static import InputSpec
-
-              input = InputSpec([None, 1, 28, 28], 'float32', 'image')
-              label = InputSpec([None, 1], 'int64', 'label')
-
-              model = paddle.Model(paddle.vision.models.LeNet(),
-                  input, label)
-              optim = paddle.optimizer.Adam(
-                  learning_rate=0.001, parameters=model.parameters())
-              model.prepare(
-                  optim,
-                  paddle.nn.CrossEntropyLoss())
-
-              params_info = model.summary()
-              print(params_info)
+              model = paddle.Model(paddle.vision.models.LeNet(),
+                  input, label)
+              optim = paddle.optimizer.Adam(
+                  learning_rate=0.001, parameters=model.parameters())
+              model.prepare(
+                  optim,
+                  paddle.nn.CrossEntropyLoss())
+
+              params_info = model.summary()
+              print(params_info)
+              # {'total_params': 61610, 'trainable_params': 61610}
        """
        assert (input_size is not None or self._inputs
diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py
index a1e02dab4707d..8b29659a1f400 100644
--- a/python/paddle/nn/__init__.py
+++ b/python/paddle/nn/__init__.py
@@ -109,6 +109,7 @@
 from .layer.loss import HingeEmbeddingLoss  # noqa: F401
 from .layer.loss import CosineEmbeddingLoss  # noqa: F401
 from .layer.loss import TripletMarginWithDistanceLoss
+from .layer.loss import TripletMarginLoss
 from .layer.norm import BatchNorm  # noqa: F401
 from .layer.norm import SyncBatchNorm  # noqa: F401
 from .layer.norm import GroupNorm  # noqa: F401
@@ -316,4 +317,5 @@ def weight_norm(*args):
     'CosineEmbeddingLoss',
     'RReLU',
     'TripletMarginWithDistanceLoss',
+    'TripletMarginLoss',
 ]
diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py
index 43ce403ab0b23..cdb1135eba800 100644
--- a/python/paddle/nn/functional/__init__.py
+++ b/python/paddle/nn/functional/__init__.py
@@ -92,6 +92,7 @@
 from .loss import hinge_embedding_loss  # noqa: F401
 from .loss import cosine_embedding_loss  # noqa: F401
 from .loss import triplet_margin_with_distance_loss
+from .loss import triplet_margin_loss
 from .norm import batch_norm  # noqa: F401
 from .norm import instance_norm  # noqa: F401
 from .norm import layer_norm  # noqa: F401
@@ -234,4 +235,5 @@
     'cosine_embedding_loss',
     'rrelu',
     'triplet_margin_with_distance_loss',
+    'triplet_margin_loss',
 ]
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index c882ab08296ae..2f37f8a50f4d1 100755
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -28,7 +28,7 @@
 from paddle.utils import deprecated
 from paddle import _C_ops
 from paddle import in_dynamic_mode
-from paddle.framework import core
+from paddle.framework import core, _non_static_mode
 from ...fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _non_static_mode, _current_expected_place

 __all__ = []
@@ -2999,3 +2999,124 @@ def triplet_margin_with_distance_loss(input,
         return paddle.sum(loss, name=name)
     elif reduction == 'none':
         return loss
+
+
+def triplet_margin_loss(input,
+                        positive,
+                        negative,
+                        margin=1.0,
+                        p=2,
+                        epsilon=1e-6,
+                        swap=False,
+                        reduction='mean',
+                        name=None):
+    r"""
+    Measures the triplet loss given input
+    tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater than :math:`0`.
+    This is used for measuring a relative similarity between samples. A triplet
+    is composed of `input`, `positive` and `negative` (i.e., `input`, `positive examples` and `negative
+    examples` respectively). The shapes of all input tensors should be
+    :math:`(N, *)`.
+
+    The loss function for each sample in the mini-batch is:
+
+    .. math::
+        L(input, pos, neg) = \max \{d(input_i, pos_i) - d(input_i, neg_i) + {\rm margin}, 0\}
+
+    where
+
+    .. math::
+        d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_p
+
+    Parameters:
+        input (Tensor): Input tensor, with data type float32 or float64.
+            Its shape is [N, \*], where N is the batch size and `\*` means any number of additional dimensions.
+
+        positive (Tensor): Positive tensor, with data type float32 or float64.
+            Its shape is the same as the shape of input.
+
+        negative (Tensor): Negative tensor, with data type float32 or float64.
+            Its shape is the same as the shape of input.
+
+        margin (float, optional): The margin value. Default: :math:`1`.
+
+        p (int, optional): The norm degree for pairwise distance. Default: :math:`2`.
+
+        epsilon (float, optional): A small value added to avoid division by zero.
+            Default: 1e-6.
+
+        swap (bool, optional): Whether to use distance swap, which changes the negative distance
+            to the distance between the positive sample and the negative sample. For more details,
+            see `Learning shallow convolutional feature descriptors with triplet losses`.
+            Default: ``False``.
+
+        reduction (str, optional): Indicates how to average the loss by batch_size.
+            The candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
+            If :attr:`reduction` is ``'sum'``, the summed loss is returned.
+            Default: ``'mean'``
+
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor. The tensor storing the triplet_margin_loss of input, positive and negative.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+
+            input = paddle.to_tensor([[1, 5, 3], [0, 3, 2], [1, 4, 1]], dtype=paddle.float32)
+            positive = paddle.to_tensor([[5, 1, 2], [3, 2, 1], [3, -1, 1]], dtype=paddle.float32)
+            negative = paddle.to_tensor([[2, 1, -3], [1, 1, -1], [4, -2, 1]], dtype=paddle.float32)
+            loss = F.triplet_margin_loss(input, positive, negative, margin=1.0, reduction='none')
+            print(loss)
+            # Tensor([0. , 0.57496738, 0. ])
+
+            loss = F.triplet_margin_loss(input, positive, negative, margin=1.0, reduction='mean')
+            print(loss)
+            # Tensor([0.19165580])
+
+    """
+    if reduction not in ['sum', 'mean', 'none']:
+        raise ValueError(
+            "'reduction' in 'triplet_margin_loss' should be 'sum', 'mean' or 'none', "
+            "but received {}.".format(reduction))
+    if margin < 0:
+        raise ValueError(
+            "The margin between positive samples and negative samples should not be negative."
+        )
+    if not _non_static_mode():
+        check_variable_and_dtype(input, 'input', ['float32', 'float64'],
+                                 'triplet_margin_loss')
+        check_variable_and_dtype(positive, 'positive', ['float32', 'float64'],
+                                 'triplet_margin_loss')
+        check_variable_and_dtype(negative, 'negative', ['float32', 'float64'],
+                                 'triplet_margin_loss')
+
+    if not (input.shape == positive.shape == negative.shape):
+        raise ValueError("The shapes of input, positive and negative "
+                         "must be the same.")
+
+    distance_function = paddle.nn.PairwiseDistance(p, epsilon=epsilon)
+    positive_dist = distance_function(input, positive)
+    negative_dist = distance_function(input, negative)
+
+    if swap:
+        swap_dist = distance_function(positive, negative)
+        negative_dist = paddle.minimum(negative_dist, swap_dist)
+
+    loss = paddle.clip(positive_dist - negative_dist + margin, min=0.0)
+
+    if reduction == 'mean':
+        return paddle.mean(loss, name=name)
+    elif reduction == 'sum':
+        return paddle.sum(loss, name=name)
+    elif reduction == 'none':
+        return loss
diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py
index a8e3d8ec1d464..e9ccee1bd3829 100644
--- a/python/paddle/nn/layer/__init__.py
+++ b/python/paddle/nn/layer/__init__.py
@@ -80,6 +80,7 @@
 from .loss import SmoothL1Loss  # noqa: F401
 from .loss import HingeEmbeddingLoss  # noqa: F401
 from .loss import TripletMarginWithDistanceLoss
+from .loss import TripletMarginLoss
 from .norm import BatchNorm1D  # noqa: F401
 from .norm import BatchNorm2D  # noqa: F401
 from .norm import BatchNorm3D  # noqa: F401
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py
index 9b796d6965c33..1e72548ecc138 100644
--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -1507,3 +1507,109 @@ def forward(self, input, positive, negative):
             swap=self.swap,
             reduction=self.reduction,
             name=self.name)
+
+
+class TripletMarginLoss(Layer):
+    r"""
+    Creates a criterion that measures the triplet loss given input
+    tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater than :math:`0`.
+    This is used for measuring a relative similarity between samples. A triplet
+    is composed of `input`, `positive` and `negative` (i.e., `input`, `positive examples` and `negative
+    examples` respectively). The shapes of all input tensors should be
+    :math:`(N, *)`.
+
+    The loss function for each sample in the mini-batch is:
+
+    .. math::
+        L(input, pos, neg) = \max \{d(input_i, pos_i) - d(input_i, neg_i) + {\rm margin}, 0\}
+
+    where
+
+    .. math::
+        d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_p
+
+    Parameters:
+        margin (float, optional): The margin value. Default: :math:`1`.
+
+        p (int, optional): The norm degree for pairwise distance. Default: :math:`2`.
+
+        epsilon (float, optional): A small value added to avoid division by zero.
+            Default: 1e-6.
+
+        swap (bool, optional): Whether to use distance swap, which changes the negative distance
+            to the distance between the positive sample and the negative sample. For more details,
+            see `Learning shallow convolutional feature descriptors with triplet losses`.
+            Default: ``False``.
+
+        reduction (str, optional): Indicates how to average the loss by batch_size.
+            The candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
+            If :attr:`reduction` is ``'none'``, the unreduced loss is returned;
+            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
+            If :attr:`reduction` is ``'sum'``, the summed loss is returned.
+            Default: ``'mean'``
+
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Call Parameters:
+        input (Tensor): Input tensor, with data type float32 or float64.
+            Its shape is [N, \*], where N is the batch size and `\*` means any number of additional dimensions.
+
+        positive (Tensor): Positive tensor, with data type float32 or float64.
+            Its shape is the same as the shape of input.
+
+        negative (Tensor): Negative tensor, with data type float32 or float64.
+            Its shape is the same as the shape of input.
+
+    Returns:
+        Tensor. The tensor storing the triplet_margin_loss of input, positive and negative.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            input = paddle.to_tensor([[1, 5, 3], [0, 3, 2], [1, 4, 1]], dtype=paddle.float32)
+            positive = paddle.to_tensor([[5, 1, 2], [3, 2, 1], [3, -1, 1]], dtype=paddle.float32)
+            negative = paddle.to_tensor([[2, 1, -3], [1, 1, -1], [4, -2, 1]], dtype=paddle.float32)
+            triplet_margin_loss = paddle.nn.TripletMarginLoss(reduction='none')
+            loss = triplet_margin_loss(input, positive, negative)
+            print(loss)
+            # Tensor([0. , 0.57496738, 0. ])
+
+            triplet_margin_loss = paddle.nn.TripletMarginLoss(margin=1.0, swap=True, reduction='mean')
+            loss = triplet_margin_loss(input, positive, negative)
+            print(loss)
+            # Tensor([0.19165580])
+
+    """
+
+    def __init__(self,
+                 margin=1.0,
+                 p=2.,
+                 epsilon=1e-6,
+                 swap=False,
+                 reduction='mean',
+                 name=None):
+        super(TripletMarginLoss, self).__init__()
+        if reduction not in ['sum', 'mean', 'none']:
+            raise ValueError(
+                "The value of 'reduction' in TripletMarginLoss should be 'sum', 'mean' or 'none', but "
+                "received %s, which is not allowed." % reduction)
+        self.margin = margin
+        self.p = p
+        self.epsilon = epsilon
+        self.swap = swap
+        self.reduction = reduction
+        self.name = name
+
+    def forward(self, input, positive, negative):
+        return F.triplet_margin_loss(input,
+                                     positive,
+                                     negative,
+                                     margin=self.margin,
+                                     p=self.p,
+                                     epsilon=self.epsilon,
+                                     swap=self.swap,
+                                     reduction=self.reduction,
+                                     name=self.name)
diff --git a/python/paddle/tests/test_hapi_amp.py b/python/paddle/tests/test_hapi_amp.py
index eaf10dbfc4c75..24df22ab5ea54 100644
--- a/python/paddle/tests/test_hapi_amp.py
+++ b/python/paddle/tests/test_hapi_amp.py
@@ -20,7 +20,7 @@
 os.environ['FLAGS_cudnn_deterministic'] = '1'

 import unittest
-
+import tempfile
 import numpy as np

 import paddle
@@ -101,7 +101,9 @@ def test_save_load(self):
                   batch_size=64,
                   num_iters=2,
                   log_freq=1)
-        model.save('./lenet_amp')
+        temp_dir = tempfile.TemporaryDirectory()
+        lenet_amp_path = os.path.join(temp_dir.name, 'lenet_amp')
+        model.save(lenet_amp_path)

         with paddle.fluid.unique_name.guard():
             paddle.seed(2021)
@@ -119,7 +121,8 @@ def test_save_load(self):
                          model._scaler.state_dict()['incr_count']))

         # equal after load
-        new_model.load('./lenet_amp')
+        new_model.load(lenet_amp_path)
+        temp_dir.cleanup()
         self.assertEqual(new_model._scaler.state_dict()['incr_count'],
                          model._scaler.state_dict()['incr_count'])
         self.assertEqual(new_model._scaler.state_dict()['decr_count'],
diff --git a/python/paddle/tests/test_read_file.py b/python/paddle/tests/test_read_file.py
index 0dad971a7308e..dc5c8fafcd8f6 100644
--- a/python/paddle/tests/test_read_file.py
+++ b/python/paddle/tests/test_read_file.py
@@ -16,6 +16,7 @@
 import cv2
 import shutil
 import unittest
+import tempfile
 import numpy as np

 import paddle
@@ -26,23 +27,25 @@ class TestReadFile(unittest.TestCase):
     def setUp(self):
         fake_img = (np.random.random((400, 300, 3)) * 255).astype('uint8')
-        cv2.imwrite('fake.jpg', fake_img)
+        self.temp_dir = tempfile.TemporaryDirectory()
+        self.img_path = os.path.join(self.temp_dir.name, 'fake.jpg')
+        cv2.imwrite(self.img_path, fake_img)

     def tearDown(self):
-        os.remove('fake.jpg')
+        self.temp_dir.cleanup()

     def read_file_decode_jpeg(self):
         if not paddle.is_compiled_with_cuda():
             return
-        img_bytes = read_file('fake.jpg')
+        img_bytes = read_file(self.img_path)

         img = decode_jpeg(img_bytes, mode='gray')
         img = decode_jpeg(img_bytes, mode='rgb')

         img = decode_jpeg(img_bytes)

-        img_cv2 = cv2.imread('fake.jpg')
+        img_cv2 = cv2.imread(self.img_path)

         if paddle.in_dynamic_mode():
             np.testing.assert_equal(img.shape,
                                     img_cv2.transpose(2, 0, 1).shape)
         else:
diff --git a/python/paddle/tests/test_transforms.py b/python/paddle/tests/test_transforms.py
index 35a0f8edc4843..c46ab2eaf5f57 100644
--- a/python/paddle/tests/test_transforms.py
+++ b/python/paddle/tests/test_transforms.py
@@ -926,7 +926,8 @@ def test_image_load(self):
         fake_img = Image.fromarray((np.random.random(
             (32, 32, 3)) * 255).astype('uint8'))

-        path = 'temp.jpg'
+        temp_dir = tempfile.TemporaryDirectory()
+        path = os.path.join(temp_dir.name, 'temp.jpg')
         fake_img.save(path)

         set_image_backend('pil')
@@ -939,7 +940,7 @@ def test_image_load(self):

         np_img = image_load(path)

-        os.remove(path)
+        temp_dir.cleanup()

     def test_affine(self):
         np_img = (np.random.rand(32, 26, 3) * 255).astype('uint8')
diff --git a/python/paddle/vision/models/resnet.py b/python/paddle/vision/models/resnet.py
index ba58fe7f57d50..b1263f62dca73 100644
--- a/python/paddle/vision/models/resnet.py
+++ b/python/paddle/vision/models/resnet.py
@@ -181,13 +181,16 @@ class ResNet(nn.Layer):
     Args:
         Block (BasicBlock|BottleneckBlock): block module of model.
-        depth (int, optional): layers of resnet, Default: 50.
+        depth (int, optional): layers of ResNet, Default: 50.
         width (int, optional): base width per convolution group for each convolution block, Default: 64.
         num_classes (int, optional): output dim of last fc layer. If num_classes <=0, last fc layer
             will not be defined. Default: 1000.
         with_pool (bool, optional): use pool before the last fc layer or not. Default: True.
         groups (int, optional): number of groups for each convolution block, Default: 1.

+    Returns:
+        ResNet model. An instance of :ref:`api_fluid_dygraph_Layer`.
+
     Examples:
         .. code-block:: python
@@ -330,7 +333,11 @@ def resnet18(pretrained=False, **kwargs):
     `"Deep Residual Learning for Image Recognition" `_

     Args:
-        pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.
+        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
+            on ImageNet. Default: False.
+
+    Returns:
+        ResNet 18-layer model. An instance of :ref:`api_fluid_dygraph_Layer`.

     Examples:
         .. code-block:: python
@@ -358,7 +365,11 @@ def resnet34(pretrained=False, **kwargs):
     `"Deep Residual Learning for Image Recognition" `_

     Args:
-        pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.
+        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
+            on ImageNet. Default: False.
+
+    Returns:
+        ResNet 34-layer model. An instance of :ref:`api_fluid_dygraph_Layer`.

     Examples:
         .. code-block:: python
@@ -386,7 +397,11 @@ def resnet50(pretrained=False, **kwargs):
     `"Deep Residual Learning for Image Recognition" `_

     Args:
-        pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.
+        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
+            on ImageNet. Default: False.
+
+    Returns:
+        ResNet 50-layer model. An instance of :ref:`api_fluid_dygraph_Layer`.

     Examples:
         .. code-block:: python
@@ -414,7 +429,11 @@ def resnet101(pretrained=False, **kwargs):
     `"Deep Residual Learning for Image Recognition" `_

     Args:
-        pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.
+        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
+            on ImageNet. Default: False.
+
+    Returns:
+        ResNet 101-layer model. An instance of :ref:`api_fluid_dygraph_Layer`.

     Examples:
         .. code-block:: python
@@ -442,7 +461,11 @@ def resnet152(pretrained=False, **kwargs):
     `"Deep Residual Learning for Image Recognition" `_

     Args:
-        pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.
+        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
+            on ImageNet. Default: False.
+
+    Returns:
+        ResNet 152-layer model. An instance of :ref:`api_fluid_dygraph_Layer`.

     Examples:
         .. code-block:: python
@@ -470,7 +493,11 @@ def resnext50_32x4d(pretrained=False, **kwargs):
     `"Aggregated Residual Transformations for Deep Neural Networks" `_

     Args:
-        pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.
+        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
+            on ImageNet. Default: False.
+
+    Returns:
+        ResNeXt-50 32x4d model. An instance of :ref:`api_fluid_dygraph_Layer`.

     Examples:
         .. code-block:: python
@@ -500,7 +527,11 @@ def resnext50_64x4d(pretrained=False, **kwargs):
     `"Aggregated Residual Transformations for Deep Neural Networks" `_

     Args:
-        pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.
+        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
+            on ImageNet. Default: False.
+
+    Returns:
+        ResNeXt-50 64x4d model. An instance of :ref:`api_fluid_dygraph_Layer`.

     Examples:
         .. code-block:: python
@@ -530,7 +561,11 @@ def resnext101_32x4d(pretrained=False, **kwargs):
     `"Aggregated Residual Transformations for Deep Neural Networks" `_

     Args:
-        pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.
+        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
+            on ImageNet. Default: False.
+
+    Returns:
+        ResNeXt-101 32x4d model. An instance of :ref:`api_fluid_dygraph_Layer`.

     Examples:
         .. code-block:: python
@@ -561,7 +596,11 @@ def resnext101_64x4d(pretrained=False, **kwargs):
     `"Aggregated Residual Transformations for Deep Neural Networks" `_

     Args:
-        pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.
+        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
+            on ImageNet. Default: False.
+
+    Returns:
+        ResNeXt-101 64x4d model. An instance of :ref:`api_fluid_dygraph_Layer`.

     Examples:
         .. code-block:: python
@@ -592,7 +631,11 @@ def resnext152_32x4d(pretrained=False, **kwargs):
     `"Aggregated Residual Transformations for Deep Neural Networks" `_

     Args:
-        pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.
+        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
+            on ImageNet. Default: False.
+
+    Returns:
+        ResNeXt-152 32x4d model. An instance of :ref:`api_fluid_dygraph_Layer`.

     Examples:
         .. code-block:: python
@@ -623,7 +666,11 @@ def resnext152_64x4d(pretrained=False, **kwargs):
     `"Aggregated Residual Transformations for Deep Neural Networks" `_

     Args:
-        pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.
+        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
+            on ImageNet. Default: False.
+
+    Returns:
+        ResNeXt-152 64x4d model. An instance of :ref:`api_fluid_dygraph_Layer`.

     Examples:
         .. code-block:: python
@@ -654,7 +701,11 @@ def wide_resnet50_2(pretrained=False, **kwargs):
     `"Wide Residual Networks" `_.

     Args:
-        pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.
+        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
+            on ImageNet. Default: False.
+
+    Returns:
+        Wide ResNet-50-2 model. An instance of :ref:`api_fluid_dygraph_Layer`.

     Examples:
         .. code-block:: python
@@ -683,7 +734,11 @@ def wide_resnet101_2(pretrained=False, **kwargs):
     `"Wide Residual Networks" `_.

     Args:
-        pretrained (bool, optional): If True, returns a model pre-trained on ImageNet. Default: False.
+        pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained
+            on ImageNet. Default: False.
+
+    Returns:
+        Wide ResNet-101-2 model. An instance of :ref:`api_fluid_dygraph_Layer`.

     Examples:
         .. code-block:: python
diff --git a/python/setup.py.in b/python/setup.py.in
index bb6416038f198..8b6a456865176 100755
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -614,6 +614,8 @@ headers = (
     list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/core', recursive=True)) +  # phi core headers
     list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/infermeta', recursive=True)) +  # phi infermeta headers
     list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/kernels', recursive=True)) +  # phi kernels headers
+    # capi headers
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/capi', recursive=True)) +  # phi capi headers
     # utila api headers
     list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/utils', recursive=True)))  # paddle utils headers
diff --git a/tools/ci_op_benchmark.sh b/tools/ci_op_benchmark.sh
index 8e84eccc083f2..17cfb5923cf97 100644
--- a/tools/ci_op_benchmark.sh
+++ b/tools/ci_op_benchmark.sh
@@ -266,7 +266,7 @@ function check_CHANGE_OP_MAP {
   done
   if [ $exit_code -ne 0 ]; then
     LOG "[INFO] See https://github.com/PaddlePaddle/Paddle/wiki/PR-CI-OP-benchmark-Manual for details."
-    LOG "[INFO] Or you can apply for one RD (Avin0323(Recommend), Xreki, luotao1) approval to pass this PR."
+    LOG "[INFO] Or you can apply for one RD (ZzSean(Recommend), JamesLim-sy, Xreki, luotao1) approval to pass this PR."
     exit ${exit_code}
   fi
 }
@@ -305,11 +305,11 @@ function gpu_op_benchmark {

 # The PR will pass quickly when get approval from specific person.
-# Xreki 12538138, luotao1 6836917, ZzSean 32410583
+# Xreki 12538138, luotao1 6836917, ZzSean 32410583, JamesLim-sy 61349199
 set +x
 approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000)
 if [ -n "${approval_line}" ]; then
-  APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32410583 12538138 6836917)
+  APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32410583 12538138 6836917 61349199)
   LOG "[INFO] current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
   if [ "${APPROVALS}" == "TRUE" ]; then
     LOG "[INFO] ==================================="
diff --git a/tools/test_ci_op_benchmark.sh b/tools/test_ci_op_benchmark.sh
index bf70d8bc3a495..605ed72a21642 100644
--- a/tools/test_ci_op_benchmark.sh
+++ b/tools/test_ci_op_benchmark.sh
@@ -273,7 +273,7 @@ function check_CHANGE_OP_MAP {
   done
   if [ $exit_code -ne 0 ]; then
     LOG "[INFO] See https://github.com/PaddlePaddle/Paddle/wiki/PR-CI-OP-benchmark-Manual for details."
-    LOG "[INFO] Or you can apply for one RD (ZzSean(Recommend), Xreki, luotao1) approval to pass this PR."
+    LOG "[INFO] Or you can apply for one RD (ZzSean(Recommend), JamesLim-sy, Xreki, luotao1) approval to pass this PR."
     exit $exit_code
   fi
 }
@@ -317,11 +317,11 @@ function gpu_op_benchmark {
 }

 # The PR will pass quickly when get approval from specific person.
-# Xreki 12538138, luotao1 6836917, ZzSean 32410583
+# Xreki 12538138, luotao1 6836917, ZzSean 32410583, JamesLim-sy 61349199
 set +x
 approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000)
 if [ -n "${approval_line}" ]; then
-  APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32410583 12538138 6836917)
+  APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32410583 12538138 6836917 61349199)
   LOG "[INFO] current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
   if [ "${APPROVALS}" == "TRUE" ]; then
     LOG "[INFO] ==================================="
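
Reviewer note: as a quick sanity check on the `triplet_margin_loss` functional API added in this diff, the sketch below recomputes the documented formula by hand on the docstring's example tensors and compares it with the library call. This is a minimal sketch, not part of the patch; the use of `paddle.linalg.norm` as the reference distance and the `1e-4` tolerance are assumptions made for illustration.

    .. code-block:: python

        import paddle
        import paddle.nn.functional as F

        # Anchor/positive/negative examples, taken from the docstring above.
        input = paddle.to_tensor([[1, 5, 3], [0, 3, 2], [1, 4, 1]], dtype=paddle.float32)
        positive = paddle.to_tensor([[5, 1, 2], [3, 2, 1], [3, -1, 1]], dtype=paddle.float32)
        negative = paddle.to_tensor([[2, 1, -3], [1, 1, -1], [4, -2, 1]], dtype=paddle.float32)

        # Hand-rolled version of the documented formula:
        #     L = max(d(a, p) - d(a, n) + margin, 0),  with  d(x, y) = ||x - y||_p
        margin, p = 1.0, 2
        d_pos = paddle.linalg.norm(input - positive, p=p, axis=1)
        d_neg = paddle.linalg.norm(input - negative, p=p, axis=1)
        expected = paddle.clip(d_pos - d_neg + margin, min=0.0)

        # The functional API introduced by this diff.
        actual = F.triplet_margin_loss(input, positive, negative,
                                       margin=margin, p=p, reduction='none')

        # PairwiseDistance adds a small epsilon inside the norm, so compare with
        # a loose tolerance (the 1e-4 here is an arbitrary choice).
        assert paddle.allclose(actual, expected, atol=1e-4).item()
        print(actual.numpy())  # approximately [0., 0.57496738, 0.]

Passing `swap=True` additionally replaces `d(a, n)` with `min(d(a, n), d(p, n))`, which is what the `paddle.minimum(negative_dist, swap_dist)` branch in the implementation computes.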