From 35ca1ce4ba8229f397030921edbcdeedf0327b00 Mon Sep 17 00:00:00 2001
From: helen88 <z8hanghuan@126.com>
Date: Wed, 27 Jul 2022 09:51:39 +0800
Subject: [PATCH 01/28] fix bug of elementwise_add_grad, *test=kunlun (#44545)

* fix bug of elementwise_add_grad, *test=kunlun

* fix bug, *test=kunlun

* rm pooling_t, *test=kunlun

* fix bug of ew_add_grad when inplace, *test=kunlun
---
 cmake/external/xpu.cmake                        |  4 ++--
 .../elementwise/elementwise_add_op_xpu.cc       |  7 ++-----
 paddle/fluid/operators/pool_op_xpu.cc           | 17 -----------------
 3 files changed, 4 insertions(+), 24 deletions(-)
diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index ad4471071e1fc..8021d2a6f80c6 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so")
 if(NOT DEFINED XPU_BASE_URL)
   set(XPU_BASE_URL_WITHOUT_DATE
       "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220719")
+  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220722")
 else()
   set(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
@@ -19,7 +19,7 @@ endif()
 if(NOT DEFINED XPU_XDNN_BASE_URL)
   set(XPU_XDNN_BASE_URL_WITHOUT_DATE
       "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev")
-  set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220719")
+  set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220722")
 else()
   set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}")
 endif()
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc
index 74dca5b57481f..7b2c72d081262 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc
@@ -1,11 +1,8 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -74,7 +71,7 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel<T> {
         int ret =
             xpu::reduce_sum<XPUType>(dev_ctx.x_context(),
                                      reinterpret_cast<const XPUType*>(dz_data),
-                                     reinterpret_cast<XPUType*>(dx_data),
+                                     reinterpret_cast<XPUType*>(dx->data<T>()),
                                      dz_vector,
                                      reduce_dims);
         PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reduce_sum");
diff --git a/paddle/fluid/operators/pool_op_xpu.cc b/paddle/fluid/operators/pool_op_xpu.cc
index 591559001309a..bfa638e5bde5d 100644
--- a/paddle/fluid/operators/pool_op_xpu.cc
+++ b/paddle/fluid/operators/pool_op_xpu.cc
@@ -21,23 +21,6 @@ namespace operators {
 
 using framework::Tensor;
 
-xpu::Pooling_t XPUPoolingType(const std::string& pooltype,
-                              bool exclusive,
-                              bool is_test) {
-  if (pooltype == "max") {
-    return xpu::Pooling_t::MAX_WITHOUT_INDEX;
-  } else if (pooltype == "avg") {
-    if (exclusive) {
-      return xpu::Pooling_t::AVG_WITHOUT_PAD;
-    } else {
-      return xpu::Pooling_t::AVG_WITH_PAD;
-    }
-  } else {
-    PADDLE_THROW(platform::errors::InvalidArgument(
-        "Pool op only supports 2D and 3D input."));
-  }
-}
-
 template <typename DeviceContext, typename T>
 class PoolXPUKernel : public framework::OpKernel<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;

From 42d58ddd4992d2296aca9f54ff067602cc4d8734 Mon Sep 17 00:00:00 2001
From: Allen Guo <alleng@graphcore.ai>
Date: Wed, 27 Jul 2022 09:56:29 +0800
Subject: [PATCH 02/28] [IPU] small bug fix (#44473)

* sync misc changes

* add authors

Co-authored-by: Zhaorui Chen <zhaoruic@graphcore.ai>

* up x

* Revert "up x"

This reverts commit f3fde458c6cc48613269a643cfe2acf689caccd3.

* add guarg for ipu

Co-authored-by: Zhaorui Chen <zhaoruic@graphcore.ai>
---
 .../fluid/platform/device/ipu/ipu_compiler.cc |  2 +-
 .../fluid/platform/device/ipu/ipu_strategy.h  |  8 +--
 .../ipu/popart_canonicalization/loss_ops.cc   |  6 +-
 .../ipu/popart_canonicalization/other_ops.cc  | 33 +++++++++
 .../device/ipu/supported_ops_autogen.h        | 17 ++++-
 .../device/ipu/supported_ops_custom.h         |  1 -
 paddle/fluid/pybind/tensor.cc                 |  5 ++
 python/paddle/fluid/executor.py               |  6 +-
 .../tests/unittests/ipu/test_print_op_ipu.py  | 71 ++++++++++++++++++-
 9 files changed, 137 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc
index a113bbbe26579..2bebdd51280ce 100644
--- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc
+++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc
@@ -300,7 +300,7 @@ void Compiler::RegisterOpFunc() {
 #define INT32 std::int32_t
 #define BOOL bool
 #define STRING std::string
-#define STRING_VEC std::vector<std::string*>
+#define STRING_VEC std::vector<std::string>
 #define NONE
 
 #define ARG(Type, Name) , GetAttrAllowNull<Type>(#Name, op_desc)
diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h
index 997bc310df308..34beb44686b6d 100644
--- a/paddle/fluid/platform/device/ipu/ipu_strategy.h
+++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h
@@ -159,8 +159,8 @@ class IpuStrategy {
       const std::string &type_str) {
     auto it = options.find(key);
     PADDLE_ENFORCE_NE(
-        it,
-        options.end(),
+        it == options.end(),
+        true,
         platform::errors::InvalidArgument("Cannot find option: %s, type: %s "
                                           "when setting IpuStrategy options",
                                           key,
@@ -174,8 +174,8 @@ class IpuStrategy {
       std::map<std::string, std::function<ValueType()>> &options) {  // NOLINT
     auto it = options.find(key);
     PADDLE_ENFORCE_NE(
-        it,
-        options.end(),
+        it == options.end(),
+        true,
         platform::errors::InvalidArgument(
             "Cannot find option name: %s when trying to get IpuStrategy option",
             key));
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc
index aa4c3638868d2..035b15b2770a7 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc
@@ -285,7 +285,7 @@ Node *binary_cross_entropy_handler(Graph *graph, Node *node) {
     reduction = RemoveTailReduction(graph, node, "Out");
   }
   bool append_identity_loss =
-      is_dynamic_graph() && IsLastVarNode(GetOutputVarNode("Loss", node));
+      is_dynamic_graph() && IsLastVarNode(GetOutputVarNode("Out", node));
 
   auto x = GetInputVarNode("X", node);
   auto label = GetInputVarNode("Label", node);
@@ -478,12 +478,12 @@ Node *warpctc_handler(Graph *graph, Node *node) {
   auto loss = CreateBaseOp(
       graph,
       node,
-      "popart_ctcloss",
+      "popart_ctcloss_v2",
       {log_softmax_logits, cast_label, cast_logits_length, cast_label_length},
       append_identity_loss
           ? std::vector<Node *>{}
           : std::vector<Node *>{GetOutputVarNode("Loss", node)},
-      {{"blank", blank},
+      {{"blank", int64_t{blank}},
        {"reduction", reduction},
        {"outDataType", std::string("UNDEFINED")}});
   if (append_identity_loss) {
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc
index 1cbe9eb466382..410bfafaca16e 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc
@@ -32,6 +32,39 @@ Node *custom_op_handler(Graph *graph, Node *node) {
 
 Node *print_handler(Graph *graph, Node *node) {
   auto *op = node->Op();
+  auto print_output = node->outputs.front();
+  auto print_input = node->inputs.front();
+  if (print_output->outputs.size() == 0) {
+    LOG(WARNING) << "The output of Print OP is not used on IPU graph. Setting "
+                    "the input of Print as Output.";
+    for (auto &subnode : print_input->outputs) {
+      if (subnode == node) continue;
+      ConnectNodes(print_output, subnode);
+      DisConnectNodes(print_input, subnode);
+
+      // replace node_name in op_desc
+      std::vector<std::string> new_inputs;
+      auto subnode_inmap = subnode->Op()->Inputs();
+      for (auto &in_map : subnode_inmap) {
+        if (std::find(in_map.second.begin(),
+                      in_map.second.end(),
+                      print_input->Name()) != in_map.second.end()) {
+          std::transform(in_map.second.cbegin(),
+                         in_map.second.cend(),
+                         std::back_inserter(new_inputs),
+                         [&](const std::string &node_name) {
+                           if (node_name == print_input->Name()) {
+                             return print_output->Name();
+                           } else {
+                             return node_name;
+                           }
+                         });
+          subnode->Op()->SetInput(in_map.first, new_inputs);
+          subnode->Op()->Flush();
+        }
+      }
+    }
+  }
   auto print_phase = PADDLE_GET_CONST(std::string, op->GetAttr("print_phase"));
   int64_t print_gradient = 0;
   if (print_phase != "forward") {
diff --git a/paddle/fluid/platform/device/ipu/supported_ops_autogen.h b/paddle/fluid/platform/device/ipu/supported_ops_autogen.h
index 14dcf65afeefd..017ed64893087 100644
--- a/paddle/fluid/platform/device/ipu/supported_ops_autogen.h
+++ b/paddle/fluid/platform/device/ipu/supported_ops_autogen.h
@@ -17,11 +17,13 @@
 #pragma once
 
 // Ops from AiGraphcoreOpset1
+OP_DECL(popart_copyvarupdate_v2, aiGraphcoreOpset.copyvarupdate, NONE) // NOLINT
 OP_DECL(popart_groupnormalization_v2, aiGraphcoreOpset.groupnormalization, ARG(INT,num_groups) ARG(FLOAT,epsilon) ) // NOLINT
 OP_DECL(popart_subsample_v2, aiGraphcoreOpset.subsample, ARG(INT_VEC,strides) ) // NOLINT
 OP_DECL(popart_nop_v2, aiGraphcoreOpset.nop, NONE) // NOLINT
 OP_DECL(popart_scale_v2, aiGraphcoreOpset.scale, ARG(FLOAT,scale) ) // NOLINT
 OP_DECL(popart_scaledadd_v2, aiGraphcoreOpset.scaledadd, ARG(FLOAT,scale0) ARG(FLOAT,scale1) ) // NOLINT
+OP_DECL(popart_lstm_v2, aiGraphcoreOpset.lstm, ARG(INT,outputFullSequence) ) // NOLINT
 OP_DECL(popart_gelu_v2, aiGraphcoreOpset.gelu, NONE) // NOLINT
 OP_DECL(popart_detach_v2, aiGraphcoreOpset.detach, NONE) // NOLINT
 OP_DECL(popart_depthtospace_v2, aiGraphcoreOpset.depthtospace, ARG(INT,blocksize) ARG(STRING,mode) ) // NOLINT
@@ -32,8 +34,13 @@ OP_DECL(popart_dynamiczero_v2, aiGraphcoreOpset.dynamiczero, ARG(INT_VEC,axes) A
 OP_DECL(popart_dynamicadd_v2, aiGraphcoreOpset.dynamicadd, ARG(INT_VEC,axes) ARG(INT_VEC,sizes) ) // NOLINT
 OP_DECL(popart_sequenceslice_v2, aiGraphcoreOpset.sequenceslice, ARG(INT,zeroUnused) ) // NOLINT
 OP_DECL(popart_replicatedallreduce_v2, aiGraphcoreOpset.replicatedallreduce, OPT_ARG(INT_VEC,commGroup) ) // NOLINT
+OP_DECL(popart_l1loss_v2, aiGraphcoreOpset.l1loss, ARG(FLOAT,lambda) SIG_ARG(INT32,popart::ReductionType,reduction) ) // NOLINT
+OP_DECL(popart_nllloss_v2, aiGraphcoreOpset.nllloss, SIG_ARG(INT32,popart::ReductionType,reduction) OPT_ARG(INT32,ignoreIndex) ARG(BOOL,inputIsLogProbability) ) // NOLINT
+OP_DECL(popart_identityloss_v2, aiGraphcoreOpset.identityloss, SIG_ARG(INT32,popart::ReductionType,reduction) ) // NOLINT
+OP_DECL(popart_tensorremap_v2, aiGraphcoreOpset.tensorremap, ARG(INT,remap_type) ) // NOLINT
+OP_DECL(popart_ctcloss_v2, aiGraphcoreOpset.ctcloss, SIG_ARG(INT32,popart::ReductionType,reduction) ARG(INT,blank) ARG(STRING,outDataType) ) // NOLINT
+OP_DECL(popart__ctcloss_v2, aiGraphcoreOpset._ctcloss, SIG_ARG(INT32,popart::ReductionType,reduction) ARG(INT,blank) ARG(STRING,outDataType) ) // NOLINT
 OP_DECL(popart_ctcbeamsearchdecoder_v2, aiGraphcoreOpset.ctcbeamsearchdecoder, ARG(INT,blank) ARG(INT,beamWidth) ARG(INT,topPaths) ) // NOLINT
-OP_DECL(popart_ctcloss, aiGraphcoreOpset.ctcloss, SIG_ARG(INT32,popart::ReductionType,reduction) ARG(INT32,blank) ARG(STRING,outDataType) ) // NOLINT
 OP_DECL(popart_shapeddropout_v2, aiGraphcoreOpset.shapeddropout, ARG(INT_VEC,shape) ARG(FLOAT,ratio) ) // NOLINT
 OP_DECL(popart_atan2_v2, aiGraphcoreOpset.atan2, NONE) // NOLINT
 OP_DECL(popart_expm1_v2, aiGraphcoreOpset.expm1, NONE) // NOLINT
@@ -47,6 +54,9 @@ OP_DECL(popart_bitwiseor_v2, aiGraphcoreOpset.bitwiseor, NONE) // NOLINT
 OP_DECL(popart_bitwisexor_v2, aiGraphcoreOpset.bitwisexor, NONE) // NOLINT
 OP_DECL(popart_bitwisexnor_v2, aiGraphcoreOpset.bitwisexnor, NONE) // NOLINT
 OP_DECL(popart_reducemedian_v2, aiGraphcoreOpset.reducemedian, OPT_ARG(INT_VEC,axes) ARG(INT,keepdims) ) // NOLINT
+OP_DECL(popart_scatterreduce_v2, aiGraphcoreOpset.scatterreduce, ARG(INT,axis_size) ARG(INT,axis) SIG_ARG(INT32,popart::ScatterReduction,reduction) ) // NOLINT
+OP_DECL(popart_swish_v2, aiGraphcoreOpset.swish, NONE) // NOLINT
+OP_DECL(popart_incrementmod_v2, aiGraphcoreOpset.incrementmod, ARG(FLOAT,increment) ARG(FLOAT,modulus) ) // NOLINT
 // Ops from AiOnnxOpset11
 OP_DECL(popart_argmax, aiOnnxOpset.argmax, ARG(INT,axis) ARG(INT,keepdims) ) // NOLINT
 OP_DECL(popart_argmin, aiOnnxOpset.argmin, ARG(INT,axis) ARG(INT,keepdims) ) // NOLINT
@@ -117,6 +127,7 @@ OP_DECL(popart_qlinearmatmul, aiOnnxOpset.qlinearmatmul, NONE) // NOLINT
 OP_DECL(popart_quantizelinear, aiOnnxOpset.quantizelinear, NONE) // NOLINT
 OP_DECL(popart_reversesequence, aiOnnxOpset.reversesequence, ARG(INT,batch_axis) ARG(INT,time_axis) ) // NOLINT
 OP_DECL(popart_roialign, aiOnnxOpset.roialign, ARG(STRING,mode) ARG(INT,output_height) ARG(INT,output_width) ARG(INT,sampling_ratio) ARG(FLOAT,spatial_scale) ) // NOLINT
+OP_DECL(popart_stringnormalizer, aiOnnxOpset.stringnormalizer, ARG(STRING,case_change_action) ARG(INT,is_case_sensitive) OPT_ARG(STRING,locale) ARG(STRING_VEC,stopwords) ) // NOLINT
 OP_DECL(popart_thresholdedrelu, aiOnnxOpset.thresholdedrelu, ARG(FLOAT,alpha) ) // NOLINT
 OP_DECL(popart_upsample, aiOnnxOpset.upsample, ARG(STRING,mode) ) // NOLINT
 // Ops from AiOnnxOpset9
@@ -138,6 +149,7 @@ OP_DECL(popart_prelu, aiOnnxOpset.prelu, NONE) // NOLINT
 OP_DECL(popart_shrink, aiOnnxOpset.shrink, ARG(FLOAT,bias) ARG(FLOAT,lambd) ) // NOLINT
 OP_DECL(popart_sign, aiOnnxOpset.sign, NONE) // NOLINT
 OP_DECL(popart_sinh, aiOnnxOpset.sinh, NONE) // NOLINT
+OP_DECL(popart_tfidfvectorizer, aiOnnxOpset.tfidfvectorizer, ARG(INT,max_gram_length) ARG(INT,max_skip_count) ARG(INT,min_gram_length) ARG(STRING,mode) ARG(INT_VEC,ngram_counts) ARG(INT_VEC,ngram_indexes) ARG(INT_VEC,pool_int64s) ARG(STRING_VEC,pool_strings) ARG(FLOAT_VEC,weights) ) // NOLINT
 OP_DECL(popart_where, aiOnnxOpset.where, NONE) // NOLINT
 // Ops from AiOnnxOpset8
 OP_DECL(popart_expand, aiOnnxOpset.expand, NONE) // NOLINT
@@ -153,10 +165,13 @@ OP_DECL(popart_asin, aiOnnxOpset.asin, NONE) // NOLINT
 OP_DECL(popart_atan, aiOnnxOpset.atan, NONE) // NOLINT
 OP_DECL(popart_cos, aiOnnxOpset.cos, NONE) // NOLINT
 OP_DECL(popart_div, aiOnnxOpset.div, NONE) // NOLINT
+OP_DECL(popart_gru, aiOnnxOpset.gru, ARG(INT,num_outputs) ARG(FLOAT_VEC,activation_alpha) ARG(FLOAT_VEC,activation_beta) ARG(STRING_VEC,activations) OPT_ARG(FLOAT,clip) ARG(STRING,direction) OPT_ARG(INT,hidden_size) ARG(INT,linear_before_reset) ) // NOLINT
+OP_DECL(popart_lstm, aiOnnxOpset.lstm, ARG(INT,num_outputs) ARG(FLOAT_VEC,activation_alpha) ARG(FLOAT_VEC,activation_beta) ARG(STRING_VEC,activations) OPT_ARG(FLOAT,clip) ARG(STRING,direction) OPT_ARG(INT,hidden_size) ARG(INT,input_forget) ) // NOLINT
 OP_DECL(popart_mul, aiOnnxOpset.mul, NONE) // NOLINT
 OP_DECL(popart_multinomial, aiOnnxOpset.multinomial, ARG(INT,dtype) ARG(INT,sample_size) OPT_ARG(FLOAT,seed) ) // NOLINT
 OP_DECL(popart_logical_or, aiOnnxOpset.logical_or, NONE) // NOLINT
 OP_DECL(popart_pow, aiOnnxOpset.pow, NONE) // NOLINT
+OP_DECL(popart_rnn, aiOnnxOpset.rnn, ARG(INT,num_outputs) ARG(FLOAT_VEC,activation_alpha) ARG(FLOAT_VEC,activation_beta) ARG(STRING_VEC,activations) OPT_ARG(FLOAT,clip) ARG(STRING,direction) OPT_ARG(INT,hidden_size) ) // NOLINT
 OP_DECL(popart_sin, aiOnnxOpset.sin, NONE) // NOLINT
 OP_DECL(popart_sub, aiOnnxOpset.sub, NONE) // NOLINT
 OP_DECL(popart_tan, aiOnnxOpset.tan, NONE) // NOLINT
diff --git a/paddle/fluid/platform/device/ipu/supported_ops_custom.h b/paddle/fluid/platform/device/ipu/supported_ops_custom.h
index 04c57cc0104de..9a69567aa6dbc 100644
--- a/paddle/fluid/platform/device/ipu/supported_ops_custom.h
+++ b/paddle/fluid/platform/device/ipu/supported_ops_custom.h
@@ -16,7 +16,6 @@
 
 #pragma once
 
-OP_DECL(popart_nllloss_v2, aiGraphcoreOpset.nllloss, SIG_ARG(INT32,popart::ReductionType,reduction) OPT_ARG(INT32,ignoreIndex) ARG(BOOL,inputIsLogProbability) ) // NOLINT
 OP_DECL(popart_identity_loss, aiGraphcoreOpset.identityloss, SIG_ARG(INT32,popart::ReductionType,reduction) ) // NOLINT
 
 // clang-format on
diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc
index 660d9badc302a..8a6a6782f443a 100644
--- a/paddle/fluid/pybind/tensor.cc
+++ b/paddle/fluid/pybind/tensor.cc
@@ -378,6 +378,11 @@ void BindTensor(pybind11::module &m) {  // NOLINT
            py::arg("tensor"),
            py::arg("place"),
            py::arg("batch_size") = -1)
+      .def("_copy_from",
+           &TensorCopyFrom<paddle::platform::IPUPlace>,
+           py::arg("tensor"),
+           py::arg("place"),
+           py::arg("batch_size") = -1)
       .def("_copy_from",
            &TensorCopyFrom<paddle::platform::Place>,
            py::arg("tensor"),
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 9961c1a106bdd..e9e2252f9065c 100755
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -1486,7 +1486,11 @@ def _can_use_interpreter_core(program, place):
                     # NOTE(dev): `set` always call TensorCopySync that is a
                     # blocking behavior. So we use `_copy_from` to replace it.
                     cpu_tensor = _as_lodtensor(data, core.CPUPlace())
-                    tensor._copy_from(cpu_tensor, self.place)
+                    # for ipu, tensor is allocated on cpu
+                    if core.is_compiled_with_ipu():
+                        tensor._copy_from(cpu_tensor, tensor._place())
+                    else:
+                        tensor._copy_from(cpu_tensor, self.place)
 
                 return new_exe.run(scope, list(feed.keys()), fetch_list,
                                    return_numpy)
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py
index a2da444519d29..33f4a331611ef 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py
@@ -17,7 +17,8 @@
 import numpy as np
 import paddle
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest
+from paddle.jit import to_static
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, IPUD2STest
 
 
 class TestBase(IPUOpTest):
@@ -106,5 +107,73 @@ def set_op_attrs(self):
         }
 
 
+class SimpleLayer(paddle.nn.Layer):
+
+    def __init__(self):
+        super(SimpleLayer, self).__init__()
+        self.conv = paddle.nn.Conv2D(in_channels=3,
+                                     out_channels=1,
+                                     kernel_size=2,
+                                     stride=1)
+
+    @to_static()
+    def forward(self, x, target=None):
+        x = self.conv(x)
+        print(x)
+        x = paddle.fluid.layers.flatten(x, axis=1)
+        if target is not None:
+            x = paddle.fluid.layers.softmax(x)
+            loss = paddle.fluid.layers.cross_entropy(x, target)
+            loss = paddle.incubate.identity_loss(loss, 1)
+            return x, loss
+        return x
+
+
+class TestD2S(IPUD2STest):
+
+    def setUp(self):
+        self.set_data_feed()
+
+    def set_data_feed(self):
+        self.data = paddle.uniform((8, 3, 10, 10), dtype='float32')
+        self.label = paddle.randint(0, 10, shape=[8], dtype='int64')
+
+    def _test(self, use_ipu=False):
+        paddle.seed(self.SEED)
+        np.random.seed(self.SEED)
+        model = SimpleLayer()
+        optim = paddle.optimizer.Adam(learning_rate=0.01,
+                                      parameters=model.parameters())
+
+        if use_ipu:
+            paddle.set_device('ipu')
+            ipu_strategy = paddle.static.IpuStrategy()
+            ipu_strategy.set_graph_config(num_ipus=1,
+                                          is_training=True,
+                                          micro_batch_size=1,
+                                          enable_manual_shard=False)
+            ipu_strategy.set_optimizer(optim)
+
+        result = []
+        for _ in range(2):
+            # ipu only needs call model() to do forward/backward/grad_update
+            pred, loss = model(self.data, self.label)
+            if not use_ipu:
+                loss.backward()
+                optim.step()
+                optim.clear_grad()
+            result.append(loss)
+
+        if use_ipu:
+            ipu_strategy.release_patch()
+
+        return np.array(result)
+
+    def test_training(self):
+        ipu_loss = self._test(True).flatten()
+        cpu_loss = self._test(False).flatten()
+        self.assertTrue(np.allclose(ipu_loss, cpu_loss, atol=1e-4))
+
+
 if __name__ == "__main__":
     unittest.main()

From 15c0c9d299bcfd7d871ef5966418f252153d62e1 Mon Sep 17 00:00:00 2001
From: zyfncg <zhangyunfei07@baidu.com>
Date: Wed, 27 Jul 2022 10:24:53 +0800
Subject: [PATCH 03/28] support auto fallback to  cpu kernel for cusom device
 (#44639)

---
 paddle/phi/core/kernel_factory.cc | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc
index d6f6e60fe2d3d..ae9c16e0cc710 100644
--- a/paddle/phi/core/kernel_factory.cc
+++ b/paddle/phi/core/kernel_factory.cc
@@ -110,6 +110,7 @@ const Kernel& KernelFactory::SelectKernelOrThrowError(
                  << "] is not registered.";
   }
 #endif
+
   auto kernel_iter = iter->second.find(kernel_key);
   // TODO(chenweihang): polish refind impl here
   if (kernel_iter == iter->second.end() &&
@@ -118,6 +119,22 @@ const Kernel& KernelFactory::SelectKernelOrThrowError(
         kernel_key.backend(), phi::DataLayout::ALL_LAYOUT, kernel_key.dtype());
     kernel_iter = iter->second.find(any_layout_kernel_key);
   }
+
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+  if (kernel_iter == iter->second.end()) {
+    // Fallback CPU backend
+    phi::KernelKey cpu_kernel_key(
+        phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
+    kernel_iter = iter->second.find(cpu_kernel_key);
+    if (kernel_iter == iter->second.end() &&
+        kernel_key.layout() != phi::DataLayout::ALL_LAYOUT) {
+      phi::KernelKey any_layout_kernel_key(
+          phi::Backend::CPU, phi::DataLayout::ALL_LAYOUT, kernel_key.dtype());
+      kernel_iter = iter->second.find(any_layout_kernel_key);
+    }
+  }
+#endif
+
   PADDLE_ENFORCE_NE(
       kernel_iter,
       iter->second.end(),

From a71cfd8c4116d9cb74b942775e63f321b5c3b9fa Mon Sep 17 00:00:00 2001
From: Li Min <11663212+limin2021@users.noreply.github.com>
Date: Wed, 27 Jul 2022 10:32:05 +0800
Subject: [PATCH 04/28] fix dygraph bugs in broadcast_to api. (#44612)

---
 python/paddle/tensor/manipulation.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 05823efc3f1a3..f3b67cf743deb 100755
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -3000,8 +3000,10 @@ def broadcast_to(x, shape, name=None):
             print(out)
             # [[1, 2, 3], [1, 2, 3]]
     """
-    if paddle.in_dynamic_mode():
+    if in_dygraph_mode():
         return _C_ops.final_state_expand(x, shape)
+    if _in_legacy_dygraph():
+        return _C_ops.expand_v2(x, 'shape', shape)
 
     if isinstance(shape, Variable):
         assert len(shape.shape) == 1, ('shape must be an 1-D Tensor.')

From ce7c79953ed093088bdc68f4e593d3f1fd3068bd Mon Sep 17 00:00:00 2001
From: freeliuzc <lzc842650834@gmail.com>
Date: Wed, 27 Jul 2022 10:46:43 +0800
Subject: [PATCH 05/28] add set_dtype for inverse_op (#44618)

---
 paddle/phi/infermeta/backward.cc | 1 +
 paddle/phi/infermeta/unary.cc    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc
index 1eca092a5f22f..a33b9587c153c 100644
--- a/paddle/phi/infermeta/backward.cc
+++ b/paddle/phi/infermeta/backward.cc
@@ -419,6 +419,7 @@ void InverseGradInferMeta(const MetaTensor& out,
                           MetaTensor* dx) {
   if (dx) {
     dx->set_dims(dout.dims());
+    dx->set_dtype(out.dtype());
   }
 }
 
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index edc455225e4dc..c018e58a59a37 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -1059,6 +1059,7 @@ void InverseInferMeta(const MetaTensor& x, MetaTensor* out) {
   }
 
   out->set_dims(input_dims);
+  out->set_dtype(x.dtype());
   out->share_lod(x);
 }
 

From d62af8b463364ea6537380a7a14790253d6551d8 Mon Sep 17 00:00:00 2001
From: Leo Chen <chenqiuliang@baidu.com>
Date: Tue, 26 Jul 2022 21:59:14 -0500
Subject: [PATCH 06/28] refine overalls.cmake (#44623)

---
 cmake/coveralls.cmake | 33 +++++++++------------------------
 1 file changed, 9 insertions(+), 24 deletions(-)

diff --git a/cmake/coveralls.cmake b/cmake/coveralls.cmake
index 9c28903498729..e8263e48af3aa 100644
--- a/cmake/coveralls.cmake
+++ b/cmake/coveralls.cmake
@@ -75,36 +75,21 @@ if(WITH_COVERAGE)
         "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
   endif()
-  set(EXCLUDE_DIRS "demo/" "build/" "tests/" ".test_env/")
 
   if(WITH_GPU)
     file(
-      GLOB_RECURSE PADDLE_SOURCES
-      RELATIVE "${PROJECT_SOURCE_DIR}"
-      "*.cpp" "*.cc" ".c" "*.cu")
+      GLOB_RECURSE
+      PADDLE_SOURCES
+      "${PROJECT_SOURCE_DIR}/paddle/*.cpp"
+      "${PROJECT_SOURCE_DIR}/paddle/*.cc"
+      "${PROJECT_SOURCE_DIR}/paddle/*.cu"
+      "${PROJECT_SOURCE_DIR}/paddle/*.cu.cc"
+      "${PROJECT_SOURCE_DIR}/paddle/*.c")
   else()
-    file(
-      GLOB_RECURSE PADDLE_SOURCES
-      RELATIVE "${PROJECT_SOURCE_DIR}"
-      "*.cpp" "*.cc" "*.c")
+    file(GLOB_RECURSE PADDLE_SOURCES "${PROJECT_SOURCE_DIR}/paddle/*.cpp"
+         "${PROJECT_SOURCE_DIR}/paddle/*.cc" "${PROJECT_SOURCE_DIR}/paddle/*.c")
   endif()
 
-  # exclude trivial files in PADDLE_SOURCES
-  foreach(EXCLUDE_DIR ${EXCLUDE_DIRS})
-    foreach(TMP_PATH ${PADDLE_SOURCES})
-      string(FIND ${TMP_PATH} ${EXCLUDE_DIR} EXCLUDE_DIR_FOUND)
-      if(NOT ${EXCLUDE_DIR_FOUND} EQUAL -1)
-        list(REMOVE_ITEM PADDLE_SOURCES ${TMP_PATH})
-      endif()
-    endforeach()
-  endforeach()
-
-  # convert to absolute path
-  set(PADDLE_SRCS "")
-  foreach(PADDLE_SRC ${PADDLE_SOURCES})
-    set(PADDLE_SRCS "${PADDLE_SRCS};${PROJECT_SOURCE_DIR}/${PADDLE_SRC}")
-  endforeach()
-
   code_coverage("${PADDLE_SRCS}" ${COVERALLS_UPLOAD}
                 "${PROJECT_SOURCE_DIR}/cmake")
 endif()

From 122fff46f1b6182581ebb196446ff4db4a0d02bd Mon Sep 17 00:00:00 2001
From: BiynXu <62832681+BiynXu@users.noreply.github.com>
Date: Wed, 27 Jul 2022 11:16:02 +0800
Subject: [PATCH 07/28] [PHI]Add yaml and unittest for bmm op (#44625)

Add yaml and unittest for bmm op
---
 paddle/phi/api/yaml/legacy_api.yaml                | 10 ++++++++++
 paddle/phi/api/yaml/legacy_backward.yaml           |  9 +++++++++
 python/paddle/fluid/tests/unittests/test_bmm_op.py |  5 +++--
 python/paddle/tensor/linalg.py                     |  3 +++
 4 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml
index 81c2a4548d881..600f93683eff3 100644
--- a/paddle/phi/api/yaml/legacy_api.yaml
+++ b/paddle/phi/api/yaml/legacy_api.yaml
@@ -326,6 +326,16 @@
   kernel :
     func : bitwise_xor
 
+# bmm
+- api : bmm
+  args : (Tensor x, Tensor y)
+  output : Tensor
+  infer_meta :
+    func : BmmInferMeta
+  kernel :
+    func : bmm
+  backward : bmm_grad
+
 # box_coder
 - api : box_coder
   args : (Tensor prior_box, Tensor prior_box_var, Tensor target_box, str code_type, bool box_normalized, int axis, float[] variance)
diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml
index 47183fed746d0..310cf7c151ff2 100644
--- a/paddle/phi/api/yaml/legacy_backward.yaml
+++ b/paddle/phi/api/yaml/legacy_backward.yaml
@@ -260,6 +260,15 @@
   kernel :
     func : bilinear_tensor_product_grad
 
+- backward_api : bmm_grad
+  forward : bmm (Tensor x, Tensor y) -> Tensor(out)
+  args : (Tensor x, Tensor y, Tensor out_grad)
+  output : Tensor(x_grad), Tensor(y_grad)
+  infer_meta :
+    func : BmmGradInferMeta
+  kernel :
+    func : bmm_grad
+
 - backward_api : brelu_grad
   forward : brelu (Tensor x, float t_min, float t_max) -> Tensor(out)
   args : (Tensor x, Tensor out_grad, float t_min, float t_max)
diff --git a/python/paddle/fluid/tests/unittests/test_bmm_op.py b/python/paddle/fluid/tests/unittests/test_bmm_op.py
index b9a5853c492f5..5e5c41ae88279 100644
--- a/python/paddle/fluid/tests/unittests/test_bmm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bmm_op.py
@@ -27,6 +27,7 @@ class TestBmmOp(OpTest):
 
     def setUp(self):
         self.op_type = "bmm"
+        self.python_api = paddle.tensor.bmm
         X = np.random.random((10, 3, 4)).astype("float64")
         Y = np.random.random((10, 4, 5)).astype("float64")
         self.inputs = {'X': X, 'Y': Y}
@@ -34,10 +35,10 @@ def setUp(self):
         self.outputs = {'Out': Out}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=True)
 
     def test_checkout_grad(self):
-        self.check_grad(['X', 'Y'], 'Out')
+        self.check_grad(['X', 'Y'], 'Out', check_eager=True)
 
 
 class API_TestBmm(unittest.TestCase):
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index a77d6b5a2ad92..d1468765b5907 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -1521,6 +1521,9 @@ def bmm(x, y, name=None):
             "x's batch (shape[0]) must be equal with y's batch (shape[0]). But received x's shape: {}, y's shape: {}"
             .format(x_shape, y_shape))
 
+    if in_dygraph_mode():
+        return _C_ops.final_state_bmm(x, y)
+
     if paddle.in_dynamic_mode():
         return _C_ops.bmm(x, y)
 

From eafd4280cdb3b287daeb1e778aed3b84849d9699 Mon Sep 17 00:00:00 2001
From: Wang Bojun <105858416+wwbitejotunn@users.noreply.github.com>
Date: Wed, 27 Jul 2022 11:16:37 +0800
Subject: [PATCH 08/28] Phi average accumulates migration (#44554)

* move average_accumulates op to phi kernel
---
 .../final_state_generator/python_c_gen.py     |   2 +
 .../fluid/operators/average_accumulates_op.cc | 100 ++----------
 .../fluid/operators/average_accumulates_op.cu |  90 -----------
 .../fluid/operators/average_accumulates_op.h  | 119 --------------
 paddle/phi/api/yaml/legacy_api.yaml           |  11 ++
 paddle/phi/infermeta/multiary.cc              |  62 ++++++++
 paddle/phi/infermeta/multiary.h               |  17 ++
 .../phi/kernels/average_accumulates_kernel.h  |  57 +++++++
 .../kernels/cpu/average_accumulates_kernel.cc |  56 +++++++
 .../kernels/gpu/average_accumulates_kernel.cu | 100 ++++++++++++
 .../impl/average_accumulates_kernel_impl.h    | 146 ++++++++++++++++++
 .../phi/ops/compat/average_accumulates_sig.cc |  39 +++++
 .../paddle/incubate/optimizer/modelaverage.py |  10 +-
 13 files changed, 510 insertions(+), 299 deletions(-)
 delete mode 100644 paddle/fluid/operators/average_accumulates_op.cu
 delete mode 100644 paddle/fluid/operators/average_accumulates_op.h
 create mode 100644 paddle/phi/kernels/average_accumulates_kernel.h
 create mode 100644 paddle/phi/kernels/cpu/average_accumulates_kernel.cc
 create mode 100644 paddle/phi/kernels/gpu/average_accumulates_kernel.cu
 create mode 100644 paddle/phi/kernels/impl/average_accumulates_kernel_impl.h
 create mode 100644 paddle/phi/ops/compat/average_accumulates_sig.cc

diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
index f358e4d332d64..81210afe5d89d 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
@@ -57,6 +57,8 @@ def SkipAPIGeneration(forward_api_name):
     'adam',
     'adamw_',
     'adamw',
+    'average_accumulates',
+    'average_accumulates_',
     'decayed_adagrad_',
     'decayed_adagrad',
     'dgc_momentum_',
diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc
index 856a703fd2b06..9f8f295c24935 100644
--- a/paddle/fluid/operators/average_accumulates_op.cc
+++ b/paddle/fluid/operators/average_accumulates_op.cc
@@ -12,99 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/average_accumulates_op.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/infermeta/multiary.h"
 
 namespace paddle {
 namespace operators {
 
-template <>
-void GetAccumulators<phi::CPUContext>(const framework::ExecutionContext& ctx,
-                                      int64_t* num_updates,
-                                      int64_t* num_accumulates,
-                                      int64_t* old_num_accumulates) {
-  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
-  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
-  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
-
-  *old_num_accumulates = in_old_num_accumulates->data<int64_t>()[0];
-  *num_accumulates = in_num_accumulates->data<int64_t>()[0];
-  *num_updates = in_num_updates->data<int64_t>()[0];
-}
-
-template <>
-void SetAccumulators<phi::CPUContext>(const framework::ExecutionContext& ctx,
-                                      int64_t num_updates,
-                                      int64_t num_accumulates,
-                                      int64_t old_num_accumulates) {
-  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
-  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
-  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
-
-  out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates;
-  out_num_accumulates->data<int64_t>()[0] = num_accumulates;
-  out_num_updates->data<int64_t>()[0] = num_updates;
-}
-
 class AverageAccumulatesOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(
-        ctx->HasInput("param"), "Input", "param", "AverageAccumulates");
-    OP_INOUT_CHECK(
-        ctx->HasInput("in_sum_1"), "Input", "in_sum_1", "AverageAccumulates");
-    OP_INOUT_CHECK(
-        ctx->HasInput("in_sum_2"), "Input", "in_sum_2", "AverageAccumulates");
-    OP_INOUT_CHECK(
-        ctx->HasInput("in_sum_3"), "Input", "in_sum_3", "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasInput("in_num_accumulates"),
-                   "Input",
-                   "in_num_accumulates",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasInput("in_old_num_accumulates"),
-                   "Input",
-                   "in_old_num_accumulates",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasInput("in_num_updates"),
-                   "Input",
-                   "in_num_updates",
-                   "AverageAccumulates");
-
-    OP_INOUT_CHECK(ctx->HasOutput("out_sum_1"),
-                   "Output",
-                   "out_sum_1",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasOutput("out_sum_2"),
-                   "Output",
-                   "out_sum_2",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasOutput("out_sum_3"),
-                   "Output",
-                   "out_sum_3",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasOutput("out_num_accumulates"),
-                   "Output",
-                   "out_num_accumulates",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasOutput("out_old_num_accumulates"),
-                   "Output",
-                   "out_old_num_accumulates",
-                   "AverageAccumulates");
-    OP_INOUT_CHECK(ctx->HasOutput("out_num_updates"),
-                   "Output",
-                   "out_num_updates",
-                   "AverageAccumulates");
-    auto in_dim = ctx->GetInputDim("param");
-
-    ctx->SetOutputDim("out_sum_1", in_dim);
-    ctx->SetOutputDim("out_sum_2", in_dim);
-    ctx->SetOutputDim("out_sum_3", in_dim);
-    ctx->SetOutputDim("out_num_accumulates", {1});
-    ctx->SetOutputDim("out_old_num_accumulates", {1});
-    ctx->SetOutputDim("out_num_updates", {1});
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -209,12 +129,14 @@ And for a mini-batch in training, accumulators were computed as below steps:
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+DECLARE_INFER_SHAPE_FUNCTOR(average_accumulates,
+                            AverageAccumulatesInferShapeFunctor,
+                            PD_INFER_META(phi::AverageAccumulatesInferMeta));
+
 REGISTER_OPERATOR(
     average_accumulates,
     ops::AverageAccumulatesOp,
     ops::AverageAccumulatesOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(average_accumulates,
-                       ops::AverageAccumulatesKernel<phi::CPUContext, float>,
-                       ops::AverageAccumulatesKernel<phi::CPUContext, double>);
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    AverageAccumulatesInferShapeFunctor);
diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu
deleted file mode 100644
index d793d528a5b18..0000000000000
--- a/paddle/fluid/operators/average_accumulates_op.cu
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/average_accumulates_op.h"
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-
-namespace paddle {
-namespace operators {
-template <>
-void GetAccumulators<paddle::platform::CUDADeviceContext>(
-    const framework::ExecutionContext& ctx,
-    int64_t* num_updates_,
-    int64_t* num_accumulates_,
-    int64_t* old_num_accumulates_) {
-  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
-  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
-  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
-  auto stream = ctx.cuda_device_context().stream();
-  auto cuda_place = in_old_num_accumulates->place();
-  memory::Copy(platform::CPUPlace(),
-               old_num_accumulates_,
-               cuda_place,
-               in_old_num_accumulates->data<int64_t>(),
-               sizeof(int64_t),
-               stream);
-  memory::Copy(platform::CPUPlace(),
-               num_accumulates_,
-               cuda_place,
-               in_num_accumulates->data<int64_t>(),
-               sizeof(int64_t),
-               stream);
-  memory::Copy(platform::CPUPlace(),
-               num_updates_,
-               cuda_place,
-               in_num_updates->data<int64_t>(),
-               sizeof(int64_t),
-               stream);
-}
-
-template <>
-void SetAccumulators<paddle::platform::CUDADeviceContext>(
-    const framework::ExecutionContext& ctx,
-    int64_t num_updates_,
-    int64_t num_accumulates_,
-    int64_t old_num_accumulates_) {
-  auto stream = ctx.cuda_device_context().stream();
-  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
-  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
-  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
-  auto cuda_place = out_old_num_accumulates->place();
-
-  memory::Copy(cuda_place,
-               out_old_num_accumulates->data<int64_t>(),
-               platform::CPUPlace(),
-               &old_num_accumulates_,
-               sizeof(int64_t),
-               stream);
-  memory::Copy(cuda_place,
-               out_num_accumulates->data<int64_t>(),
-               platform::CPUPlace(),
-               &num_accumulates_,
-               sizeof(int64_t),
-               stream);
-  memory::Copy(cuda_place,
-               out_num_updates->data<int64_t>(),
-               platform::CPUPlace(),
-               &num_updates_,
-               sizeof(int64_t),
-               stream);
-}
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    average_accumulates,
-    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h
deleted file mode 100644
index afa43f8c240c5..0000000000000
--- a/paddle/fluid/operators/average_accumulates_op.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <algorithm>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename DeviceContext>
-void GetAccumulators(const framework::ExecutionContext& ctx,
-                     int64_t* num_updates,
-                     int64_t* num_accumulates,
-                     int64_t* old_num_accumulates);
-
-template <typename DeviceContext>
-void SetAccumulators(const framework::ExecutionContext& ctx,
-                     int64_t num_updates,
-                     int64_t num_accumulates,
-                     int64_t old_num_accumulates);
-
-template <typename DeviceContext, typename T>
-class AverageAccumulatesKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    // It is used to avoid loss of precision
-    static const int64_t kMaxNumAccumulates = 16384;
-    // Get accumulators from input
-    int64_t num_updates = 0;
-    int64_t num_accumulates = 0;
-    int64_t old_num_accumulates = 0;
-    GetAccumulators<DeviceContext>(
-        ctx, &num_updates, &num_accumulates, &old_num_accumulates);
-
-    // Get attrs
-    float average_window = ctx.Attr<float>("average_window");
-    int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
-    int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
-    PADDLE_ENFORCE_LE(
-        min_average_window,
-        max_average_window,
-        platform::errors::InvalidArgument(
-            "The min_average_window > "
-            "max_average_window is not right, min_average_window is %ld, "
-            "max_average_window is %ld.",
-            min_average_window,
-            max_average_window));
-
-    // Get inputs
-    auto* param = ctx.Input<Tensor>("param");
-    auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
-    auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
-    auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
-    auto param_tensor = framework::EigenVector<T>::Flatten(*param);
-    auto in_sum_1_tensor = framework::EigenVector<T>::Flatten(*in_sum_1);
-    auto in_sum_2_tensor = framework::EigenVector<T>::Flatten(*in_sum_2);
-    auto in_sum_3_tensor = framework::EigenVector<T>::Flatten(*in_sum_3);
-
-    // Get outputs
-    auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
-    auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
-    auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
-    auto out_sum_1_tensor = framework::EigenVector<T>::Flatten(*out_sum_1);
-    auto out_sum_2_tensor = framework::EigenVector<T>::Flatten(*out_sum_2);
-    auto out_sum_3_tensor = framework::EigenVector<T>::Flatten(*out_sum_3);
-
-    // Compute
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    phi::funcs::SetConstant<DeviceContext, T> constant_functor;
-    ++num_updates;
-    ++num_accumulates;
-    out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
-    out_sum_2_tensor.device(place) = in_sum_2_tensor;
-    out_sum_3_tensor.device(place) = in_sum_3_tensor;
-    if (num_updates % kMaxNumAccumulates == 0) {
-      // Move the sum to a different buffer to avoid loss of precision due to
-      // too many sums.
-      out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
-      constant_functor(
-          ctx.template device_context<DeviceContext>(), out_sum_1, 0.0);
-    }
-    if (num_accumulates >= min_average_window &&
-        num_accumulates >= std::min<int64_t>(max_average_window,
-                                             num_updates * average_window)) {
-      //  Now the average window is too long, discard the old sum.
-      out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
-      constant_functor(
-          ctx.template device_context<DeviceContext>(), out_sum_1, 0.0);
-      constant_functor(
-          ctx.template device_context<DeviceContext>(), out_sum_2, 0.0);
-      old_num_accumulates = num_accumulates;
-      num_accumulates = 0;
-    }
-
-    // Set accumulators to output
-    SetAccumulators<DeviceContext>(
-        ctx, num_updates, num_accumulates, old_num_accumulates);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml
index 600f93683eff3..bd48617037d28 100644
--- a/paddle/phi/api/yaml/legacy_api.yaml
+++ b/paddle/phi/api/yaml/legacy_api.yaml
@@ -264,6 +264,17 @@
   kernel :
     func : auc
 
+#average_accumulates
+- api : average_accumulates_
+  args : (Tensor param, Tensor in_sum_1, Tensor in_sum_2, Tensor in_sum_3, Tensor in_num_accumulates, Tensor in_old_num_accumulates, Tensor in_num_updates, float average_window, int64_t max_average_window, int64_t min_average_window)
+  output : Tensor(out_sum_1), Tensor(out_sum_2), Tensor(out_sum_3), Tensor(out_num_accumulates), Tensor(out_old_num_accumulates), Tensor(out_num_updates)
+  infer_meta:
+    func : AverageAccumulatesInferMeta
+  kernel :
+    func : average_accumulates {dense, dense, dense, dense, dense ,dense, dense -> dense, dense, dense, dense, dense, dense}
+    data_type : param
+  inplace : (in_sum_1 -> out_sum_1), (in_sum_2 -> out_sum_2), (in_sum_3 -> out_sum_3), (in_num_accumulates -> out_num_accumulates), (in_old_num_accumulates -> out_old_num_accumulates), (in_num_updates -> out_num_updates)
+
 # batch_norm
 - api : batch_norm
   args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu)
diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index 1a05ad495c981..a524506c7f07b 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -434,6 +434,68 @@ void AucInferMeta(const MetaTensor& input,
   }
 }
 
+void AverageAccumulatesInferMeta(const MetaTensor& param,
+                                 const MetaTensor& in_sum_1,
+                                 const MetaTensor& in_sum_2,
+                                 const MetaTensor& in_sum_3,
+                                 const MetaTensor& in_num_accumulates,
+                                 const MetaTensor& in_old_num_accumulates,
+                                 const MetaTensor& in_num_updates,
+                                 float average_window,
+                                 int64_t max_average_window,
+                                 int64_t min_average_window,
+                                 MetaTensor* out_sum_1,
+                                 MetaTensor* out_sum_2,
+                                 MetaTensor* out_sum_3,
+                                 MetaTensor* out_num_accumulates,
+                                 MetaTensor* out_old_num_accumulates,
+                                 MetaTensor* out_num_updates) {
+  // auto in_dim = param.dims;
+  PADDLE_ENFORCE_NE(
+      out_sum_1,
+      nullptr,
+      errors::NotFound(
+          "Output(out_sum_1) of AverageAccumulates should not be null."));
+  PADDLE_ENFORCE_NE(
+      out_sum_2,
+      nullptr,
+      errors::NotFound(
+          "Output(out_sum_2) of AverageAccumulates should not be null."));
+  PADDLE_ENFORCE_NE(
+      out_sum_3,
+      nullptr,
+      errors::NotFound(
+          "Output(out_sum_3) of AverageAccumulates should not be null."));
+  PADDLE_ENFORCE_NE(out_num_accumulates,
+                    nullptr,
+                    errors::NotFound("Output(out_num_accumulates) of "
+                                     "AverageAccumulates should not be null."));
+
+  PADDLE_ENFORCE_NE(out_old_num_accumulates,
+                    nullptr,
+                    errors::NotFound("Output(out_old_num_accumulates) of "
+                                     "AverageAccumulates should not be null."));
+
+  PADDLE_ENFORCE_NE(
+      out_num_updates,
+      nullptr,
+      errors::NotFound(
+          "Output(out_num_updates) of AverageAccumulates should not be null."));
+
+  out_sum_1->set_dims(in_sum_1.dims());
+  out_sum_1->set_dtype(in_sum_1.dtype());
+  out_sum_2->set_dims(in_sum_2.dims());
+  out_sum_2->set_dtype(in_sum_2.dtype());
+  out_sum_3->set_dims(in_sum_3.dims());
+  out_sum_3->set_dtype(in_sum_3.dtype());
+  out_num_accumulates->set_dims({1});
+  out_num_accumulates->set_dtype(in_num_accumulates.dtype());
+  out_old_num_accumulates->set_dims({1});
+  out_old_num_accumulates->set_dtype(in_old_num_accumulates.dtype());
+  out_num_updates->set_dims({1});
+  out_num_updates->set_dtype(in_num_updates.dtype());
+}
+
 void BatchNormInferMeta(const MetaTensor& x,
                         const MetaTensor& scale,
                         const MetaTensor& bias,
diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h
index 06d2530cffa2c..60342dc58f5c9 100644
--- a/paddle/phi/infermeta/multiary.h
+++ b/paddle/phi/infermeta/multiary.h
@@ -134,6 +134,23 @@ void AucInferMeta(const MetaTensor& input,
                   MetaTensor* stat_neg_out,
                   MetaConfig config = MetaConfig());
 
+void AverageAccumulatesInferMeta(const MetaTensor& param,
+                                 const MetaTensor& in_sum_1,
+                                 const MetaTensor& in_sum_2,
+                                 const MetaTensor& in_sum_3,
+                                 const MetaTensor& in_num_accumulates,
+                                 const MetaTensor& in_old_num_accumulates,
+                                 const MetaTensor& in_num_updates,
+                                 float average_window,
+                                 int64_t max_average_window,
+                                 int64_t min_average_window,
+                                 MetaTensor* out_sum_1,
+                                 MetaTensor* out_sum_2,
+                                 MetaTensor* out_sum_3,
+                                 MetaTensor* out_num_accumulates,
+                                 MetaTensor* out_old_num_accumulates,
+                                 MetaTensor* out_num_updates);
+
 void BatchNormInferMeta(const MetaTensor& x,
                         const MetaTensor& scale,
                         const MetaTensor& bias,
diff --git a/paddle/phi/kernels/average_accumulates_kernel.h b/paddle/phi/kernels/average_accumulates_kernel.h
new file mode 100644
index 0000000000000..63f2b362cfde3
--- /dev/null
+++ b/paddle/phi/kernels/average_accumulates_kernel.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename Context>
+void GetAccumulators(const Context& dev_ctx,
+                     const DenseTensor& in_num_accumulates,
+                     const DenseTensor& in_old_num_accumulates,
+                     const DenseTensor& in_num_updates,
+                     int64_t* num_updates,
+                     int64_t* num_accumulates,
+                     int64_t* old_num_accumulates);
+
+template <typename Context>
+void SetAccumulators(const Context& dev_ctx,
+                     int64_t num_updates,
+                     int64_t num_accumulates,
+                     int64_t old_num_accumulates,
+                     DenseTensor* out_num_accumulates,
+                     DenseTensor* out_old_num_accumulates,
+                     DenseTensor* out_num_updates);
+
+template <typename T, typename Context>
+void AverageAccumulatesKernel(const Context& dev_ctx,
+                              const DenseTensor& param,
+                              const DenseTensor& in_sum_1,
+                              const DenseTensor& in_sum_2,
+                              const DenseTensor& in_sum_3,
+                              const DenseTensor& in_num_accumulates,
+                              const DenseTensor& in_old_num_accumulates,
+                              const DenseTensor& in_num_updates,
+                              float average_window,
+                              int64_t max_average_window,
+                              int64_t min_average_window,
+                              DenseTensor* out_sum_1,
+                              DenseTensor* out_sum_2,
+                              DenseTensor* out_sum_3,
+                              DenseTensor* out_num_accumulates,
+                              DenseTensor* out_old_num_accumulates,
+                              DenseTensor* out_num_updates);
+}  // namespace phi
diff --git a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc
new file mode 100644
index 0000000000000..14eb38d5b99b6
--- /dev/null
+++ b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc
@@ -0,0 +1,56 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/average_accumulates_kernel.h"
+#include "paddle/phi/kernels/impl/average_accumulates_kernel_impl.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <>
+void GetAccumulators<phi::CPUContext>(const phi::CPUContext& dev_ctx,
+                                      const DenseTensor& in_num_accumulates,
+                                      const DenseTensor& in_old_num_accumulates,
+                                      const DenseTensor& in_num_updates,
+                                      int64_t* num_updates,
+                                      int64_t* num_accumulates,
+                                      int64_t* old_num_accumulates) {
+  *old_num_accumulates = in_old_num_accumulates.data<int64_t>()[0];
+  *num_accumulates = in_num_accumulates.data<int64_t>()[0];
+  *num_updates = in_num_updates.data<int64_t>()[0];
+}
+
+template <>
+void SetAccumulators<phi::CPUContext>(const phi::CPUContext& dev_ctx,
+                                      int64_t num_updates,
+                                      int64_t num_accumulates,
+                                      int64_t old_num_accumulates,
+                                      DenseTensor* out_num_accumulates,
+                                      DenseTensor* out_old_num_accumulates,
+                                      DenseTensor* out_num_updates) {
+  out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates;
+  out_num_accumulates->data<int64_t>()[0] = num_accumulates;
+  out_num_updates->data<int64_t>()[0] = num_updates;
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(average_accumulates,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::AverageAccumulatesKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu
new file mode 100644
index 0000000000000..98a6699d9754f
--- /dev/null
+++ b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu
@@ -0,0 +1,100 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/average_accumulates_kernel.h"
+#include "paddle/phi/kernels/impl/average_accumulates_kernel_impl.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <>
+void GetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
+                                      const DenseTensor& in_num_accumulates,
+                                      const DenseTensor& in_old_num_accumulates,
+                                      const DenseTensor& in_num_updates,
+                                      int64_t* num_updates,
+                                      int64_t* num_accumulates,
+                                      int64_t* old_num_accumulates) {
+  auto stream = dev_ctx.stream();
+  auto cuda_place = in_old_num_accumulates.place();
+  paddle::memory::Copy(phi::CPUPlace(),
+                       old_num_accumulates,
+                       cuda_place,
+                       in_old_num_accumulates.data<int64_t>(),
+                       sizeof(int64_t),
+                       stream);
+  paddle::memory::Copy(phi::CPUPlace(),
+                       num_accumulates,
+                       cuda_place,
+                       in_num_accumulates.data<int64_t>(),
+                       sizeof(int64_t),
+                       stream);
+  paddle::memory::Copy(phi::CPUPlace(),
+                       num_updates,
+                       cuda_place,
+                       in_num_updates.data<int64_t>(),
+                       sizeof(int64_t),
+                       stream);
+}
+
+template <>
+void SetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
+                                      int64_t num_updates,
+                                      int64_t num_accumulates,
+                                      int64_t old_num_accumulates,
+                                      DenseTensor* out_num_accumulates,
+                                      DenseTensor* out_old_num_accumulates,
+                                      DenseTensor* out_num_updates) {
+  int64_t* out_num_accumulates_ptr =
+      dev_ctx.template Alloc<int64_t>(out_num_accumulates);
+  int64_t* out_old_num_accumulates_ptr =
+      dev_ctx.template Alloc<int64_t>(out_old_num_accumulates);
+  int64_t* out_num_updates_ptr =
+      dev_ctx.template Alloc<int64_t>(out_num_updates);
+
+  auto stream = dev_ctx.stream();
+
+  auto cuda_place = out_old_num_accumulates->place();
+  paddle::memory::Copy(dev_ctx.GetPlace(),
+                       out_num_accumulates_ptr,
+                       phi::CPUPlace(),
+                       &num_accumulates,
+                       sizeof(int64_t),
+                       stream);
+
+  paddle::memory::Copy(dev_ctx.GetPlace(),
+                       out_old_num_accumulates_ptr,
+                       phi::CPUPlace(),
+                       &old_num_accumulates,
+                       sizeof(int64_t),
+                       stream);
+
+  paddle::memory::Copy(cuda_place,
+                       out_num_updates_ptr,
+                       phi::CPUPlace(),
+                       &num_updates,
+                       sizeof(int64_t),
+                       stream);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(average_accumulates,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::AverageAccumulatesKernel,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h
new file mode 100644
index 0000000000000..8731316317d47
--- /dev/null
+++ b/paddle/phi/kernels/impl/average_accumulates_kernel_impl.h
@@ -0,0 +1,146 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/phi/kernels/average_accumulates_kernel.h"
+
+#include <algorithm>
+
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void AverageAccumulatesKernel(const Context& dev_ctx,
+                              const DenseTensor& param,
+                              const DenseTensor& in_sum_1,
+                              const DenseTensor& in_sum_2,
+                              const DenseTensor& in_sum_3,
+                              const DenseTensor& in_num_accumulates,
+                              const DenseTensor& in_old_num_accumulates,
+                              const DenseTensor& in_num_updates,
+                              float average_window,
+                              int64_t max_average_window,
+                              int64_t min_average_window,
+                              DenseTensor* out_sum_1,
+                              DenseTensor* out_sum_2,
+                              DenseTensor* out_sum_3,
+                              DenseTensor* out_num_accumulates,
+                              DenseTensor* out_old_num_accumulates,
+                              DenseTensor* out_num_updates) {
+  // It is used to avoid loss of precision
+  static const int64_t kMaxNumAccumulates = 16384;
+  // Get accumulators from input
+  // int64_t num_updates = 0;
+  // int64_t num_accumulates = 0;
+  // int64_t old_num_accumulates = 0;
+
+  auto num_updates_cpu =
+      paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
+  int64_t* num_updates_cpu_ptr =
+      reinterpret_cast<int64_t*>(num_updates_cpu->ptr());
+
+  auto num_accumulates_cpu =
+      paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
+  int64_t* num_accumulates_cpu_ptr =
+      reinterpret_cast<int64_t*>(num_accumulates_cpu->ptr());
+
+  auto old_num_accumulates_cpu =
+      paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
+  int64_t* old_num_accumulates_cpu_ptr =
+      reinterpret_cast<int64_t*>(old_num_accumulates_cpu->ptr());
+
+  GetAccumulators<Context>(dev_ctx,
+                           in_num_accumulates,
+                           in_old_num_accumulates,
+                           in_num_updates,
+                           num_updates_cpu_ptr,
+                           num_accumulates_cpu_ptr,
+                           old_num_accumulates_cpu_ptr);
+  // Get attrs
+  // float average_window = ctx.Attr<float>("average_window");
+  // int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
+  // int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
+  PADDLE_ENFORCE_LE(
+      min_average_window,
+      max_average_window,
+      errors::InvalidArgument(
+          "The min_average_window > "
+          "max_average_window is not right, min_average_window is %ld, "
+          "max_average_window is %ld.",
+          min_average_window,
+          max_average_window));
+
+  // Get inputs
+  // auto* param = ctx.Input<Tensor>("param");
+  // auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
+  // auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
+  // auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
+  auto param_tensor = EigenVector<T>::Flatten(param);
+  auto in_sum_1_tensor = EigenVector<T>::Flatten(in_sum_1);
+  auto in_sum_2_tensor = EigenVector<T>::Flatten(in_sum_2);
+  auto in_sum_3_tensor = EigenVector<T>::Flatten(in_sum_3);
+
+  // Get outputs
+  // auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
+  // auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
+  // auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
+  dev_ctx.template Alloc<T>(out_sum_1);
+  dev_ctx.template Alloc<T>(out_sum_2);
+  dev_ctx.template Alloc<T>(out_sum_3);
+
+  auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
+  auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
+  auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);
+
+  // Compute
+  // auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+  auto& place = *dev_ctx.eigen_device();
+
+  funcs::SetConstant<Context, T> constant_functor;
+  ++(*num_updates_cpu_ptr);
+  ++(*num_accumulates_cpu_ptr);
+  out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
+  out_sum_2_tensor.device(place) = in_sum_2_tensor;
+  out_sum_3_tensor.device(place) = in_sum_3_tensor;
+  if ((*num_updates_cpu_ptr) % kMaxNumAccumulates == 0) {
+    // Move the sum to a different buffer to avoid loss of precision due to
+    // too many sums.
+    out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
+    constant_functor(dev_ctx, out_sum_1, static_cast<T>(0));
+  }
+  if ((*num_accumulates_cpu_ptr) >= min_average_window &&
+      (*num_accumulates_cpu_ptr) >=
+          std::min<int64_t>(max_average_window,
+                            (*num_updates_cpu_ptr) * average_window)) {
+    //  Now the average window is too long, discard the old sum.
+    out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
+    constant_functor(dev_ctx, out_sum_1, static_cast<T>(0));
+    constant_functor(dev_ctx, out_sum_2, static_cast<T>(0));
+    (*old_num_accumulates_cpu_ptr) = (*num_accumulates_cpu_ptr);
+    (*num_accumulates_cpu_ptr) = 0;
+  }
+
+  // Set accumulators to output
+  SetAccumulators<Context>(dev_ctx,
+                           *num_updates_cpu_ptr,
+                           *num_accumulates_cpu_ptr,
+                           *old_num_accumulates_cpu_ptr,
+                           out_num_accumulates,
+                           out_old_num_accumulates,
+                           out_num_updates);
+}
+
+}  // namespace phi
diff --git a/paddle/phi/ops/compat/average_accumulates_sig.cc b/paddle/phi/ops/compat/average_accumulates_sig.cc
new file mode 100644
index 0000000000000..c14e8ab357553
--- /dev/null
+++ b/paddle/phi/ops/compat/average_accumulates_sig.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+KernelSignature AverageAccumulatesOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "average_accumulates",
+      {"param",
+       "in_sum_1",
+       "in_sum_2",
+       "in_sum_3",
+       "in_num_accumulates",
+       "in_old_num_accumulates",
+       "in_num_updates"},
+      {"average_window", "max_average_window", "min_average_window"},
+      {"out_sum_1",
+       "out_sum_2",
+       "out_sum_3",
+       "out_num_accumulates",
+       "out_old_num_accumulates",
+       "out_num_updates"});
+}
+}  // namespace phi
+PD_REGISTER_ARG_MAPPING_FN(average_accumulates,
+                           phi::AverageAccumulatesOpArgumentMapping);
diff --git a/python/paddle/incubate/optimizer/modelaverage.py b/python/paddle/incubate/optimizer/modelaverage.py
index 361827ba48de2..b7d499f77292e 100644
--- a/python/paddle/incubate/optimizer/modelaverage.py
+++ b/python/paddle/incubate/optimizer/modelaverage.py
@@ -21,6 +21,7 @@
 from paddle.fluid.dygraph import base as imperative_base
 from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
 from paddle import _C_ops
+from paddle.fluid.framework import in_dygraph_mode
 
 __all__ = []
 
@@ -231,7 +232,14 @@ def _append_optimize_op(self, block, param_and_grad):
         old_num_accumulates = self._get_accumulator('old_num_accumulates',
                                                     param_and_grad[0])
         num_updates = self._get_accumulator('num_updates', param_and_grad[0])
-        if framework._non_static_mode():
+
+        if in_dygraph_mode():
+            _, _, _, _, _, _ = _C_ops.final_state_average_accumulates_(
+                param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates,
+                old_num_accumulates, num_updates, self.average_window,
+                self.max_average_window, self.min_average_window)
+            return None
+        elif framework._non_static_mode():
             _, _, _, _, _, _ = _C_ops.average_accumulates(
                 param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates,
                 old_num_accumulates, num_updates, sum_1, sum_2, sum_3,

From 7ee442c40cb6f55d1962ec1dbbd2f85c62690eb7 Mon Sep 17 00:00:00 2001
From: pangyoki <pangyoki@126.com>
Date: Wed, 27 Jul 2022 11:24:57 +0800
Subject: [PATCH 09/28] new exe not support pg (#44628)

---
 python/paddle/fluid/executor.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index e9e2252f9065c..867b40c1fe094 100755
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -1398,20 +1398,26 @@ def _can_use_interpreter_core(program, place):
                 if program._program is None:
                     return False
 
-                # Unsupported case 2 : disabled by FLAGS_CONVERT_GRAPH_TO_PROGRAM
-                if os.environ.get('FLAGS_CONVERT_GRAPH_TO_PROGRAM',
-                                  None) not in [1, '1', True, 'True', 'true']:
-                    return False
-
-                # Unsupported case 3: data parallel
+                # Unsupported case 2: data parallel
                 if program._is_data_parallel and len(
                         program._get_places(place, program._places)) != 1:
                     return False
 
+                # Unsupported case 3 : parallel graph
+                if core.globals()['FLAGS_enable_parallel_graph'] in [
+                        1, '1', True, 'True', 'true'
+                ]:
+                    return False
+
                 # Unsupported case 4: inference
                 if program._is_inference:
                     return False
 
+                # Unsupported case 5 : disabled by FLAGS_CONVERT_GRAPH_TO_PROGRAM
+                if os.environ.get('FLAGS_CONVERT_GRAPH_TO_PROGRAM',
+                                  None) not in [1, '1', True, 'True', 'true']:
+                    return False
+
                 return True
             else:
                 if isinstance(program._graph, compiler.CompiledProgram):

From 2a5437a273e63b3764f97fe50101f7faa92958cf Mon Sep 17 00:00:00 2001
From: Aganlengzi <aganlengzi@gmail.com>
Date: Wed, 27 Jul 2022 11:30:07 +0800
Subject: [PATCH 10/28] [CustomDevice]fix phi kernel header (#44637)

---
 paddle/phi/kernels/inverse_kernel.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/paddle/phi/kernels/inverse_kernel.h b/paddle/phi/kernels/inverse_kernel.h
index d8ebf39c362db..2fa90b6f179eb 100644
--- a/paddle/phi/kernels/inverse_kernel.h
+++ b/paddle/phi/kernels/inverse_kernel.h
@@ -14,9 +14,7 @@
 
 #pragma once
 
-#include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/kernels/funcs/matrix_inverse.h"
 
 namespace phi {
 

From efb4d5c2c010b257ca95b2b424b1c35f0620d953 Mon Sep 17 00:00:00 2001
From: ronnywang <ronny1996@163.com>
Date: Wed, 27 Jul 2022 12:06:19 +0800
Subject: [PATCH 11/28] [CustomDevice] add process_group_xccl ut (#44632)

* [CustomDevice] add process_group_xccl ut

* update
---
 .../fluid/tests/custom_runtime/CMakeLists.txt |   1 +
 .../custom_device_multi_process_collective.py |  42 +++
 .../custom_runtime/process_group_xccl.py      | 241 ++++++++++++++++++
 .../test_collective_process_group_xccl.py     | 154 +++++++++++
 .../test_fleet_launch_custom_device.sh        |  28 ++
 5 files changed, 466 insertions(+)
 create mode 100644 python/paddle/fluid/tests/custom_runtime/custom_device_multi_process_collective.py
 create mode 100644 python/paddle/fluid/tests/custom_runtime/process_group_xccl.py
 create mode 100644 python/paddle/fluid/tests/custom_runtime/test_collective_process_group_xccl.py
 create mode 100644 python/paddle/fluid/tests/custom_runtime/test_fleet_launch_custom_device.sh

diff --git a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt
index 482dc9cb1f3f6..04f01714b371b 100644
--- a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt
+++ b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt
@@ -5,6 +5,7 @@ if(WITH_CUSTOM_DEVICE)
     "test_*.py")
   string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
+  list(REMOVE_ITEM TEST_OPS "test_collective_process_group_xccl")
   foreach(TEST_OP ${TEST_OPS})
     py_test(${TEST_OP} SRCS ${TEST_OP}.py)
   endforeach()
diff --git a/python/paddle/fluid/tests/custom_runtime/custom_device_multi_process_collective.py b/python/paddle/fluid/tests/custom_runtime/custom_device_multi_process_collective.py
new file mode 100644
index 0000000000000..903cc9c7899ed
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_runtime/custom_device_multi_process_collective.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import time
+
+
+def train(prefix):
+    selected_accelerators = os.getenv("FLAGS_selected_accelerators")
+    selected_custom_devices = os.getenv("FLAGS_selected_custom_cpus")
+    trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+    worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
+    current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
+    worker_endpoints = worker_endpoints_env
+    trainers_num = len(worker_endpoints.split(','))
+    device_ids = os.getenv("PADDLE_WORLD_DEVICE_IDS")
+    current_device_id = os.getenv("PADDLE_LOCAL_DEVICE_IDS")
+
+    details = "selected_accelerators:{} selected_custom_devices:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}"\
+            .format(selected_accelerators, selected_custom_devices, worker_endpoints, trainers_num, current_endpoint,trainer_id,device_ids, current_device_id)
+
+    print(details)
+    with open("multi_process_{}.check_{}.log".format(prefix, trainer_id),
+              "w") as f:
+        f.write(details)
+
+
+if __name__ == '__main__':
+    prefix = sys.argv[1]
+    train(prefix)
diff --git a/python/paddle/fluid/tests/custom_runtime/process_group_xccl.py b/python/paddle/fluid/tests/custom_runtime/process_group_xccl.py
new file mode 100644
index 0000000000000..cba032241fb22
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_runtime/process_group_xccl.py
@@ -0,0 +1,241 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import random
+import numpy as np
+import os
+import shutil
+
+import paddle
+from paddle.fluid import core
+from datetime import timedelta
+import paddle.fluid.core as core
+from paddle.fluid.framework import _test_eager_guard
+from paddle.fluid.dygraph.parallel import ParallelEnv
+
+
+def init_process_group(strategy=None):
+    nranks = ParallelEnv().nranks
+    rank = ParallelEnv().local_rank
+    is_master = True if rank == 0 else False
+    store = paddle.fluid.core.TCPStore("127.0.0.1", 6173, is_master, nranks)
+    pg_group = core.ProcessGroupCustom(
+        store, rank, nranks,
+        paddle.CustomPlace(ParallelEnv().device_type,
+                           ParallelEnv().device_id))
+
+    return pg_group
+
+
+class TestProcessGroupFp32(unittest.TestCase):
+
+    def setUp(self):
+        paddle.seed(2022)
+        random.seed(2022)
+        np.random.seed(2022)
+        self.config()
+
+    def config(self):
+        self.dtype = "float32"
+        self.shape = (2, 10, 5)
+
+    def test_create_process_group_xccl(self):
+        with _test_eager_guard():
+            paddle.set_device('custom_cpu:%d' %
+                              paddle.distributed.ParallelEnv().dev_id)
+
+            pg = init_process_group()
+
+            x = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_y = paddle.to_tensor(y)
+
+            sum_result = tensor_x + tensor_y
+            if pg.rank() == 0:
+                task = pg.allreduce(tensor_x)
+                task.wait()
+                # assert np.array_equal(tensor_x, sum_result)
+            else:
+                task = pg.allreduce(tensor_y)
+                task.wait()
+                # assert np.array_equal(tensor_y, sum_result)
+
+            print("test allreduce sum api ok")
+
+            x = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_y = paddle.to_tensor(y)
+
+            max_result = paddle.maximum(tensor_x, tensor_y)
+
+            if pg.rank() == 0:
+                task = pg.allreduce(tensor_x, core.ReduceOp.MAX)
+                task.wait()
+                # assert np.array_equal(tensor_x, max_result)
+            else:
+                task = pg.allreduce(tensor_y, core.ReduceOp.MAX)
+                task.wait()
+                # assert np.array_equal(tensor_y, max_result)
+
+            print("test allreduce max api ok")
+
+            # test broadcast
+            # rank 0
+            x = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            # rank 1
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_y = paddle.to_tensor(y)
+
+            broadcast_result = paddle.assign(tensor_x)
+            if pg.rank() == 0:
+                task = pg.broadcast(tensor_x, 0)
+                task.synchronize()
+                # paddle.fluid.core._custom_device_synchronize("custom_cpu", -1)
+                assert task.is_completed()
+                # assert np.array_equal(broadcast_result, tensor_x)
+            else:
+                task = pg.broadcast(tensor_y, 0)
+                task.synchronize()
+                # paddle.fluid.core._custom_device_synchronize("custom_cpu", -1)
+                assert task.is_completed()
+                # assert np.array_equal(broadcast_result, tensor_y)
+
+            print("test broadcast api ok")
+
+            # test barrier
+            # rank 0
+            if pg.rank() == 0:
+                task = pg.barrier()
+                task.wait()
+            # rank 1
+            else:
+                task = pg.barrier()
+                task.wait()
+
+            print("test barrier api ok\n")
+            return
+
+            # test allgather
+            # rank 0
+            x = np.random.random(self.shape).astype(self.dtype)
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            tensor_y = paddle.to_tensor(y)
+            out_shape = list(self.shape)
+            out_shape[0] *= 2
+            out = np.random.random(out_shape).astype(self.dtype)
+            tensor_out = paddle.to_tensor(out)
+            if pg.rank() == 0:
+                task = pg.all_gather(tensor_x, tensor_out)
+                task.wait()
+                # paddle.fluid.core._custom_device_synchronize("custom_cpu", -1)
+            # rank 1
+            else:
+                task = pg.all_gather(tensor_y, tensor_out)
+                task.wait()
+                # paddle.fluid.core._custom_device_synchronize("custom_cpu", -1)
+            out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2])
+            out_2 = paddle.slice(tensor_out, [0], [out_shape[0] // 2],
+                                 [out_shape[0]])
+            # assert np.array_equal(tensor_x, out_1)
+            # assert np.array_equal(tensor_y, out_2)
+            print("test allgather api ok\n")
+
+            # test alltoall
+            # rank 0
+            x = np.random.random(self.shape).astype(self.dtype)
+            y = np.random.random(self.shape).astype(self.dtype)
+            out1 = np.random.random(self.shape).astype(self.dtype)
+            out2 = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            tensor_y = paddle.to_tensor(y)
+            tensor_out1 = paddle.to_tensor(out1)
+            tensor_out2 = paddle.to_tensor(out2)
+            raw_tensor_x_2 = paddle.slice(tensor_x, [0], [self.shape[0] // 2],
+                                          [self.shape[0]])
+            raw_tensor_y_1 = paddle.slice(tensor_y, [0], [0],
+                                          [self.shape[0] // 2])
+            if pg.rank() == 0:
+                task = pg.alltoall(tensor_x, tensor_out1)
+                task.wait()
+                # paddle.fluid.core._custom_device_synchronize("custom_cpu", -1)
+            # rank 1
+            else:
+                task = pg.alltoall(tensor_y, tensor_out2)
+                task.wait()
+                # paddle.fluid.core._custom_device_synchronize("custom_cpu", -1)
+            out1_2 = paddle.slice(tensor_out1, [0], [self.shape[0] // 2],
+                                  [self.shape[0]])
+            out2_1 = paddle.slice(tensor_out2, [0], [0], [self.shape[0] // 2])
+            # if pg.rank() == 0:
+            #     assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy())
+            # else:
+            #     assert np.array_equal(out2_1, raw_tensor_x_2)
+            print("test alltoall api ok\n")
+
+            # test Reduce
+            # rank 0
+            x = np.random.random(self.shape).astype(self.dtype)
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            tensor_y = paddle.to_tensor(y)
+            sum_result = tensor_x + tensor_y
+            if pg.rank() == 0:
+                task = pg.reduce(tensor_x, 0)
+                task.wait()
+                # paddle.fluid.core._custom_device_synchronize("custom_cpu", -1)
+            # rank 1
+            else:
+                task = pg.reduce(tensor_y, 0)
+                task.wait()
+                # paddle.fluid.core._custom_device_synchronize("custom_cpu", -1)
+            # if pg.rank() == 0:
+            #     assert np.array_equal(tensor_x, sum_result)
+            print("test reduce sum api ok\n")
+
+            # test Scatter
+            # rank 0
+            in_shape = list(self.shape)
+            in_shape[0] *= 2
+            x = np.random.random(in_shape).astype(self.dtype)
+            y = np.random.random(self.shape).astype(self.dtype)
+            tensor_x = paddle.to_tensor(x)
+            tensor_y = paddle.to_tensor(y)
+            if pg.rank() == 0:
+                task = pg.scatter(tensor_x, tensor_y, 0)
+                task.wait()
+                # paddle.fluid.core._custom_device_synchronize("custom_cpu", -1)
+            # rank 1
+            else:
+                task = pg.scatter(tensor_x, tensor_y, 0)
+                task.wait()
+                # paddle.fluid.core._custom_device_synchronize("custom_cpu", -1)
+            out1 = paddle.slice(tensor_x, [0], [0], [self.shape[0]])
+            out2 = paddle.slice(tensor_x, [0], [self.shape[0]],
+                                [self.shape[0] * 2])
+            # if pg.rank() == 0:
+            #     assert np.array_equal(tensor_y, out1)
+            # else:
+            #     assert np.array_equal(tensor_y, out2)
+            print("test scatter api ok\n")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/custom_runtime/test_collective_process_group_xccl.py b/python/paddle/fluid/tests/custom_runtime/test_collective_process_group_xccl.py
new file mode 100644
index 0000000000000..f2e22a292fed4
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_runtime/test_collective_process_group_xccl.py
@@ -0,0 +1,154 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import os
+import sys
+import copy
+import subprocess
+import time
+
+
+def start_local_trainers(cluster,
+                         pod,
+                         training_script,
+                         training_script_args,
+                         eager_mode=True,
+                         log_dir=None):
+    from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc
+
+    current_env = copy.copy(os.environ.copy())
+    #paddle broadcast ncclUniqueId use socket, and
+    #proxy maybe make trainers unreachable, so delete them.
+    #if we set them to "", grpc will log error message "bad uri"
+    #so just delete them.
+    current_env.pop("http_proxy", None)
+    current_env.pop("https_proxy", None)
+
+    procs = []
+
+    os.system("rm -rf log && mkdir -p log")
+    for idx, t in enumerate(pod.trainers):
+        proc_env = {
+            "FLAGS_selected_custom_cpus":
+            "%s" % ",".join([str(g) for g in t.gpus]),
+            "PADDLE_TRAINER_ID": "%d" % t.rank,
+            "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint,
+            "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
+            "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
+            "PADDLE_DISTRI_CUSTOM_DEVICE_TYPE": "custom_cpu",
+        }
+
+        if not eager_mode:
+            proc_env["FLAGS_enable_eager_mode"] = "%d" % 0
+
+        current_env.update(proc_env)
+
+        print("trainer proc env:{}".format(current_env))
+
+        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
+            cmd = "python -m coverage run --branch -p " + training_script
+        else:
+            cmd = "python -u " + training_script
+
+        print("start trainer proc:{} env:{}".format(cmd, proc_env))
+
+        fn = open("workerlog.%d" % idx, "a")
+        proc = subprocess.Popen(cmd.split(" "),
+                                env=current_env,
+                                stdout=fn,
+                                stderr=fn)
+
+        tp = TrainerProc()
+        tp.proc = proc
+        tp.rank = t.rank
+        tp.log_fn = fn
+        tp.cmd = cmd
+
+        procs.append(tp)
+
+    return procs
+
+
+def get_cluster_from_args(selected_gpus):
+    from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc
+
+    cluster_node_ips = '127.0.0.1'
+    node_ip = '127.0.0.1'
+
+    node_ips = [x.strip() for x in cluster_node_ips.split(',')]
+
+    node_ips.index(node_ip)
+
+    free_ports = None
+
+    free_ports = find_free_ports(len(selected_gpus))
+    if free_ports is not None:
+        free_ports = list(free_ports)
+
+    trainer_endpoints = []
+    for ip in node_ips:
+        trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
+    return get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus)
+
+
+class TestMultipleCustomCPU(unittest.TestCase):
+
+    def run_mnist_2custom_cpu(self, target_file_name, eager_mode=True):
+        from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc
+
+        selected_devices = [0, 1]
+        cluster = None
+        pod = None
+
+        cluster, pod = get_cluster_from_args(selected_devices)
+
+        procs = start_local_trainers(cluster,
+                                     pod,
+                                     eager_mode=eager_mode,
+                                     training_script=target_file_name,
+                                     training_script_args=[])
+
+        while True:
+            alive = watch_local_trainers(procs, cluster.trainers_endpoints())
+
+            if not alive:
+                print("Local procs complete, POD info:{}".format(pod))
+                break
+            time.sleep(3)
+
+
+class TestProcessGroup(TestMultipleCustomCPU):
+
+    def setUp(self):
+        # compile so and set to current path
+        cur_dir = os.path.dirname(os.path.abspath(__file__))
+        cmd = 'rm -rf PaddleCustomDevice && git clone https://github.com/PaddlePaddle/PaddleCustomDevice.git && cd PaddleCustomDevice/backends/custom_cpu && mkdir build && cd build && cmake .. && make -j8'
+        os.system(cmd)
+
+        # set environment for loading and registering compiled custom kernels
+        # only valid in current process
+        os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join(
+            cur_dir, 'PaddleCustomDevice/backends/custom_cpu/build')
+
+    def test_process_group_xccl(self):
+        from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc
+
+        self.run_mnist_2custom_cpu('process_group_xccl.py')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/custom_runtime/test_fleet_launch_custom_device.sh b/python/paddle/fluid/tests/custom_runtime/test_fleet_launch_custom_device.sh
new file mode 100644
index 0000000000000..3afb1979905d3
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_runtime/test_fleet_launch_custom_device.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+
+
+rm -rf PaddleCustomDevice && git clone https://github.com/PaddlePaddle/PaddleCustomDevice.git && pushd PaddleCustomDevice/backends/custom_cpu && mkdir build && pushd build && cmake .. && make -j8 && popd && popd
+
+echo "begin test use custom_cpu"
+
+export FLAGS_selected_custom_cpus=0,1
+
+distributed_args="--ips=127.0.0.1 --backend=xccl --custom_device_type=custom_cpu --custom_devices=0,1 --run_mode=collective --log_dir=testlog"
+python -m paddle.distributed.fleet.launch ${distributed_args} custom_device_multi_process_collective.py fleetlaunch_custom_cpu

From e7c7280fff53d3b951d62cfb2a92d52350709b08 Mon Sep 17 00:00:00 2001
From: zhangkaihuo <zhangkaihuo@baidu.com>
Date: Wed, 27 Jul 2022 12:40:36 +0800
Subject: [PATCH 12/28] Fix conv api name (#44636)

---
 paddle/phi/api/yaml/sparse_api.yaml                 | 2 +-
 paddle/phi/tests/api/test_sparse_conv_api.cc        | 2 +-
 python/paddle/incubate/sparse/nn/functional/conv.py | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/paddle/phi/api/yaml/sparse_api.yaml b/paddle/phi/api/yaml/sparse_api.yaml
index 904b185448918..e11306f21f24e 100644
--- a/paddle/phi/api/yaml/sparse_api.yaml
+++ b/paddle/phi/api/yaml/sparse_api.yaml
@@ -80,7 +80,7 @@
     data_type : x
   backward : cast_grad
 
-- api : conv3d_coo
+- api : conv3d
   args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key)
   output : Tensor(out), Tensor(rulebook), Tensor(counter) 
   kernel :
diff --git a/paddle/phi/tests/api/test_sparse_conv_api.cc b/paddle/phi/tests/api/test_sparse_conv_api.cc
index b1df197f42f47..123f824f62d51 100644
--- a/paddle/phi/tests/api/test_sparse_conv_api.cc
+++ b/paddle/phi/tests/api/test_sparse_conv_api.cc
@@ -76,7 +76,7 @@ void TestConv3dBase(const std::vector<int>& indices,
          kernel.size() * sizeof(T));
 
   if (!std::is_same<T, phi::dtype::float16>::value) {
-    auto tensor_out = paddle::experimental::sparse::conv3d_coo(
+    auto tensor_out = paddle::experimental::sparse::conv3d(
         x, weight, paddings, dilations, strides, 1, false, "Conv3d");
 
     auto out =
diff --git a/python/paddle/incubate/sparse/nn/functional/conv.py b/python/paddle/incubate/sparse/nn/functional/conv.py
index 60cbb94bea236..605cadc2b091e 100644
--- a/python/paddle/incubate/sparse/nn/functional/conv.py
+++ b/python/paddle/incubate/sparse/nn/functional/conv.py
@@ -63,9 +63,9 @@ def _conv3d(x,
     dilation = convert_to_list(dilation, dims, 'dilation')
     op_type = "conv3d"
 
-    pre_bias = _C_ops.final_state_sparse_conv3d_coo(
-        x, weight, padding, dilation, stride, groups, subm,
-        key if key is not None else "")
+    pre_bias = _C_ops.final_state_sparse_conv3d(x, weight, padding, dilation,
+                                                stride, groups, subm,
+                                                key if key is not None else "")
     if bias is not None:
         values = pre_bias.values()
         add_bias = elementwise_add(values, bias, axis=1)

From 28aa0c61a624111db5beac7699bf4ab391e2accc Mon Sep 17 00:00:00 2001
From: Yuang Liu <liuyuang@baidu.com>
Date: Wed, 27 Jul 2022 13:46:35 +0800
Subject: [PATCH 13/28] [DCU] Fix NAN problem when training BERT on DUC
 platform (#44643)

---
 .../operators/optimizers/distributed_fused_lamb_op.cu     | 8 ++++++++
 .../fluid/platform/device/gpu/rocm/rocm_device_function.h | 4 ++++
 2 files changed, 12 insertions(+)

diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
index 53c8eddd7246c..f8d55ff9cf72a 100644
--- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
+++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu
@@ -166,7 +166,11 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place,
 
   constexpr int kNumTensor = MaxTensorNumPerLaunch;
   constexpr int kNumChunk = MaxChunkNumPerLaunch;
+#ifdef PADDLE_WITH_HIP
+  constexpr int kBlockDim = 256;
+#else
   constexpr int kBlockDim = 512;
+#endif
 
   int max_chunk_num = -1;
   int vec_size = 8;
@@ -805,7 +809,11 @@ static void MultiTensorUpdateLambParamAndBetaPows(
         platform::errors::InvalidArgument("Beta2Pow should be nullptr."));
   }
 
+#ifdef PADDLE_WITH_HIP
+  const int block_dim = 256;
+#else
   const int block_dim = 512;
+#endif
 
   int vec_size = 8;
   for (int i = 0; i < n; ++i) {
diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
index da95fc3c164ba..a8ce5f1a1827b 100644
--- a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
+++ b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h
@@ -134,7 +134,11 @@ __device__ T reduceSum(T val, int tid, int len) {
   // I use Warp-Level Parallelism and assume the Warp size
   // is 32 which may be different for different GPU,
   // but most card's warp size is 32.
+#ifdef PADDLE_WITH_HIP
+  const int warpSize = 64;
+#else
   const int warpSize = 32;
+#endif
   __shared__ T shm[warpSize];
   unsigned mask = 0u;
   CREATE_SHFL_MASK(mask, tid < len);

From 0dae79a9f73936c4d0bfbd8073de1840e310e7ad Mon Sep 17 00:00:00 2001
From: WangZhen <23097963+0x45f@users.noreply.github.com>
Date: Wed, 27 Jul 2022 14:03:17 +0800
Subject: [PATCH 14/28] [JitLayer]Remove include fluid head files in JitLayer
 (#44597)

* Remove include fluid head files in JitLayer

* Format code

* Remove const to fix ci error

* Fix param error

* Polish jit layer include and cp some headers to python/include

* Fix comment
---
 paddle/fluid/jit/all.h               | 20 ++++++++++++++++++++
 paddle/fluid/jit/base_function.h     |  1 -
 paddle/fluid/jit/compilation_unit.cc |  2 ++
 paddle/fluid/jit/compilation_unit.h  |  5 +++--
 paddle/fluid/jit/function_schema.cc  | 14 +++++++-------
 paddle/fluid/jit/function_schema.h   | 12 +++++++-----
 paddle/fluid/jit/function_utils.cc   |  4 +++-
 paddle/fluid/jit/function_utils.h    | 13 +++++++++----
 paddle/fluid/jit/layer.cc            | 20 ++++++++++++--------
 paddle/fluid/jit/layer.h             | 26 +++++++++++++++++---------
 paddle/fluid/jit/serializer.cc       |  8 ++++++--
 paddle/fluid/jit/serializer.h        | 20 +++++++++++++++-----
 paddle/fluid/jit/serializer_utils.cc |  5 +++++
 paddle/fluid/jit/serializer_utils.h  |  9 +++++++--
 python/setup.py.in                   | 12 ++++++++++--
 15 files changed, 123 insertions(+), 48 deletions(-)
 create mode 100644 paddle/fluid/jit/all.h

diff --git a/paddle/fluid/jit/all.h b/paddle/fluid/jit/all.h
new file mode 100644
index 0000000000000..5a571a72a2824
--- /dev/null
+++ b/paddle/fluid/jit/all.h
@@ -0,0 +1,20 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "base_function.h"
+#include "layer.h"
+#include "serializer.h"
+#include "serializer_utils.h"
diff --git a/paddle/fluid/jit/base_function.h b/paddle/fluid/jit/base_function.h
index df774d8fd84c7..50dadaf4ae227 100644
--- a/paddle/fluid/jit/base_function.h
+++ b/paddle/fluid/jit/base_function.h
@@ -15,7 +15,6 @@
 #pragma once
 
 #include "paddle/phi/api/include/tensor.h"
-#include "paddle/phi/core/dense_tensor.h"
 
 namespace paddle {
 namespace jit {
diff --git a/paddle/fluid/jit/compilation_unit.cc b/paddle/fluid/jit/compilation_unit.cc
index 60d42d045b0e3..5a434fba176d3 100644
--- a/paddle/fluid/jit/compilation_unit.cc
+++ b/paddle/fluid/jit/compilation_unit.cc
@@ -16,6 +16,8 @@
 
 #include "paddle/phi/core/enforce.h"
 
+#include "paddle/fluid/jit/base_function.h"
+
 namespace paddle {
 namespace jit {
 
diff --git a/paddle/fluid/jit/compilation_unit.h b/paddle/fluid/jit/compilation_unit.h
index 45a771b649401..535e92fe88473 100644
--- a/paddle/fluid/jit/compilation_unit.h
+++ b/paddle/fluid/jit/compilation_unit.h
@@ -14,13 +14,14 @@
 
 #pragma once
 
+#include <memory>
 #include <string>
 #include <unordered_map>
-
-#include "paddle/fluid/jit/base_function.h"
+#include <vector>
 
 namespace paddle {
 namespace jit {
+class BaseFunction;
 using Name2FunctionMap =
     std::unordered_map<std::string, std::shared_ptr<BaseFunction>>;
 
diff --git a/paddle/fluid/jit/function_schema.cc b/paddle/fluid/jit/function_schema.cc
index 20cbcfdbd1c88..8150d3b2e7589 100644
--- a/paddle/fluid/jit/function_schema.cc
+++ b/paddle/fluid/jit/function_schema.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/jit/function_schema.h"
 
+#include "paddle/fluid/framework/program_desc.h"
 #include "paddle/phi/core/enforce.h"
 
 #include "paddle/fluid/jit/function_utils.h"
@@ -52,14 +53,13 @@ void FunctionSchema::AddOutputArg(const std::string& name) {
 FunctionInfo::FunctionInfo(const std::string& func_name,
                            const std::vector<std::string>& param_names,
                            const framework::ProgramDesc& program_desc)
-    : func_name_(func_name),
-      param_names_(param_names),
-      program_desc_(program_desc) {
+    : func_name_(func_name), param_names_(param_names) {
+  program_desc_.reset(new framework::ProgramDesc(program_desc));
   // Parse FunctionSchema
-  for (auto& in_name : program_desc_.GetFeedTargetNames()) {
+  for (auto& in_name : program_desc_->GetFeedTargetNames()) {
     schema_.AddInputArg(in_name);
   }
-  for (auto& out_name : program_desc_.GetFetchTargetNames()) {
+  for (auto& out_name : program_desc_->GetFetchTargetNames()) {
     schema_.AddOutputArg(out_name);
   }
 }
@@ -67,7 +67,7 @@ FunctionInfo::FunctionInfo(const std::string& func_name,
 const std::string& FunctionInfo::FunctionName() const { return func_name_; }
 
 const framework::ProgramDesc& FunctionInfo::ProgramDesc() const {
-  return program_desc_;
+  return *program_desc_.get();
 }
 
 const std::vector<std::string>& FunctionInfo::ParamNames() const {
@@ -83,7 +83,7 @@ const std::vector<std::string> FunctionInfo::OutputArgNames() const {
 }
 
 void FunctionInfo::RemoveDescFeedFetch() {
-  utils::RemoveFeedFetch(&program_desc_);
+  utils::RemoveFeedFetch(program_desc_.get());
 }
 
 }  // namespace jit
diff --git a/paddle/fluid/jit/function_schema.h b/paddle/fluid/jit/function_schema.h
index 5dcea8517e40e..9f593dd7eee24 100644
--- a/paddle/fluid/jit/function_schema.h
+++ b/paddle/fluid/jit/function_schema.h
@@ -14,15 +14,17 @@
 
 #pragma once
 
+#include <memory>
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/variable.h"
-
 namespace paddle {
+
+namespace framework {
+class ProgramDesc;
+}  // namespace framework
+
 namespace jit {
-using Variable = paddle::framework::Variable;
 
 class Argument {
  public:
@@ -75,7 +77,7 @@ class FunctionInfo {
  private:
   std::string func_name_;
   std::vector<std::string> param_names_;
-  framework::ProgramDesc program_desc_;
+  std::shared_ptr<framework::ProgramDesc> program_desc_;
   FunctionSchema schema_;
 };
 
diff --git a/paddle/fluid/jit/function_utils.cc b/paddle/fluid/jit/function_utils.cc
index a6da061de99dc..83da12d2652a3 100644
--- a/paddle/fluid/jit/function_utils.cc
+++ b/paddle/fluid/jit/function_utils.cc
@@ -15,7 +15,9 @@
 #include "paddle/fluid/jit/function_utils.h"
 
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/framework/variable.h"
 #include "paddle/phi/core/enforce.h"
 
 namespace paddle {
@@ -75,7 +77,7 @@ void ShareParamsIntoScope(const std::vector<std::string> &param_names,
   for (size_t i = 0; i < param_names.size(); ++i) {
     std::string name = param_names[i];
     auto &param = params_dict.find(name)->second;
-    auto &dense_tensor = param.Get<DenseTensor>();
+    auto &dense_tensor = param->Get<DenseTensor>();
     VLOG(3) << "share into scope: " << name;
     auto *var = scope->Var(name);
     auto *dst_tensor = var->GetMutable<DenseTensor>();
diff --git a/paddle/fluid/jit/function_utils.h b/paddle/fluid/jit/function_utils.h
index ba1eaf7308be9..90e2e4b7f798f 100644
--- a/paddle/fluid/jit/function_utils.h
+++ b/paddle/fluid/jit/function_utils.h
@@ -18,18 +18,23 @@
 #include <unordered_map>
 #include <vector>
 
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/variable.h"
 #include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/common/place.h"
-#include "paddle/phi/core/dense_tensor.h"
 
 #include "paddle/fluid/jit/function_schema.h"
 
 namespace paddle {
+
+namespace framework {
+class Variable;
+class ProgramDesc;
+class Scope;
+}  // namespace framework
+
 namespace jit {
 using Variable = paddle::framework::Variable;
-using Name2VariableMap = std::unordered_map<std::string, Variable>;
+using Name2VariableMap =
+    std::unordered_map<std::string, std::shared_ptr<Variable>>;
 using DenseTensor = phi::DenseTensor;
 using Tensor = paddle::experimental::Tensor;
 
diff --git a/paddle/fluid/jit/layer.cc b/paddle/fluid/jit/layer.cc
index f5985d71b0347..0e981bc45957f 100644
--- a/paddle/fluid/jit/layer.cc
+++ b/paddle/fluid/jit/layer.cc
@@ -14,17 +14,21 @@
 
 #include "paddle/fluid/jit/layer.h"
 
+#include "paddle/fluid/framework/variable.h"
+
+#include "paddle/fluid/jit/base_function.h"
+#include "paddle/fluid/jit/compilation_unit.h"
+#include "paddle/fluid/jit/function_schema.h"
+
 namespace paddle {
 namespace jit {
-Layer::Layer(const std::vector<std::shared_ptr<FunctionInfo>>& infos,
-             const Name2VariableMap& params_dict,
-             const phi::Place& place)
+Layer::Layer(const Name2VariableMap& params_dict, const phi::Place& place)
     : params_dict_(params_dict) {
-  VLOG(3) << "infos size: " << infos.size();
+  unit_.reset(new CompilationUnit());
 }
 
 std::shared_ptr<BaseFunction> Layer::Function(const std::string& name) const {
-  return unit_.Function(name);
+  return unit_->Function(name);
 }
 
 std::vector<Tensor> Layer::forward(const std::vector<Tensor>& inputs) {
@@ -42,15 +46,15 @@ void Layer::to(const phi::Place& place) {}
 
 void Layer::SetFunction(const std::string& name,
                         const std::shared_ptr<BaseFunction>& function) {
-  unit_.SetFunction(name, function);
+  unit_->SetFunction(name, function);
 }
 
 std::vector<std::string> Layer::FunctionNames() const {
-  return unit_.FunctionNames();
+  return unit_->FunctionNames();
 }
 
 const Name2FunctionMap& Layer::FunctionMap() const {
-  return unit_.FunctionMap();
+  return unit_->FunctionMap();
 }
 
 }  // namespace jit
diff --git a/paddle/fluid/jit/layer.h b/paddle/fluid/jit/layer.h
index ee75881fc3156..b2efa77fedf52 100644
--- a/paddle/fluid/jit/layer.h
+++ b/paddle/fluid/jit/layer.h
@@ -18,23 +18,31 @@
 #include <unordered_map>
 #include <vector>
 
-#include "paddle/fluid/framework/variable.h"
+#include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/common/place.h"
 
-#include "paddle/fluid/jit/base_function.h"
-#include "paddle/fluid/jit/compilation_unit.h"
-#include "paddle/fluid/jit/function_schema.h"
+#include "base_function.h"
 
 namespace paddle {
+
+namespace framework {
+class Variable;
+}  // namespace framework
+
 namespace jit {
+class CompilationUnit;
+
+using DenseTensor = phi::DenseTensor;
+using Tensor = paddle::experimental::Tensor;
 using Variable = paddle::framework::Variable;
-using Name2VariableMap = std::unordered_map<std::string, Variable>;
+using Name2VariableMap =
+    std::unordered_map<std::string, std::shared_ptr<Variable>>;
+using Name2FunctionMap =
+    std::unordered_map<std::string, std::shared_ptr<BaseFunction>>;
 
 class Layer {
  public:
-  Layer(const std::vector<std::shared_ptr<FunctionInfo>>& infos,
-        const Name2VariableMap& params_dict,
-        const phi::Place& place);
+  Layer(const Name2VariableMap& params_dict, const phi::Place& place);
 
   std::shared_ptr<BaseFunction> Function(const std::string& name) const;
 
@@ -56,7 +64,7 @@ class Layer {
  private:
   Name2VariableMap params_dict_;
   Name2VariableMap attrs_dict_;
-  CompilationUnit unit_;
+  std::shared_ptr<CompilationUnit> unit_;
 };
 
 }  // namespace jit
diff --git a/paddle/fluid/jit/serializer.cc b/paddle/fluid/jit/serializer.cc
index 2dee9ee879a22..c24995f711826 100644
--- a/paddle/fluid/jit/serializer.cc
+++ b/paddle/fluid/jit/serializer.cc
@@ -16,10 +16,14 @@
 
 #include <set>
 
+#include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/device_context.h"
 
 #include "paddle/fluid/jit/executor_function.h"
+#include "paddle/fluid/jit/layer.h"
 #include "paddle/fluid/jit/pe_function.h"
+#include "paddle/fluid/jit/property.h"
 #include "paddle/fluid/jit/serializer_utils.h"
 
 DECLARE_string(jit_engine_type);
@@ -55,7 +59,7 @@ Layer Deserializer::operator()(const std::string& path,
   ReadTensorData(path + PDPARAMS_SUFFIX, param_names_set, place, &params_dict);
   // ReadAttributeData();
 
-  Layer layer = Layer(infos, params_dict, place);
+  Layer layer = Layer(params_dict, place);
 
   for (auto& info : infos) {
     if (FLAGS_jit_engine_type == "Executor") {
@@ -90,7 +94,7 @@ void Deserializer::ReadTensorData(const std::string& file_name,
     // TODO(dev): Support framework::Vocab
     DenseTensor* dense_tesnor = v.GetMutable<DenseTensor>();
     framework::DeserializeFromStream(fin, dense_tesnor, dev_ctx);
-    (*params_dict)[*it] = v;
+    (*params_dict)[*it] = std::make_shared<Variable>(v);
   }
 }
 
diff --git a/paddle/fluid/jit/serializer.h b/paddle/fluid/jit/serializer.h
index bdc3b81d55193..188239f469a57 100644
--- a/paddle/fluid/jit/serializer.h
+++ b/paddle/fluid/jit/serializer.h
@@ -14,16 +14,26 @@
 
 #pragma once
 
+#include <memory>
+#include <set>
 #include <string>
+#include <unordered_map>
 
-#include "paddle/fluid/framework/var_desc.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/jit/property.h"
-
-#include "paddle/fluid/jit/layer.h"
+#include "paddle/phi/common/place.h"
 
 namespace paddle {
+
+namespace framework {
+class Variable;
+class ProgramDesc;
+}  // namespace framework
+
 namespace jit {
+class Layer;
+using Variable = paddle::framework::Variable;
+using Name2VariableMap =
+    std::unordered_map<std::string, std::shared_ptr<Variable>>;
+
 // Export Layer into local disk
 class Serializer {
  public:
diff --git a/paddle/fluid/jit/serializer_utils.cc b/paddle/fluid/jit/serializer_utils.cc
index e68d75f58d56d..41bfa71b4ce25 100644
--- a/paddle/fluid/jit/serializer_utils.cc
+++ b/paddle/fluid/jit/serializer_utils.cc
@@ -17,6 +17,7 @@
 #include <dirent.h>
 #include <fstream>
 
+#include "paddle/fluid/framework/phi_utils.h"
 #include "paddle/fluid/framework/var_desc.h"
 
 namespace paddle {
@@ -100,6 +101,10 @@ const std::vector<std::pair<std::string, std::string>> PdmodelFilePaths(
   return pdmodel_paths;
 }
 
+void InitKernelSignatureMap() {
+  paddle::framework::InitDefaultKernelSignatureMap();
+}
+
 }  // namespace utils
 }  // namespace jit
 }  // namespace paddle
diff --git a/paddle/fluid/jit/serializer_utils.h b/paddle/fluid/jit/serializer_utils.h
index dfa980544bc31..97850504d9661 100644
--- a/paddle/fluid/jit/serializer_utils.h
+++ b/paddle/fluid/jit/serializer_utils.h
@@ -17,9 +17,12 @@
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/framework/var_desc.h"
-
 namespace paddle {
+
+namespace framework {
+class VarDesc;
+}  // namespace framework
+
 namespace jit {
 static const char PDMODEL_SUFFIX[] = ".pdmodel";
 static const char PDPARAMS_SUFFIX[] = ".pdiparams";
@@ -40,6 +43,8 @@ bool FileExists(const std::string& file_path);
 const std::vector<std::pair<std::string, std::string>> PdmodelFilePaths(
     const std::string& path);
 
+void InitKernelSignatureMap();
+
 }  // namespace utils
 }  // namespace jit
 }  // namespace paddle
diff --git a/python/setup.py.in b/python/setup.py.in
index c02ef7f017fca..1b36b272d0d70 100755
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -621,8 +621,12 @@ headers = (
     list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/kernels', recursive=True)) +  # phi kernels headers
     # capi headers
     list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/capi', recursive=True)) +  # phi capi headers
-    # utila api headers
-    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/utils', recursive=True)))  # paddle utils headers
+    # utils api headers
+    list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/utils', recursive=True))) # paddle utils headers
+
+jit_layer_headers = ['layer.h', 'serializer.h', 'serializer_utils.h', 'all.h', 'base_function.h']
+for f in jit_layer_headers:
+    headers += list(find_files(f, '@PADDLE_SOURCE_DIR@/paddle/fluid/jit', recursive=False))
 
 if '${WITH_MKLDNN}' == 'ON':
     headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn
@@ -667,6 +671,10 @@ class InstallHeaders(Command):
         elif 'third_party' not in header:
             # paddle headers
             install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header)
+            print('install_dir: ', install_dir)
+            if 'fluid/jit' in install_dir:
+                install_dir = re.sub('fluid/jit', 'jit', install_dir)
+                print('fluid/jit install_dir: ', install_dir)
         else:
             # third_party
             install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header)

From 2bf574501fb615b77953255c1e29d4dc897a6d16 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Wed, 27 Jul 2022 14:18:07 +0800
Subject: [PATCH 15/28] [jit]  jit.save support property serialization (#44581)

* jit.save support peropty serilization

* extract set property function

* fix property test file name

* fix typing error

* fix typing error

* fix test coverage
---
 paddle/fluid/pybind/protobuf.cc               |  9 ---
 python/paddle/fluid/dygraph/io.py             |  1 +
 python/paddle/fluid/dygraph/jit.py            | 41 ++++++++++++-
 ...erty_save.py => test_jit_property_save.py} |  0
 .../tests/unittests/test_jit_save_load.py     | 57 +++++++++++++++++--
 5 files changed, 91 insertions(+), 17 deletions(-)
 rename python/paddle/fluid/tests/unittests/dygraph_to_static/{test_property_save.py => test_jit_property_save.py} (100%)

diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index 25d72f0246400..4cdf135b8ed59 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -433,15 +433,6 @@ void BindJitProperty(pybind11::module *m) {
            "set list of string",
            py::arg("name"),
            py::arg("val"))
-      .def("set_tensor",
-           [](const pd::VarDesc &tensor, const std::string name) {
-             throw platform::errors::Unimplemented("Not implement set_tensor.");
-           })
-      .def(
-          "set_tensors",
-          [](const pybind11::list &tensors, const std::string name) {
-            throw platform::errors::Unimplemented("Not implement set_tensors.");
-          })
       .def("serialize_to_string", SerializeMessage<jit::Property>)
       .def("parse_from_string", DeserializeMessage<jit::Property>);
 }
diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py
index a778cc3a1c688..7f91a15ff0149 100644
--- a/python/paddle/fluid/dygraph/io.py
+++ b/python/paddle/fluid/dygraph/io.py
@@ -37,6 +37,7 @@
 INFER_MODEL_SUFFIX = ".pdmodel"
 INFER_PARAMS_SUFFIX = ".pdiparams"
 INFER_PARAMS_INFO_SUFFIX = ".pdiparams.info"
+INFER_PROPERTY_SUFFIX = '.meta'
 
 LOADED_VAR_SUFFIX = "load"
 PARAMETER_NAME_PREFIX = "param"
diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py
index c3c3838f4be03..f96cbb82a7359 100644
--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
@@ -22,6 +22,7 @@
 from collections import OrderedDict
 import inspect
 import threading
+from typing import Text, Tuple, Any, List
 
 import six
 import paddle
@@ -34,7 +35,7 @@
 from paddle.fluid.dygraph.dygraph_to_static.convert_call_func import ConversionOptions, CONVERSION_OPTIONS
 from paddle.fluid.dygraph.dygraph_to_static.logging_utils import set_code_level, set_verbosity
 from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, StaticFunction, unwrap_decorators
-from paddle.fluid.dygraph.io import TranslatedLayer, INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX
+from paddle.fluid.dygraph.io import TranslatedLayer, INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX, INFER_PROPERTY_SUFFIX
 from paddle.fluid.dygraph.layers import Layer
 from paddle.fluid.executor import Executor, scope_guard
 from paddle.fluid.framework import Block, ParamBase, Program, Variable, Parameter, EagerParamBase
@@ -644,6 +645,40 @@ def wrapper(layer, path, input_spec=None, **configs):
     return wrapper
 
 
+def _save_property(filename: Text, property_vals: List[Tuple[Any, Text]]):
+    """class property serialization.
+
+    Args:
+        filename (Text): *.meta
+        property_vals (List[Tuple): class property.
+    """
+
+    def set_property(meta, key, val):
+        if isinstance(val, float):
+            meta.set_float(key, val)
+        elif isinstance(val, int):
+            meta.set_int(key, val)
+        elif isinstance(val, str):
+            meta.set_string(key, val)
+        elif isinstance(val, (tuple, list)):
+            if isinstance(val[0], float):
+                meta.set_floats(key, val)
+            elif isinstance(val[0], int):
+                meta.set_ints(key, val)
+            elif isinstance(val[0], str):
+                meta.set_strings(key, val)
+        else:
+            raise ValueError(f"Note support val type: {type(val)}")
+        return
+
+    with open(filename, 'wb') as f:
+        meta = paddle.framework.core.Property()
+        for item in property_vals:
+            val, key = item[0], item[1]
+            set_property(meta, key, val)
+        f.write(meta.serialize_to_string())
+
+
 @_run_save_pre_hooks
 @switch_to_static_graph
 def save(layer, path, input_spec=None, **configs):
@@ -1043,7 +1078,9 @@ def fun(inputs):
                                         filter(paddle.fluid.io.is_persistable,
                                                ordered_vars)),
                                     filename=params_filename)
-        # TODO: save property
+        # save property
+        property_filename = file_prefix + INFER_PROPERTY_SUFFIX
+        _save_property(property_filename, property_vals)
 
     # NOTE(chenweihang): [ Save extra variable info ]
     # save_inference_model will lose some important variable information, including:
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_property_save.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_jit_property_save.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/dygraph_to_static/test_property_save.py
rename to python/paddle/fluid/tests/unittests/dygraph_to_static/test_jit_property_save.py
diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
index eab86141ba6b1..6aef26ac65ba0 100644
--- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1156,7 +1157,7 @@ def forward(self, x):
 class Net(paddle.nn.Layer):
 
     def __init__(self):
-        super(Net, self).__init__()
+        super().__init__()
         self.fc1 = paddle.nn.Linear(4, 4)
         self.fc2 = paddle.nn.Linear(4, 4)
         self.bias = 0.4
@@ -1185,13 +1186,49 @@ def infer(self, input):
     def fbias(self):
         return self.bias + 1
 
-    # For extra Tensor
+    @paddle.jit.to_static(property=True)
+    def down_sampling(self):
+        return 4
+
+    @paddle.jit.to_static(property=True)
+    def fstr(self):
+        return "save str property"
+
+    @paddle.jit.to_static(property=True)
+    def ints(self):
+        return [10, 20]
+
+    @paddle.jit.to_static(property=True)
+    def floats(self):
+        return [1.1, 2.2]
+
+    @paddle.jit.to_static(property=True)
+    def strs(self):
+        return ["hello", "world"]
+
+
+class NetTensor(paddle.nn.Layer):
+
+    def __init__(self):
+        super().__init__()
+        self.fc1 = paddle.nn.Linear(4, 4)
+        self.fc2 = paddle.nn.Linear(4, 4)
+        self.bias = 0.4
+        self.flag = paddle.ones([2], dtype="int32")
+
+    @paddle.jit.to_static(input_spec=[InputSpec([None, 4], dtype='float32')])
+    def forward(self, x):
+        out = self.fc1(x)
+        out = paddle.nn.functional.relu(out)
+        out = paddle.mean(out)
+        return out
+
     @paddle.jit.to_static(property=True)
     def fflag(self):
-        return self.flag
+        return True
 
 
-class TestJitSaveCombine(unittest.TestCase):
+class TestJitSaveCombineProperty(unittest.TestCase):
 
     def setUp(self):
         # enable dygraph mode
@@ -1201,16 +1238,24 @@ def setUp(self):
     def tearDown(self):
         self.temp_dir.cleanup()
 
-    def test_save_load_finetune_load(self):
+    def test_jit_save_combine_property(self):
         model_path = os.path.join(self.temp_dir.name,
                                   "test_jit_save_combine/model")
-
         # Use new namespace
         with unique_name.guard():
             net = Net()
         #save
         paddle.jit.save(net, model_path, combine_params=True)
 
+    def test_jit_save_tensor_property(self):
+        model_path = os.path.join(self.temp_dir.name,
+                                  "test_jit_save_combine/model")
+        # Use new namespace
+        with unique_name.guard():
+            net = NetTensor()
+
+        paddle.jit.save(net, model_path, combine_params=True)
+
 
 class LayerLoadFinetune(paddle.nn.Layer):
 

From 16506d8ea5d9896c3bc3de41e43bc78785e8a38d Mon Sep 17 00:00:00 2001
From: niuliling123 <51102941+niuliling123@users.noreply.github.com>
Date: Wed, 27 Jul 2022 14:26:25 +0800
Subject: [PATCH 16/28] Replaced add_custom_command with add_custom_target in
 xpu_kp_cmake (#44619)

* Replaced add_custom_command with add_custom_target in xpu_kp_cmake
---
 cmake/xpu_kp.cmake | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cmake/xpu_kp.cmake b/cmake/xpu_kp.cmake
index 6692f24dd6ae9..8ba20bd14dd43 100644
--- a/cmake/xpu_kp.cmake
+++ b/cmake/xpu_kp.cmake
@@ -168,11 +168,13 @@ macro(compile_kernel COMPILE_ARGS)
   else()
     set(ABI_VERSION "-D_GLIBCXX_USE_CXX11_ABI=1")
   endif()
+  add_custom_target(
+    ${kernel_name}.xpu ALL
+    COMMAND ${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps
+            kernel_build/${kernel_name}.xpu)
   add_custom_command(
     OUTPUT kernel_build/${kernel_name}.bin.o
     COMMAND ${CMAKE_COMMAND} -E make_directory kernel_build
-    COMMAND ${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps
-            kernel_build/${kernel_name}.xpu
     COMMAND
       ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 ${ABI_VERSION} ${OPT_LEVEL}
       -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS}
@@ -189,8 +191,6 @@ macro(compile_kernel COMPILE_ARGS)
   add_custom_command(
     OUTPUT kernel_build/${kernel_name}.host.o
     COMMAND ${CMAKE_COMMAND} -E make_directory kernel_build
-    COMMAND ${CMAKE_COMMAND} -E copy ${kernel_path}/${kernel_name}.kps
-            kernel_build/${kernel_name}.xpu
     COMMAND
       ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 ${ABI_VERSION} ${OPT_LEVEL}
       -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS}

From 4b7fe610a6e8d52fc61f53e87796e738fdbb401e Mon Sep 17 00:00:00 2001
From: caozhou <48191911+Caozhou1995@users.noreply.github.com>
Date: Wed, 27 Jul 2022 16:18:08 +0800
Subject: [PATCH 17/28] add adagrad and rmsprop yaml (#44631)

---
 paddle/phi/api/yaml/legacy_api.yaml           | 22 +++++++++++++++++++
 python/paddle/fluid/optimizer.py              | 19 ++++++++++++++--
 .../fluid/tests/unittests/test_adagrad_op.py  |  1 -
 3 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml
index bd48617037d28..77c58816de694 100644
--- a/paddle/phi/api/yaml/legacy_api.yaml
+++ b/paddle/phi/api/yaml/legacy_api.yaml
@@ -48,6 +48,17 @@
   kernel :
     func : adadelta
 
+- api : adagrad_
+  args : (Tensor param, Tensor grad, Tensor moment, Tensor learning_rate, float epsilon)
+  output : Tensor(param_out), Tensor(moment_out)
+  infer_meta :
+    func : AdagradInferMeta
+  kernel :
+    func : adagrad {dense, dense, dense, dense -> dense, dense}
+           adagrad_dense_param_sparse_grad {dense, selected_rows, dense, dense -> dense, dense}
+    data_type : param
+  inplace : (param -> param_out), (moment -> moment_out)
+
 - api : adam_
   args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, bool lazy_mode, int64_t min_row_size_to_use_multithread, bool multi_precision, bool use_global_beta_pow)
   output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_outs)
@@ -1851,6 +1862,17 @@
     func : reverse_array
   backward : reverse_array_grad
 
+- api : rmsprop_
+  args : (Tensor param, Tensor mean_square, Tensor grad, Tensor moment, Tensor learning_rate, Tensor mean_grad, float epsilon, float decay, float momentum, bool centered)
+  output : Tensor(param_out), Tensor(moment_out), Tensor(mean_square_out), Tensor(mean_grad_out)
+  infer_meta :
+    func : RmspropInferMeta
+  kernel :
+    func : rmsprop {dense, dense, dense, dense, dense, dense -> dense, dense, dense, dense}
+           rmsprop_dense_param_sparse_grad {dense, dense, selected_rows, dense, dense, dense -> dense, dense, dense, dense}
+    optional : mean_grad
+  inplace : (param -> param_out), (moment -> moment_out), (mean_square -> mean_square_out), (mean_grad -> mean_grad_out)
+
 - api : roi_align
   args : (Tensor x, Tensor boxes, Tensor boxes_num, int pooled_height, int pooled_width, float spatial_scale, int sampling_ratio, bool aligned)
   output : Tensor
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index c97809a069d5c..a3c68099089a3 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -2279,11 +2279,18 @@ def _append_optimize_op(self, block, param_and_grad):
 
         moment_acc = self._get_accumulator(self._moment_acc_str,
                                            param_and_grad[0])
-        if framework._non_static_mode():
+        if in_dygraph_mode():
+            _C_ops.final_state_adagrad_(param_and_grad[0], param_and_grad[1],
+                                        moment_acc,
+                                        self._create_param_lr(param_and_grad),
+                                        self._epsilon)
+            return None
+        elif _in_legacy_dygraph():
             _C_ops.adagrad(param_and_grad[0], param_and_grad[1], moment_acc,
                            self._create_param_lr(param_and_grad),
                            param_and_grad[0], moment_acc, "epsilon",
                            self._epsilon)
+            return None
         else:
             # Create the adagrad optimizer op
             adagrad_op = block.append_op(
@@ -3374,7 +3381,14 @@ def _append_optimize_op(self, block, param_and_grad):
                                                 param_and_grad[0])
         mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str,
                                               param_and_grad[0])
-        if framework._non_static_mode():
+        if in_dygraph_mode():
+            _C_ops.final_state_rmsprop_(param_and_grad[0], mean_square_acc,
+                                        param_and_grad[1], momentum_acc,
+                                        self._create_param_lr(param_and_grad),
+                                        mean_grad_acc, self._epsilon, self._rho,
+                                        self._momentum, self._centered)
+            return None
+        elif _in_legacy_dygraph():
             _C_ops.rmsprop(param_and_grad[0], mean_square_acc,
                            self._create_param_lr(param_and_grad),
                            param_and_grad[1], momentum_acc, param_and_grad[0],
@@ -3382,6 +3396,7 @@ def _append_optimize_op(self, block, param_and_grad):
                            "epsilon", self._epsilon, "decay", self._rho,
                            "momentum", self._momentum, "centered",
                            self._centered)
+            return None
         else:
             rmsprop_op = block.append_op(
                 type=self.type,
diff --git a/python/paddle/fluid/tests/unittests/test_adagrad_op.py b/python/paddle/fluid/tests/unittests/test_adagrad_op.py
index 4f290d4befa52..5f5d41ec2c062 100644
--- a/python/paddle/fluid/tests/unittests/test_adagrad_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adagrad_op.py
@@ -29,7 +29,6 @@ class TestAdagradOp1(OpTest):
 
     def setUp(self):
         self.op_type = "adagrad"
-
         param = np.random.random((123, 321)).astype("float32")
         grad = np.random.random((123, 321)).astype("float32")
         moment = np.zeros((123, 321)).astype("float32")

From b20f771f71d3dea863c8e0fc487f402da4b4fd85 Mon Sep 17 00:00:00 2001
From: freeliuzc <lzc842650834@gmail.com>
Date: Wed, 27 Jul 2022 16:21:14 +0800
Subject: [PATCH 18/28] [phi] move crop_tensor kernel from fluid to phi
 (#44574)

* move crop_tensor from fluid to phi

* delete fluid header files

* fix crop_tensor_op dygraph_mode bug

* modify header files, add out tensor check
---
 paddle/fluid/operators/crop_tensor_op.cc      |  33 +-
 paddle/fluid/operators/crop_tensor_op.h       | 350 ------------------
 paddle/phi/api/yaml/legacy_api.yaml           |  10 +
 paddle/phi/api/yaml/legacy_backward.yaml      |  10 +
 paddle/phi/infermeta/backward.cc              |  12 +
 paddle/phi/infermeta/backward.h               |   5 +
 paddle/phi/infermeta/unary.cc                 |  41 ++
 paddle/phi/infermeta/unary.h                  |   6 +
 .../kernels/cpu/crop_tensor_grad_kernel.cc    |  28 ++
 paddle/phi/kernels/cpu/crop_tensor_kernel.cc  |  28 ++
 paddle/phi/kernels/crop_tensor_grad_kernel.h  |  29 ++
 paddle/phi/kernels/crop_tensor_kernel.h       |  29 ++
 .../kernels/gpu/crop_tensor_grad_kernel.cu    |  28 ++
 paddle/phi/kernels/gpu/crop_tensor_kernel.cu  |  28 ++
 .../impl/crop_tensor_grad_kernel_impl.h       | 105 ++++++
 .../kernels/impl/crop_tensor_kernel_impl.h    | 174 +++++++++
 paddle/phi/ops/compat/crop_tensor_sig.cc      |  74 ++++
 .../tests/unittests/test_crop_tensor_op.py    |   2 +
 python/paddle/tensor/manipulation.py          |   4 +
 19 files changed, 618 insertions(+), 378 deletions(-)
 delete mode 100644 paddle/fluid/operators/crop_tensor_op.h
 create mode 100644 paddle/phi/kernels/cpu/crop_tensor_grad_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/crop_tensor_kernel.cc
 create mode 100644 paddle/phi/kernels/crop_tensor_grad_kernel.h
 create mode 100644 paddle/phi/kernels/crop_tensor_kernel.h
 create mode 100644 paddle/phi/kernels/gpu/crop_tensor_grad_kernel.cu
 create mode 100644 paddle/phi/kernels/gpu/crop_tensor_kernel.cu
 create mode 100644 paddle/phi/kernels/impl/crop_tensor_grad_kernel_impl.h
 create mode 100644 paddle/phi/kernels/impl/crop_tensor_kernel_impl.h
 create mode 100644 paddle/phi/ops/compat/crop_tensor_sig.cc

diff --git a/paddle/fluid/operators/crop_tensor_op.cc b/paddle/fluid/operators/crop_tensor_op.cc
index f72175d4d5338..52106c74314a4 100644
--- a/paddle/fluid/operators/crop_tensor_op.cc
+++ b/paddle/fluid/operators/crop_tensor_op.cc
@@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/crop_tensor_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 
-#include <memory>
-#include <string>
-#include <vector>
+// TODO(freeliuzc): Delete old infershape
+// New infershape has already in unary.h and backward.h
 
 namespace paddle {
 namespace operators {
@@ -297,8 +296,8 @@ class CropTensorGradOpMaker : public framework::SingleGradOpMaker<T> {
  protected:
   void Apply(GradOpPtr<T> op) const override {
     op->SetType("crop_tensor_grad");
-    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
     op->SetInput("X", this->Input("X"));
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
     if (this->HasInput("OffsetsTensor")) {
       op->SetInput("OffsetsTensor", this->Input("OffsetsTensor"));
     }
@@ -314,32 +313,10 @@ class CropTensorGradOpMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+
 REGISTER_OPERATOR(crop_tensor,
                   ops::CropTensorOp,
                   ops::CropTensorOpMaker,
                   ops::CropTensorGradOpMaker<paddle::framework::OpDesc>,
                   ops::CropTensorGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(crop_tensor_grad, ops::CropTensorOpGrad);
-REGISTER_OP_CPU_KERNEL(crop_tensor,
-                       ops::CropTensorKernel<phi::CPUContext, float>,
-                       ops::CropTensorKernel<phi::CPUContext, double>,
-                       ops::CropTensorKernel<phi::CPUContext, int>,
-                       ops::CropTensorKernel<phi::CPUContext, int64_t>);
-REGISTER_OP_CPU_KERNEL(crop_tensor_grad,
-                       ops::CropTensorGradKernel<phi::CPUContext, float>,
-                       ops::CropTensorGradKernel<phi::CPUContext, double>,
-                       ops::CropTensorGradKernel<phi::CPUContext, int>,
-                       ops::CropTensorGradKernel<phi::CPUContext, int64_t>);
-
-REGISTER_OP_CUDA_KERNEL(
-    crop_tensor,
-    ops::CropTensorKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::CropTensorKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::CropTensorKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::CropTensorKernel<paddle::platform::CUDADeviceContext, int64_t>);
-REGISTER_OP_CUDA_KERNEL(
-    crop_tensor_grad,
-    ops::CropTensorGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::CropTensorGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::CropTensorGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::CropTensorGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/crop_tensor_op.h b/paddle/fluid/operators/crop_tensor_op.h
deleted file mode 100644
index afaae4d0ac3cd..0000000000000
--- a/paddle/fluid/operators/crop_tensor_op.h
+++ /dev/null
@@ -1,350 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <utility>
-#include <vector>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
-
-namespace paddle {
-namespace operators {  // Internal
-
-template <typename T,
-          size_t D,
-          int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-using framework::Tensor;
-
-inline std::vector<int> get_new_data(
-    const std::vector<const Tensor*>& list_new_tensor) {
-  // get tensor from
-  std::vector<int> vec_new_data;
-  for (size_t i = 0; i < list_new_tensor.size(); ++i) {
-    auto tensor = list_new_tensor[i];
-    PADDLE_ENFORCE_EQ(
-        tensor->dims(),
-        phi::make_ddim({1}),
-        platform::errors::InvalidArgument(
-            "The tensor's shape in list of Op(crop_tensor) should be [1], "
-            "but the value received is %d.",
-            tensor->dims()));
-    if (platform::is_gpu_place(tensor->place())) {
-      framework::Tensor temp;
-      paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
-
-      vec_new_data.push_back(static_cast<int32_t>(*temp.data<int32_t>()));
-    } else {
-      vec_new_data.push_back(static_cast<int32_t>(*tensor->data<int32_t>()));
-    }
-  }
-
-  return vec_new_data;
-}
-
-static framework::DDim ValidateShape(const std::vector<int> shape,
-                                     const std::vector<int> offsets,
-                                     const framework::DDim& in_dims) {
-  auto in_dim_size = in_dims.size();
-  auto shape_size = shape.size();
-  PADDLE_ENFORCE_EQ(
-      in_dim_size,
-      shape_size,
-      platform::errors::InvalidArgument(
-          "The number of elements (%d) for shape of Op(crop_tensor) should be "
-          "equal to the number of dimensions (%d) of the input tensor.",
-          shape_size,
-          in_dim_size));
-  std::vector<int64_t> output_shape(shape.size(), 0);
-  for (size_t i = 0; i < shape.size(); ++i) {
-    if (shape[i] <= 0 && in_dims[i] > 0) {
-      PADDLE_ENFORCE_NE(shape[i],
-                        0,
-                        platform::errors::InvalidArgument(
-                            "The value (%d) of the %uth element for shape of "
-                            "Op(crop_tensor) should not be zero.",
-                            shape[i],
-                            i));
-      PADDLE_ENFORCE_EQ(shape[i],
-                        -1,
-                        platform::errors::InvalidArgument(
-                            "When the value (%d) of the %uth "
-                            "element for shape of Op(crop_tensor)"
-                            " is negative, only -1 is supported.",
-                            shape[i],
-                            i));
-      output_shape[i] = in_dims[i] - offsets[i];
-    } else {
-      output_shape[i] = static_cast<int64_t>(shape[i]);
-    }
-  }
-
-  return phi::make_ddim(output_shape);
-}
-
-static std::vector<int> GetShape(const framework::ExecutionContext& ctx) {
-  std::vector<int> res;
-  int rank = ctx.Input<Tensor>("X")->dims().size();
-  auto list_new_shape_tensor = ctx.MultiInput<framework::Tensor>("ShapeTensor");
-  if (list_new_shape_tensor.size() > 0) {
-    // have offsets tensor list
-    PADDLE_ENFORCE_EQ(
-        list_new_shape_tensor.size(),
-        rank,
-        platform::errors::InvalidArgument(
-            "The number of tensors (%d) for the input ShapeTensor of "
-            "Op(crop_tensor) must be equal to the number of "
-            "dimensions (%d) of the input.",
-            list_new_shape_tensor.size(),
-            rank));
-    res = get_new_data(list_new_shape_tensor);
-
-    return res;
-  }
-
-  auto* shape_tensor = ctx.HasInput("Shape")
-                           ? ctx.Input<framework::LoDTensor>("Shape")
-                           : nullptr;
-  if (shape_tensor) {
-    auto* shape_data = shape_tensor->data<int>();
-    framework::Tensor cpu_shape_tensor;
-    if (platform::is_gpu_place(shape_tensor->place())) {
-      paddle::framework::TensorCopySync(
-          *shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
-      shape_data = cpu_shape_tensor.data<int>();
-    }
-    res = std::vector<int>(shape_data, shape_data + shape_tensor->numel());
-  }
-
-  return res;
-}
-
-static std::vector<int> GetOffsets(const framework::ExecutionContext& ctx) {
-  std::vector<int> res;
-  int rank = ctx.Input<Tensor>("X")->dims().size();
-  auto list_new_offsets_tensor =
-      ctx.MultiInput<framework::Tensor>("OffsetsTensor");
-  if (list_new_offsets_tensor.size() > 0) {
-    // have offsets tensor list
-    res = get_new_data(list_new_offsets_tensor);
-
-    return res;
-  }
-
-  if (ctx.HasInput("Offsets")) {
-    const auto* offsets_tensor = ctx.Input<Tensor>("Offsets");
-    PADDLE_ENFORCE_EQ(offsets_tensor->dims().size(),
-                      1,
-                      platform::errors::InvalidArgument(
-                          "The number of dimensions of input 'Offsets' must "
-                          "be 1, but the value received is: %d.",
-                          offsets_tensor->dims().size()));
-    PADDLE_ENFORCE_EQ(rank,
-                      offsets_tensor->dims()[0],
-                      platform::errors::InvalidArgument(
-                          "The number of elements (%d) for "
-                          "input 'Offsets' must be equal to "
-                          "the number of dimensions (%d) of the input tensor.",
-                          offsets_tensor->dims()[0],
-                          rank));
-
-    const int* offsets_data;
-    framework::Tensor cpu_tmp_tensor;
-    if (platform::is_cpu_place(offsets_tensor->place())) {
-      offsets_data = offsets_tensor->data<int>();
-    } else {
-      framework::TensorCopySync(
-          *offsets_tensor, platform::CPUPlace(), &cpu_tmp_tensor);
-      offsets_data = cpu_tmp_tensor.data<int>();
-    }
-    res = std::vector<int>(offsets_data, offsets_data + rank);
-  } else {
-    res = ctx.Attr<std::vector<int>>("offsets");
-    PADDLE_ENFORCE_EQ(
-        rank,
-        static_cast<int>(res.size()),
-        platform::errors::InvalidArgument("The number of elements (%d) for "
-                                          "input 'Offsets' must be equal to "
-                                          "the number of dimensions (%d) "
-                                          "of the input tensor.",
-                                          static_cast<int>(res.size()),
-                                          rank));
-  }
-  return res;
-}
-
-template <typename DeviceContext, typename T, size_t D>
-void CropTensorFunction(const framework::ExecutionContext& context) {
-  auto* x = context.Input<Tensor>("X");
-  auto* out = context.Output<Tensor>("Out");
-  auto x_dims = x->dims();
-  auto out_dims = out->dims();
-
-  // get shape from Input(ShapeTensor) of Input(Shape)
-  std::vector<int> shape = GetShape(context);
-  // out_dims set by arrt(shape)
-  if (shape.size() == 0) {
-    for (int i = 0; i < out_dims.size(); ++i) {
-      shape.push_back(out_dims[i]);
-    }
-  }
-
-  auto offsets = GetOffsets(context);
-  out_dims = ValidateShape(shape, offsets, x->dims());
-  out->mutable_data<T>(out_dims, context.GetPlace());
-  for (size_t i = 0; i < offsets.size(); ++i) {
-    PADDLE_ENFORCE_LE(offsets[i] + shape[i],
-                      x_dims[i],
-                      platform::errors::InvalidArgument(
-                          "The sum of the %uth elements of "
-                          "offsets (%d) and shape (%d) of Op(crop_tensor) "
-                          "should be less than or "
-                          "equal to the size of %uth dimension of the input.",
-                          i,
-                          offsets[i],
-                          shape[i],
-                          i));
-  }
-
-  auto x_tensor = EigenTensor<T, D>::From(*x);
-  auto out_tensor = EigenTensor<T, D>::From(*out);
-  Eigen::DSizes<Eigen::DenseIndex, D> e_offsets;
-  Eigen::DSizes<Eigen::DenseIndex, D> e_shape;
-  for (size_t i = 0; i < D; ++i) {
-    e_offsets[i] = offsets[i];
-    e_shape[i] = out->dims()[i];
-  }
-  auto& place =
-      *context.template device_context<DeviceContext>().eigen_device();
-  EigenSlice<std::decay_t<decltype(place)>, T, D>::Eval(
-      place, out_tensor, x_tensor, e_offsets, e_shape);
-}
-
-template <typename DeviceContext, typename T>
-class CropTensorKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    int rank = context.Input<Tensor>("X")->dims().size();
-    PADDLE_ENFORCE_GE(
-        rank,
-        1,
-        platform::errors::InvalidArgument(
-            "The number of dimensions of the input 'x' for "
-            "Op(crop_tensor) must be greater than or equal to 1, but the "
-            "value received is %d.",
-            rank));
-    PADDLE_ENFORCE_LE(
-        rank,
-        6,
-        platform::errors::InvalidArgument(
-            "The number of dimensions of the input 'x' for "
-            "Op(crop_tensor) must be less than or equal to 6, but the "
-            "value received is %d.",
-            rank));
-    switch (rank) {
-      case 1:
-        CropTensorFunction<DeviceContext, T, 1>(context);
-        break;
-      case 2:
-        CropTensorFunction<DeviceContext, T, 2>(context);
-        break;
-      case 3:
-        CropTensorFunction<DeviceContext, T, 3>(context);
-        break;
-      case 4:
-        CropTensorFunction<DeviceContext, T, 4>(context);
-        break;
-      case 5:
-        CropTensorFunction<DeviceContext, T, 5>(context);
-        break;
-      case 6:
-        CropTensorFunction<DeviceContext, T, 6>(context);
-        break;
-    }
-  }
-};
-
-template <typename DeviceContext, typename T, size_t D>
-void CropTensorGradFunction(const framework::ExecutionContext& context) {
-  auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
-  auto* x = context.Input<Tensor>("X");
-  if (d_x != nullptr) {
-    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
-    d_x->mutable_data<T>(x->dims(), context.GetPlace());
-    auto offsets = GetOffsets(context);
-    Eigen::array<std::pair<int64_t, int64_t>, D> paddings;
-    for (size_t i = 0; i < D; ++i) {
-      paddings[i].first = offsets[i];
-      paddings[i].second = d_x->dims()[i] - d_out->dims()[i] - offsets[i];
-    }
-    auto d_x_tensor = EigenTensor<T, D>::From(*d_x);
-    auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    EigenPad<std::decay_t<decltype(place)>, T, D>::Eval(
-        place, d_x_tensor, d_out_tensor, paddings, static_cast<T>(0));
-  }
-}
-
-template <typename DeviceContext, typename T>
-class CropTensorGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    size_t rank =
-        context.Input<Tensor>(framework::GradVarName("Out"))->dims().size();
-    PADDLE_ENFORCE_GE(
-        rank,
-        1,
-        platform::errors::InvalidArgument(
-            "The number of dimensions of the input 'Out@GRAD' for "
-            "Op(crop_tensor_grad) must be greater than or equal to 1, but the "
-            "value received is %d.",
-            rank));
-    PADDLE_ENFORCE_LE(
-        rank,
-        6,
-        platform::errors::InvalidArgument(
-            "The number of dimensions of the input 'Out@GRAD' for "
-            "Op(crop_tensor_grad) must be less than or equal to 6, but the "
-            "value received is %d.",
-            rank));
-    switch (rank) {
-      case 1:
-        CropTensorGradFunction<DeviceContext, T, 1>(context);
-        break;
-      case 2:
-        CropTensorGradFunction<DeviceContext, T, 2>(context);
-        break;
-      case 3:
-        CropTensorGradFunction<DeviceContext, T, 3>(context);
-        break;
-      case 4:
-        CropTensorGradFunction<DeviceContext, T, 4>(context);
-        break;
-      case 5:
-        CropTensorGradFunction<DeviceContext, T, 5>(context);
-        break;
-      case 6:
-        CropTensorGradFunction<DeviceContext, T, 6>(context);
-        break;
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml
index 77c58816de694..6a4afd3d0626b 100644
--- a/paddle/phi/api/yaml/legacy_api.yaml
+++ b/paddle/phi/api/yaml/legacy_api.yaml
@@ -513,6 +513,16 @@
     func : cosh
   backward : cosh_grad
 
+- api : crop_tensor
+  args : (Tensor x, IntArray shape, IntArray offsets)
+  output : Tensor(out)
+  infer_meta :
+    func : CropTensorInferMeta
+  kernel :
+    func : crop_tensor
+    data_type : x
+  backward : crop_tensor_grad
+
 # Part of python API paddle.nn.functional.cross_entropy
 - api : cross_entropy_with_softmax
   args : (Tensor input, Tensor label, bool soft_label, bool use_softmax, bool numeric_stable_mode, int ignore_index, int axis)
diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml
index 310cf7c151ff2..9d73c044dbac9 100644
--- a/paddle/phi/api/yaml/legacy_backward.yaml
+++ b/paddle/phi/api/yaml/legacy_backward.yaml
@@ -481,6 +481,16 @@
     func : cosh_grad
   inplace : (out_grad -> x_grad)
 
+- backward_api : crop_tensor_grad
+  forward : crop_tensor (Tensor x, IntArray shape, IntArray offsets) -> Tensor(out)
+  args : (Tensor x, Tensor out_grad, IntArray offsets)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : CropTensorGradInferMeta
+  kernel :
+    func : crop_tensor_grad
+    data_type : x
+
 - backward_api : cross_entropy_with_softmax_grad
   forward : cross_entropy_with_softmax (Tensor input, Tensor label, bool soft_label, bool use_softmax, bool numeric_stable_mode, int ignore_index, int axis) -> Tensor(softmax), Tensor(loss)
   args : (Tensor label, Tensor softmax, Tensor loss_grad, bool soft_label, bool use_softmax, bool numeric_stable_mode, int ignore_index, int axis)
diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc
index a33b9587c153c..bfae939820ead 100644
--- a/paddle/phi/infermeta/backward.cc
+++ b/paddle/phi/infermeta/backward.cc
@@ -156,6 +156,18 @@ void Conv2dTransposeDoubleGradInferMeta(const MetaTensor& x,
   }
 }
 
+void CropTensorGradInferMeta(const MetaTensor& out_grad,
+                             const MetaTensor& x,
+                             const IntArray& offsets,
+                             MetaTensor* x_grad) {
+  auto x_dims = x.dims();
+
+  if (x_grad != nullptr) {
+    x_grad->set_dims(x_dims);
+    x_grad->set_dtype(x.dtype());
+  }
+}
+
 void CrossEntropyWithSoftmaxGradInferMeta(const MetaTensor& label,
                                           const MetaTensor& softmax,
                                           const MetaTensor& loss_grad,
diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h
index 5551b6bcbf183..16d9b82e06442 100644
--- a/paddle/phi/infermeta/backward.h
+++ b/paddle/phi/infermeta/backward.h
@@ -89,6 +89,11 @@ void Conv2dTransposeDoubleGradInferMeta(const MetaTensor& x,
                                         MetaTensor* dfilter,
                                         MetaTensor* ddout);
 
+void CropTensorGradInferMeta(const MetaTensor& out_grad,
+                             const MetaTensor& x,
+                             const IntArray& offsets,
+                             MetaTensor* x_grad);
+
 void CrossEntropyWithSoftmaxGradInferMeta(const MetaTensor& label,
                                           const MetaTensor& softmax,
                                           const MetaTensor& loss_grad,
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index c018e58a59a37..3b31b165b4259 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -300,6 +300,47 @@ void CumInferMeta(const MetaTensor& x,
   out->share_lod(x);
 }
 
+void CropTensorInferMeta(const MetaTensor& x,
+                         const IntArray& shape,
+                         const IntArray& offsets,
+                         MetaTensor* out,
+                         MetaConfig config) {
+  PADDLE_ENFORCE_NE(
+      out,
+      nullptr,
+      errors::InvalidArgument("CropTensor should have output tensor out."));
+
+  auto x_dim = x.dims();
+  auto shape_dims = shape.GetData();
+  auto offsets_vec = offsets.GetData();
+
+  PADDLE_ENFORCE_EQ(shape_dims.size(),
+                    x_dim.size(),
+                    errors::InvalidArgument(
+                        "The number of elements (%d) of attribute 'shape' for "
+                        "CropTensor must be equal to the number of "
+                        "dimensions (%d) of the input.",
+                        shape_dims.size(),
+                        x_dim.size()));
+
+  if (config.is_runtime) {
+    out->share_lod(x);
+  }
+
+  auto out_dims = std::vector<int64_t>(shape.size(), -1);
+  for (size_t i = 0; i < shape_dims.size(); ++i) {
+    if (shape_dims[i] > 0) {
+      out_dims[i] = static_cast<int64_t>(shape_dims[i]);
+    } else {
+      if (shape_dims[i] == -1 && offsets_vec[i] != -1 && x_dim[i] != -1) {
+        out_dims[i] = x_dim[i] - static_cast<int64_t>(offsets_vec[i]);
+      }
+    }
+  }
+  out->set_dims(phi::make_ddim(out_dims));
+  out->set_dtype(x.dtype());
+}
+
 void DiagEmbedInferMeta(
     const MetaTensor& x, int offset, int dim1, int dim2, MetaTensor* out) {
   auto x_dims = x.dims();
diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h
index 1449e8cfe197d..c1db2561f0bef 100644
--- a/paddle/phi/infermeta/unary.h
+++ b/paddle/phi/infermeta/unary.h
@@ -66,6 +66,12 @@ void ClipByNormInferMeta(const MetaTensor& x, float max_norm, MetaTensor* out);
 
 void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out);
 
+void CropTensorInferMeta(const MetaTensor& x,
+                         const IntArray& shape,
+                         const IntArray& offsets,
+                         MetaTensor* out,
+                         MetaConfig config = MetaConfig());
+
 void CumInferMeta(const MetaTensor& x,
                   int axis,
                   bool flatten,
diff --git a/paddle/phi/kernels/cpu/crop_tensor_grad_kernel.cc b/paddle/phi/kernels/cpu/crop_tensor_grad_kernel.cc
new file mode 100644
index 0000000000000..6ac553ec9786b
--- /dev/null
+++ b/paddle/phi/kernels/cpu/crop_tensor_grad_kernel.cc
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/crop_tensor_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/crop_tensor_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(crop_tensor_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::CropTensorGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/cpu/crop_tensor_kernel.cc b/paddle/phi/kernels/cpu/crop_tensor_kernel.cc
new file mode 100644
index 0000000000000..8cd42d5fa8239
--- /dev/null
+++ b/paddle/phi/kernels/cpu/crop_tensor_kernel.cc
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/crop_tensor_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/crop_tensor_kernel_impl.h"
+
+PD_REGISTER_KERNEL(crop_tensor,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::CropTensorKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/crop_tensor_grad_kernel.h b/paddle/phi/kernels/crop_tensor_grad_kernel.h
new file mode 100644
index 0000000000000..97f1fbf5b029a
--- /dev/null
+++ b/paddle/phi/kernels/crop_tensor_grad_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/int_array.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void CropTensorGradKernel(const Context& dev_ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& out_grad,
+                          const IntArray& offsets,
+                          DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/crop_tensor_kernel.h b/paddle/phi/kernels/crop_tensor_kernel.h
new file mode 100644
index 0000000000000..079959eb05c14
--- /dev/null
+++ b/paddle/phi/kernels/crop_tensor_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/int_array.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void CropTensorKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const IntArray& shape,
+                      const IntArray& offsets,
+                      DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpu/crop_tensor_grad_kernel.cu b/paddle/phi/kernels/gpu/crop_tensor_grad_kernel.cu
new file mode 100644
index 0000000000000..0af80233cb1ef
--- /dev/null
+++ b/paddle/phi/kernels/gpu/crop_tensor_grad_kernel.cu
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/crop_tensor_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/crop_tensor_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(crop_tensor_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::CropTensorGradKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/gpu/crop_tensor_kernel.cu b/paddle/phi/kernels/gpu/crop_tensor_kernel.cu
new file mode 100644
index 0000000000000..5aa4900c5097b
--- /dev/null
+++ b/paddle/phi/kernels/gpu/crop_tensor_kernel.cu
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/crop_tensor_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/crop_tensor_kernel_impl.h"
+
+PD_REGISTER_KERNEL(crop_tensor,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::CropTensorKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
diff --git a/paddle/phi/kernels/impl/crop_tensor_grad_kernel_impl.h b/paddle/phi/kernels/impl/crop_tensor_grad_kernel_impl.h
new file mode 100644
index 0000000000000..0d3e579fe8bc8
--- /dev/null
+++ b/paddle/phi/kernels/impl/crop_tensor_grad_kernel_impl.h
@@ -0,0 +1,105 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/crop_tensor_grad_kernel.h"
+
+#include <vector>
+
+#include "paddle/phi/core/tensor_utils.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+
+namespace phi {
+
+template <typename Context, typename T, size_t D>
+void CropTensorGradFunction(const Context& dev_ctx,
+                            const DenseTensor& x,
+                            const DenseTensor& out_grad,
+                            const IntArray& offsets,
+                            DenseTensor* x_grad) {
+  if (x_grad != nullptr) {
+    x_grad->Resize(x.dims());
+    dev_ctx.template Alloc<T>(x_grad);
+
+    auto offsets_vec = offsets.GetData();
+    std::array<std::pair<int64_t, int64_t>, D> paddings;
+    for (size_t i = 0; i < D; ++i) {
+      paddings[i].first = offsets_vec[i];
+      paddings[i].second =
+          x_grad->dims()[i] - out_grad.dims()[i] - offsets_vec[i];
+    }
+    auto x_grad_tensor = EigenTensor<T, D>::From(*x_grad);
+    auto out_grad_tensor = EigenTensor<T, D>::From(out_grad);
+    auto& place = *dev_ctx.eigen_device();
+
+    funcs::EigenPad<std::decay_t<decltype(place)>, T, D>::Eval(
+        place, x_grad_tensor, out_grad_tensor, paddings, static_cast<T>(0));
+  }
+}
+
+template <typename T, typename Context>
+void CropTensorGradKernel(const Context& dev_ctx,
+                          const DenseTensor& out_grad,
+                          const DenseTensor& x,
+                          const IntArray& offsets,
+                          DenseTensor* x_grad) {
+  size_t rank = out_grad.dims().size();
+  PADDLE_ENFORCE_GE(
+      rank,
+      1,
+      errors::InvalidArgument(
+          "The number of dimensions of the input 'Out@GRAD' for "
+          "Op(crop_tensor_grad) must be greater than or equal to 1, but the "
+          "value received is %d.",
+          rank));
+  PADDLE_ENFORCE_LE(
+      rank,
+      6,
+      errors::InvalidArgument(
+          "The number of dimensions of the input 'Out@GRAD' for "
+          "Op(crop_tensor_grad) must be less than or equal to 6, but the "
+          "value received is %d.",
+          rank));
+  switch (rank) {
+    case 1:
+      CropTensorGradFunction<Context, T, 1>(
+          dev_ctx, out_grad, x, offsets, x_grad);
+      break;
+    case 2:
+      CropTensorGradFunction<Context, T, 2>(
+          dev_ctx, out_grad, x, offsets, x_grad);
+      break;
+    case 3:
+      CropTensorGradFunction<Context, T, 3>(
+          dev_ctx, out_grad, x, offsets, x_grad);
+      break;
+    case 4:
+      CropTensorGradFunction<Context, T, 4>(
+          dev_ctx, out_grad, x, offsets, x_grad);
+      break;
+    case 5:
+      CropTensorGradFunction<Context, T, 5>(
+          dev_ctx, out_grad, x, offsets, x_grad);
+      break;
+    case 6:
+      CropTensorGradFunction<Context, T, 6>(
+          dev_ctx, out_grad, x, offsets, x_grad);
+      break;
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/crop_tensor_kernel_impl.h b/paddle/phi/kernels/impl/crop_tensor_kernel_impl.h
new file mode 100644
index 0000000000000..e6d7f8f672659
--- /dev/null
+++ b/paddle/phi/kernels/impl/crop_tensor_kernel_impl.h
@@ -0,0 +1,174 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/crop_tensor_kernel.h"
+
+#include <utility>
+#include <vector>
+
+#include "paddle/phi/common/int_array.h"
+#include "paddle/phi/core/tensor_utils.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
+
+namespace phi {
+
+static phi::DDim ValidateShape(const std::vector<int64_t>& shape,
+                               const std::vector<int64_t>& offsets,
+                               const phi::DDim& in_dims) {
+  auto in_dim_size = in_dims.size();
+  auto shape_size = shape.size();
+  PADDLE_ENFORCE_EQ(
+      in_dim_size,
+      shape_size,
+      errors::InvalidArgument(
+          "The number of elements (%d) for shape of Op(crop_tensor) should be "
+          "equal to the number of dimensions (%d) of the input tensor.",
+          shape_size,
+          in_dim_size));
+  std::vector<int64_t> output_shape(shape.size(), 0);
+  for (size_t i = 0; i < shape.size(); ++i) {
+    if (shape[i] <= 0 && in_dims[i] > 0) {
+      PADDLE_ENFORCE_NE(shape[i],
+                        0,
+                        errors::InvalidArgument(
+                            "The value (%d) of the %uth element for shape of "
+                            "Op(crop_tensor) should not be zero.",
+                            shape[i],
+                            i));
+      PADDLE_ENFORCE_EQ(
+          shape[i],
+          -1,
+          errors::InvalidArgument("When the value (%d) of the %uth "
+                                  "element for shape of Op(crop_tensor)"
+                                  " is negative, only -1 is supported.",
+                                  shape[i],
+                                  i));
+      output_shape[i] = in_dims[i] - offsets[i];
+    } else {
+      output_shape[i] = static_cast<int64_t>(shape[i]);
+    }
+  }
+
+  return phi::make_ddim(output_shape);
+}
+
+template <typename Context, typename T, size_t D>
+void CropTensorFunction(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const IntArray& shape,
+                        const IntArray& offsets,
+                        DenseTensor* out) {
+  auto x_dims = x.dims();
+  auto rank = x.dims().size();
+  auto out_dims = out->dims();
+
+  auto shape_vec = shape.GetData();
+
+  if (shape_vec.size() == 0) {
+    for (int i = 0; i < out_dims.size(); ++i) {
+      shape_vec.push_back(out_dims[i]);
+    }
+  }
+
+  auto offsets_vec = offsets.GetData();
+
+  PADDLE_ENFORCE_EQ(
+      rank,
+      static_cast<int>(offsets_vec.size()),
+      errors::InvalidArgument("The number of elements (%d) for "
+                              "input 'Offsets' must be equal to "
+                              "the number of dimensions (%d) "
+                              "of the input tensor.",
+                              static_cast<int>(offsets_vec.size()),
+                              rank));
+
+  out_dims = ValidateShape(shape_vec, offsets_vec, x.dims());
+  out->Resize(out_dims);
+  dev_ctx.template Alloc<T>(out);
+  for (size_t i = 0; i < offsets_vec.size(); ++i) {
+    PADDLE_ENFORCE_LE(offsets_vec[i] + shape_vec[i],
+                      x_dims[i],
+                      errors::InvalidArgument(
+                          "The sum of the %uth elements of "
+                          "offsets (%d) and shape (%d) of Op(crop_tensor) "
+                          "should be less than or "
+                          "equal to the size of %uth dimension of the input.",
+                          i,
+                          offsets_vec[i],
+                          shape_vec[i],
+                          i));
+  }
+
+  auto x_tensor = EigenTensor<T, D>::From(x);
+  auto out_tensor = EigenTensor<T, D>::From(*out);
+  Eigen::DSizes<Eigen::DenseIndex, D> e_offsets;
+  Eigen::DSizes<Eigen::DenseIndex, D> e_shape;
+  for (size_t i = 0; i < D; ++i) {
+    e_offsets[i] = offsets_vec[i];
+    e_shape[i] = out->dims()[i];
+  }
+  auto& place = *dev_ctx.eigen_device();
+  phi::funcs::EigenSlice<std::decay_t<decltype(place)>, T, D>::Eval(
+      place, out_tensor, x_tensor, e_offsets, e_shape);
+}
+
+template <typename T, typename Context>
+void CropTensorKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const IntArray& shape,
+                      const IntArray& offsets,
+                      DenseTensor* out) {
+  int rank = x.dims().size();
+  PADDLE_ENFORCE_GE(
+      rank,
+      1,
+      errors::InvalidArgument(
+          "The number of dimensions of the input 'x' for "
+          "Op(crop_tensor) must be greater than or equal to 1, but the "
+          "value received is %d.",
+          rank));
+  PADDLE_ENFORCE_LE(
+      rank,
+      6,
+      errors::InvalidArgument(
+          "The number of dimensions of the input 'x' for "
+          "Op(crop_tensor) must be less than or equal to 6, but the "
+          "value received is %d.",
+          rank));
+  switch (rank) {
+    case 1:
+      CropTensorFunction<Context, T, 1>(dev_ctx, x, shape, offsets, out);
+      break;
+    case 2:
+      CropTensorFunction<Context, T, 2>(dev_ctx, x, shape, offsets, out);
+      break;
+    case 3:
+      CropTensorFunction<Context, T, 3>(dev_ctx, x, shape, offsets, out);
+      break;
+    case 4:
+      CropTensorFunction<Context, T, 4>(dev_ctx, x, shape, offsets, out);
+      break;
+    case 5:
+      CropTensorFunction<Context, T, 5>(dev_ctx, x, shape, offsets, out);
+      break;
+    case 6:
+      CropTensorFunction<Context, T, 6>(dev_ctx, x, shape, offsets, out);
+      break;
+  }
+}
+
+}  // namespace phi
diff --git a/paddle/phi/ops/compat/crop_tensor_sig.cc b/paddle/phi/ops/compat/crop_tensor_sig.cc
new file mode 100644
index 0000000000000..994a7de8fb403
--- /dev/null
+++ b/paddle/phi/ops/compat/crop_tensor_sig.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature CropTensorOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.InputSize("ShapeTensor") > 0) {
+    if (ctx.InputSize("OffsetsTensor") > 0) {
+      return KernelSignature(
+          "crop_tensor", {"X"}, {"ShapeTensor", "OffsetsTensor"}, {"Out"});
+    } else if (ctx.HasInput("Offsets")) {
+      return KernelSignature(
+          "crop_tensor", {"X"}, {"ShapeTensor", "Offsets"}, {"Out"});
+    } else {
+      return KernelSignature(
+          "crop_tensor", {"X"}, {"ShapeTensor", "offsets"}, {"Out"});
+    }
+  } else if (ctx.HasInput("Shape")) {
+    if (ctx.InputSize("OffsetsTensor") > 0) {
+      return KernelSignature(
+          "crop_tensor", {"X"}, {"Shape", "OffsetsTensor"}, {"Out"});
+    } else if (ctx.HasInput("Offsets")) {
+      return KernelSignature(
+          "crop_tensor", {"X"}, {"Shape", "Offsets"}, {"Out"});
+    } else {
+      return KernelSignature(
+          "crop_tensor", {"X"}, {"Shape", "offsets"}, {"Out"});
+    }
+  } else {
+    if (ctx.InputSize("OffsetsTensor") > 0) {
+      return KernelSignature(
+          "crop_tensor", {"X"}, {"shape", "OffsetsTensor"}, {"Out"});
+    } else if (ctx.HasInput("Offsets")) {
+      return KernelSignature(
+          "crop_tensor", {"X"}, {"shape", "Offsets"}, {"Out"});
+    } else {
+      return KernelSignature(
+          "crop_tensor", {"X"}, {"shape", "offsets"}, {"Out"});
+    }
+  }
+}
+
+KernelSignature CropTensorGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  if (ctx.InputSize("OffsetsTensor") > 0) {
+    return KernelSignature(
+        "crop_tensor_grad", {"X", "Out@GRAD"}, {"OffsetsTensor"}, {"X@GRAD"});
+  } else if (ctx.HasInput("Offsets")) {
+    return KernelSignature(
+        "crop_tensor_grad", {"X", "Out@GRAD"}, {"Offsets"}, {"X@GRAD"});
+  } else {
+    return KernelSignature(
+        "crop_tensor_grad", {"X", "Out@GRAD"}, {"offsets"}, {"X@GRAD"});
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(crop_tensor, phi::CropTensorOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(crop_tensor_grad,
+                           phi::CropTensorGradOpArgumentMapping);
diff --git a/python/paddle/fluid/tests/unittests/test_crop_tensor_op.py b/python/paddle/fluid/tests/unittests/test_crop_tensor_op.py
index 49805c578bf47..aa70e32cdc5b3 100644
--- a/python/paddle/fluid/tests/unittests/test_crop_tensor_op.py
+++ b/python/paddle/fluid/tests/unittests/test_crop_tensor_op.py
@@ -51,6 +51,7 @@ def setUp(self):
         self.offset_by_input = False
         self.unk_dim_idx = -1
         self.attrs = {}
+        self.python_api = paddle.crop
         self.initTestCase()
 
         if self.shape_by_input:
@@ -146,6 +147,7 @@ def setUp(self):
         self.OffsetsTensor = False
         self.ShapeTensor = True
         self.attrs = {}
+        self.python_api = paddle.crop
         self.initTestCase()
 
         if self.ShapeTensor:
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index f3b67cf743deb..8d7d91e2f2ec9 100755
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -640,6 +640,7 @@ def crop(x, shape=None, offsets=None, name=None):
             # if offsets = [1, 1], out = [[5,6], [8,9]]
 
     """
+
     helper = LayerHelper('crop_tensor', **locals())
     check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'],
                              'crop_tensor')
@@ -650,6 +651,9 @@ def crop(x, shape=None, offsets=None, name=None):
     if offsets is None:
         offsets = [0] * len(x.shape)
 
+    if in_dygraph_mode():
+        return _C_ops.final_state_crop_tensor(x, shape, offsets)
+
     out = helper.create_variable_for_type_inference(x.dtype)
     ipts = {'X': x}
     attrs = {}

From be132719c5cc7a87df1923676be9cf205366f1cd Mon Sep 17 00:00:00 2001
From: pangyoki <pangyoki@126.com>
Date: Wed, 27 Jul 2022 16:36:51 +0800
Subject: [PATCH 19/28] fix RemoveIntermediateOut in fuse_elewise_add_act_pass
 while converting graph to program (#44593)

* fix RemoveNode in fuse_elewise_add_act_pass

* fix

* change pointer to share_ptr

* fix

* fix

* fix format

* fix

* fix graph_safe_remove_nodes
---
 .../framework/ir/fuse_elewise_add_act_pass.cc | 13 ++++++++++-
 paddle/fluid/framework/ir/graph.h             |  2 ++
 paddle/fluid/framework/ir/graph_helper.cc     | 22 ++++++++++++++-----
 .../framework/ir/graph_pattern_detector.cc    | 14 +++++++++---
 .../framework/ir/graph_pattern_detector.h     |  6 +++--
 paddle/fluid/pybind/ir.cc                     |  5 ++++-
 6 files changed, 50 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
index 5bd26e9eb9f2d..67aa5a822edae 100644
--- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
@@ -297,7 +297,18 @@ void FuseElewiseAddActPass::RemoveIntermediateOut(Graph *graph) const {
       }
     }
   }
-  GraphSafeRemoveNodes(graph, need_removed_nodes);
+  details::RemovedVars *saved_removed_nodes = new details::RemovedVars;
+  GraphSafeRemoveNodes(graph, need_removed_nodes, saved_removed_nodes);
+  if (!saved_removed_nodes->empty()) {
+    // TODO(pangyoki): If kRemovedVars exists, merge saved_removed_nodes into
+    // RemovedVars.
+    PADDLE_ENFORCE_EQ(graph->Has(details::kRemovedVars),
+                      false,
+                      platform::errors::PreconditionNotMet(
+                          "Removed nodes are only saved for "
+                          "fuse_elewise_add_act_pass in temporary."));
+    graph->Set(details::kRemovedVars, saved_removed_nodes);
+  }
 }
 
 void FuseElewiseAddActPass::ReLinkNodes(Graph *graph,
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index 5a954110775d6..3eb2df7011c7e 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -45,6 +45,8 @@ namespace details {
 // This attr is not recommended, because the graph should not dependence
 // the program once it is built.
 constexpr char kStaleProgramOpDescs[] = "stale_program_op_descs";
+constexpr char kRemovedVars[] = "removed_vars";
+typedef std::unordered_set<std::shared_ptr<ir::Node>> RemovedVars;
 }  //  namespace details
 
 namespace ir {
diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
index 80568b7766503..a7bf131805dc1 100644
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -549,6 +549,18 @@ static void GetGraphOpDesc(const std::vector<Node *> &nodes,
   }
 }
 
+template <class T = Node *>
+static void GetGraphVarDesc(const Graph &graph,
+                            const std::unordered_set<T> &nodes,
+                            std::vector<proto::VarDesc> *vars) {
+  for (T node : nodes) {
+    if (node->IsVar() && node->Var() &&
+        node->GetVarNodeBlockId() == graph.GetBlockId()) {
+      vars->emplace_back(*node->Var()->Proto());
+    }
+  }
+}
+
 static void GraphToBlock(const Graph &graph,
                          proto::BlockDesc *block,
                          const SortKind *sort_kind) {
@@ -562,11 +574,11 @@ static void GraphToBlock(const Graph &graph,
   }
 
   std::vector<proto::VarDesc> vars_in_graph;
-  for (Node *node : graph.Nodes()) {
-    if (node->IsVar() && node->Var() &&
-        node->GetVarNodeBlockId() == graph.GetBlockId()) {
-      vars_in_graph.emplace_back(*node->Var()->Proto());
-    }
+  GetGraphVarDesc<Node *>(graph, graph.Nodes(), &vars_in_graph);
+  if (graph.Has(details::kRemovedVars)) {
+    auto &removed_vars = graph.Get<details::RemovedVars>(details::kRemovedVars);
+    GetGraphVarDesc<std::shared_ptr<ir::Node>>(
+        graph, removed_vars, &vars_in_graph);
   }
 
   // add vars_in_graph to blcok
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 6191c2efe9087..cce1ec89a2e82 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -771,10 +771,18 @@ bool IsNthOutput(Node *var, Node *op, const std::string &argument, size_t nth) {
   return var->Name() == op->Op()->Output(argument)[nth];
 }
 
-void GraphSafeRemoveNodes(Graph *graph,
-                          const std::unordered_set<const Node *> &nodes) {
+void GraphSafeRemoveNodes(
+    Graph *graph,
+    const std::unordered_set<const Node *> &nodes,
+    std::unordered_set<std::shared_ptr<Node>> *saved_nodes) {
   for (auto *node : nodes) {
-    graph->RemoveNode(const_cast<Node *>(node));
+    if (saved_nodes != nullptr) {
+      // prevent unique_ptr node from being released
+      saved_nodes->insert(
+          std::move(graph->RemoveNode(const_cast<Node *>(node))));
+    } else {
+      graph->RemoveNode(const_cast<Node *>(node));
+    }
   }
 
   for (auto *node : graph->Nodes()) {
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 00e565b7161a2..794c25e85a555 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -392,8 +392,10 @@ bool HasOutput(Node* op, const std::string& argument);
 bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth);
 
 // Graph safely remove some nodes, will automatically clean up the edges.
-void GraphSafeRemoveNodes(Graph* graph,
-                          const std::unordered_set<const Node*>& nodes);
+void GraphSafeRemoveNodes(
+    Graph* graph,
+    const std::unordered_set<const Node*>& nodes,
+    std::unordered_set<std::shared_ptr<Node>>* saved_nodes = nullptr);
 
 // Some pre-defined patterns those can be reused in multiple passes.
 // The related Fluid Layer or Op should be one pattern here for better re-usage
diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc
index b8b127201cccd..73f7e9a098c14 100644
--- a/paddle/fluid/pybind/ir.cc
+++ b/paddle/fluid/pybind/ir.cc
@@ -50,7 +50,10 @@ using pybind11::return_value_policy;
 namespace paddle {
 namespace pybind {
 void BindGraph(py::module *m) {
-  m->def("graph_safe_remove_nodes", GraphSafeRemoveNodes);
+  m->def("graph_safe_remove_nodes",
+         [](Graph *graph, const std::unordered_set<const Node *> &nodes) {
+           return GraphSafeRemoveNodes(graph, nodes);
+         });
   m->def("has_circle", HasCircle);
   m->def("graph_num", GraphNum);
   m->def(

From 8a07d02c50d67791db9e9e50063b1ec25b55866d Mon Sep 17 00:00:00 2001
From: Allen Guo <alleng@graphcore.ai>
Date: Wed, 27 Jul 2022 16:44:34 +0800
Subject: [PATCH 20/28] fix UTs on physical ipu (#44647)

---
 .../fluid/tests/unittests/ipu/op_test_ipu.py  |   2 +-
 .../tests/unittests/ipu/test_cast_op_ipu.py   |  19 ++-
 .../unittests/ipu/test_groupnorm_op_ipu.py    |   4 +-
 .../unittests/ipu/test_interpolate_ops_ipu.py |   2 +-
 .../unittests/ipu/test_meshgrid_op_ipu.py     |   4 +-
 .../ipu/test_mixed_precision_training_ipu.py  |  15 +-
 .../tests/unittests/ipu/test_save_load_ipu.py | 129 +++++++++---------
 .../ipu/test_scaled_optimizer_state_ipu.py    |   4 +
 .../unittests/ipu/test_weight_decay_ipu.py    |   2 +-
 .../unittests/ipu/test_yolo_box_op_ipu.py     |   6 +
 10 files changed, 102 insertions(+), 85 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py
index 90850b56aa657..0226ca4ae7432 100644
--- a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py
@@ -179,7 +179,7 @@ def wrapper(self, *args, **kwargs):
     @classmethod
     def cast_model_to_fp16(cls, main_program):
         amp_list = paddle.static.amp.CustomOpLists()
-        amp_list.unsupported_list = {}
+        amp_list.unsupported_list = {'scale'}
         to_fp16_var_names = paddle.static.amp.cast_model_to_fp16(
             main_program, amp_list, use_fp16_guard=False)
         paddle.static.amp.cast_parameters_to_fp16(
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py
index 6799f4141a416..675489d1e4cd5 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py
@@ -35,7 +35,7 @@ def fp16_enabled(self):
 
     def set_data_feed(self):
         data = np.random.uniform(size=[1, 3, 3, 3])
-        self.feed_fp32 = {'x': data.astype(np.float32)}
+        self.feed_fp32 = {'x': data.astype(np.float16)}
 
     def set_feed_attr(self):
         self.feed_shape = [x.shape for x in self.feed_fp32.values()]
@@ -44,7 +44,7 @@ def set_feed_attr(self):
 
     def set_op_attrs(self):
         self.attrs = {}
-        self.attrs['dtype'] = 'float16'
+        self.attrs['dtype'] = 'float32'
 
     @IPUOpTest.static_graph
     def build_model(self):
@@ -86,14 +86,19 @@ def set_op_attrs(self):
 
 class TestCase2(TestBase):
 
+    def set_atol(self):
+        super().set_atol()
+        self.atol = 1e-3
+        self.rtol = 1e-3
+
     def set_data_feed(self):
         self.feed_fp32 = {
-            "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'),
+            "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float32'),
         }
 
     def set_op_attrs(self):
         self.attrs = {}
-        self.attrs['dtype'] = 'float32'
+        self.attrs['dtype'] = 'float16'
 
 
 class TestCase3(TestBase):
@@ -145,7 +150,7 @@ def set_op_attrs(self):
 
 
 @unittest.skip('float64 is not supported')
-class TestCase2(TestBase):
+class TestCase7(TestBase):
 
     def set_op_attrs(self):
         self.attrs = {}
@@ -153,7 +158,7 @@ def set_op_attrs(self):
 
 
 @unittest.skip('skip float16 to float32')
-class TestCase3(TestBase):
+class TestCase8(TestBase):
 
     def set_data_feed(self):
         self.feed_fp32 = {
@@ -166,7 +171,7 @@ def set_op_attrs(self):
 
 
 @unittest.skip('int32 to int8 is not supported')
-class TestCase4(TestBase):
+class TestCase9(TestBase):
 
     def set_atol(self):
         super().set_atol()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py
index dec4c6e1306a4..70f1a6a28e97b 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py
@@ -113,8 +113,8 @@ class TestTrainCase2(TestBase):
     def set_atol(self):
         self.atol = 7e-4
         self.rtol = 1e-6
-        self.atol_fp16 = 4e-3
-        self.rtol_fp16 = 1e-3
+        self.atol_fp16 = 1e-2
+        self.rtol_fp16 = 1e-2
 
     def set_op_attrs(self):
         self.attrs = {
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_interpolate_ops_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_interpolate_ops_ipu.py
index 70d01e120efc2..108e953659dc5 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_interpolate_ops_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_interpolate_ops_ipu.py
@@ -30,7 +30,7 @@ def setUp(self):
         self.set_op_attrs()
 
     def set_data_feed(self):
-        x = np.random.uniform(size=[2, 3, 6, 10])
+        x = np.random.uniform(size=[1, 2, 6, 10])
         self.feed_fp32 = {"x": x.astype(np.float32)}
         self.feed_fp16 = {"x": x.astype(np.float16)}
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_meshgrid_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_meshgrid_op_ipu.py
index 8c3306aed1318..56242fea3672e 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_meshgrid_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_meshgrid_op_ipu.py
@@ -36,8 +36,8 @@ def set_atol(self):
         self.rtol_fp16 = 1e-3
 
     def set_feed(self):
-        data1 = np.random.uniform(size=[100])
-        data2 = np.random.uniform(size=[200])
+        data1 = np.random.uniform(size=[10])
+        data2 = np.random.uniform(size=[20])
         self.feed_fp32 = {
             'x': data1.astype(np.float32),
             'y': data2.astype(np.float32)
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py
index 4524c1103052d..a733a26d60616 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py
@@ -31,17 +31,18 @@ def setUp(self):
         self.set_attrs()
 
     def set_atol(self):
-        self.atol = 2e-6
-        self.rtol = 1e-5
+        super().set_atol()
+        self.atol = 1e-6
+        self.rtol = 1e-3
         self.atol_fp16 = 1e-2
-        self.rtol_fp16 = 1e-3
+        self.rtol_fp16 = 1e-1
 
     def set_training(self):
         self.is_training = True
         self.epoch = 20
 
     def set_data_feed(self):
-        data = np.random.uniform(size=[1, 3, 28, 28])
+        data = np.random.uniform(size=[1, 3, 10, 10])
         self.feed_fp32 = {"in_0": data.astype(np.float32)}
 
     def set_feed_attr(self):
@@ -73,7 +74,7 @@ def build_model(self):
 
         # using fp16
         with paddle.static.amp.fp16_guard():
-            x = paddle.static.nn.conv2d(input=x, num_filters=6, filter_size=3)
+            x = paddle.static.nn.conv2d(input=x, num_filters=3, filter_size=3)
             x = paddle.static.nn.batch_norm(x, act='relu')
             x = F.max_pool2d(x, kernel_size=2, stride=2)
 
@@ -82,9 +83,9 @@ def build_model(self):
         loss = paddle.mean(x)
 
         # optimizer
-        optimizer = paddle.optimizer.Adam(learning_rate=1e-2)
+        optimizer = paddle.optimizer.Adam(learning_rate=1e-3)
         optimizer.minimize(loss, self.startup_prog)
-        self.fetch_list = [loss.name]
+        self.fetch_list = [x.name]
 
     def run_model(self, exec_mode):
         # cast model to fp16
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py
index 7c6470af3d10b..ea4190e251f44 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py
@@ -45,78 +45,73 @@ def set_attrs(self):
         self.attrs = {}
         self.attrs['steps'] = 100
         self.attrs['save_at_step'] = 20
-        self.attrs['enable_fp16'] = False
         self.attrs['model_path'] = tempfile.TemporaryDirectory()
 
     def set_optimizer(self):
         self.optimizer = partial(paddle.optimizer.SGD, learning_rate=1e-1)
 
-    def _test_base(self, save_otherwise_load):
-        scope = paddle.static.Scope()
-        main_prog = paddle.static.Program()
-        startup_prog = paddle.static.Program()
-        main_prog.random_seed = self.SEED
-        startup_prog.random_seed = self.SEED
+    @IPUOpTest.static_graph
+    def build_model(self):
         generator = paddle.fluid.unique_name.UniqueNameGenerator()
-
         with paddle.fluid.unique_name.guard(generator):
-            with paddle.static.scope_guard(scope):
-                with paddle.static.program_guard(main_prog, startup_prog):
-                    x = paddle.static.data(name=self.feed_list[0],
-                                           shape=self.feed_shape[0],
-                                           dtype='float32')
-                    conv1 = paddle.static.nn.conv2d(x,
-                                                    num_filters=3,
-                                                    filter_size=3,
-                                                    bias_attr=False,
-                                                    name='conv2d')
-                    loss = paddle.mean(conv1)
-
-                    # apply optimizer
-                    self.optimizer().minimize(loss)
-                    fetch_list = [loss.name]
-
-                place = paddle.IPUPlace()
-                exe = paddle.static.Executor(place)
-                exe.run(startup_prog)
-
-                if not save_otherwise_load:
-                    paddle.static.load(main_prog, self.attrs['model_path'].name)
-
-                ipu_strategy = paddle.static.IpuStrategy()
-                ipu_strategy.set_graph_config(is_training=True)
-                ipu_strategy.set_precision_config(
-                    enable_fp16=self.attrs['enable_fp16'])
-                ipu_program = paddle.static.IpuCompiledProgram(
-                    main_prog, ipu_strategy=ipu_strategy)
-                program = ipu_program.compile(self.feed_list, fetch_list)
-
-                result = []
-                run_steps = self.attrs['steps'] if save_otherwise_load \
-                    else self.attrs['steps'] - self.attrs['save_at_step']
-
-                feed = self.feed_fp16 if self.attrs[
-                    'enable_fp16'] else self.feed_fp32
-                for i in range(run_steps):
-                    tmp = exe.run(program, feed=feed, fetch_list=fetch_list)
-
-                    if save_otherwise_load and \
-                        i == self.attrs['save_at_step'] - 1:
-                        ipu_program._backend.weights_to_host()
-                        paddle.static.save(main_prog,
-                                           self.attrs['model_path'].name)
-
-                    if save_otherwise_load and i >= self.attrs['save_at_step']:
-                        result.append(tmp)
-                    elif not save_otherwise_load:
-                        result.append(tmp)
-
-                return np.asarray(result).flatten()
+            x = paddle.static.data(name=self.feed_list[0],
+                                   shape=self.feed_shape[0],
+                                   dtype='float32')
+            conv1 = paddle.static.nn.conv2d(x,
+                                            num_filters=3,
+                                            filter_size=3,
+                                            bias_attr=False,
+                                            name='conv2d')
+            loss = paddle.mean(conv1)
+            # apply optimizer
+            self.optimizer().minimize(loss)
+            self.fetch_list = [loss.name]
+
+    def run_model(self, exec_mode, save_otherwise_load):
+        self.build_model()
+
+        place = paddle.IPUPlace()
+        exe = paddle.static.Executor(place)
+        exe.run(self.startup_prog)
+
+        if not save_otherwise_load:
+            paddle.static.load(self.main_prog, self.attrs['model_path'].name)
+
+        ipu_strategy = paddle.static.IpuStrategy()
+        ipu_strategy.set_graph_config(is_training=True)
+        if self.is_fp16_mode(exec_mode):
+            ipu_strategy.set_precision_config(enable_fp16=True)
+            IPUOpTest.cast_model_to_fp16(self.main_prog)
+        ipu_compiler = paddle.static.IpuCompiledProgram(
+            self.main_prog, ipu_strategy=ipu_strategy)
+        program = ipu_compiler.compile(self.feed_list, self.fetch_list)
+
+        feed = self.feed_fp32
+        if self.is_fp16_mode(exec_mode):
+            feed = self.feed_fp16
+
+        result = []
+        run_steps = self.attrs['steps'] if save_otherwise_load \
+            else self.attrs['steps'] - self.attrs['save_at_step']
+        for i in range(run_steps):
+            tmp = exe.run(program, feed=feed, fetch_list=self.fetch_list)
+
+            if save_otherwise_load and \
+                i == self.attrs['save_at_step'] - 1:
+                ipu_compiler._backend.weights_to_host()
+                paddle.static.save(self.main_prog,
+                                   self.attrs['model_path'].name)
+
+            if save_otherwise_load and i >= self.attrs['save_at_step']:
+                result.append(tmp)
+            elif not save_otherwise_load:
+                result.append(tmp)
+
+        return np.asarray(result)
 
     def test_base(self):
-        res0 = self._test_base(True)
-        res1 = self._test_base(False)
-
+        res0 = self.run_model(IPUOpTest.ExecutionMode.IPU_FP32, True)
+        res1 = self.run_model(IPUOpTest.ExecutionMode.IPU_FP32, False)
         self.assertTrue(
             np.allclose(res0.flatten(), res1.flatten(), atol=self.atol))
         self.attrs['model_path'].cleanup()
@@ -185,12 +180,18 @@ def set_attrs(self):
         self.attrs = {}
         self.attrs['steps'] = 100
         self.attrs['save_at_step'] = 20
-        self.attrs['enable_fp16'] = True
         self.attrs['model_path'] = tempfile.TemporaryDirectory()
 
     def set_optimizer(self):
         self.optimizer = partial(paddle.optimizer.SGD, learning_rate=1e-1)
 
+    def test_base(self):
+        res0 = self.run_model(IPUOpTest.ExecutionMode.IPU_FP16, True)
+        res1 = self.run_model(IPUOpTest.ExecutionMode.IPU_FP16, False)
+        self.assertTrue(
+            np.allclose(res0.flatten(), res1.flatten(), atol=self.atol))
+        self.attrs['model_path'].cleanup()
+
 
 class TestMomentumFp16(TestSGDFP16):
 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py
index e1f6f7a23f294..3960be248eca9 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py
@@ -28,6 +28,10 @@ def setUp(self):
         self.set_feed_attr()
         self.set_attrs()
 
+    @property
+    def fp16_enabled(self):
+        return False
+
     def set_training(self):
         self.is_training = True
         self.epoch = 100
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py
index 7fb467fced752..c121e6358d3d5 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py
@@ -83,7 +83,7 @@ def exclude_fn(param):
 
                 loss = paddle.mean(conv1)
                 opt = paddle.optimizer.Lamb(
-                    learning_rate=1e-1,
+                    learning_rate=1e-3,
                     lamb_weight_decay=self.attrs['weight_decay'],
                     exclude_from_weight_decay_fn=exclude_fn)
                 opt.minimize(loss)
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_yolo_box_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_yolo_box_op_ipu.py
index b308e8f4f017e..3e19ca2b0c9d2 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_yolo_box_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_yolo_box_op_ipu.py
@@ -29,6 +29,12 @@ def setUp(self):
         self.set_feed_attr()
         self.set_op_attrs()
 
+    def set_atol(self):
+        self.atol = 1e-6
+        self.rtol = 1e-6
+        self.atol_fp16 = 1e-2
+        self.rtol_fp16 = 1e-2
+
     def set_data_feed(self):
         data = np.random.uniform(size=[1, 255, 13, 13])
         self.feed_fp32 = {"in_0": data.astype(np.float32)}

From 8bf7cd85e9d77dfcad5c3d4b8618b2c575d022b5 Mon Sep 17 00:00:00 2001
From: Allen Guo <alleng@graphcore.ai>
Date: Wed, 27 Jul 2022 16:45:50 +0800
Subject: [PATCH 21/28] [IPU] add more loss ops  (#44646)

* add more loss ops

* add authors

Co-authored-by: Zhaorui Chen <zhaoruic@graphcore.ai>

Co-authored-by: Zhaorui Chen <zhaoruic@graphcore.ai>
---
 .../ipu/popart_canonicalization/loss_ops.cc   | 212 ++++++++++++++++++
 .../tests/unittests/ipu/test_dy2static_ipu.py |  45 +++-
 .../ipu/test_margin_rank_loss_op_ipu.py       |  90 ++++++++
 .../unittests/ipu/test_rank_loss_op_ipu.py    |  76 +++++++
 ...igmoid_cross_entropy_with_logits_op_ipu.py | 102 +++++++++
 5 files changed, 519 insertions(+), 6 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_margin_rank_loss_op_ipu.py
 create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_rank_loss_op_ipu.py
 create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_sigmoid_cross_entropy_with_logits_op_ipu.py

diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc
index 035b15b2770a7..48962456d4ca7 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc
@@ -278,6 +278,123 @@ Node *kldiv_loss_handler(Graph *graph, Node *node) {
   return loss;
 }
 
+Node *sigmoid_cross_entropy_with_logits_handler(Graph *graph, Node *node) {
+  // Out = max(logits, 0) - logits * label + log(1 + exp(-abs(logits)))
+  auto *op = node->Op();
+  int reduction = 2;
+  if (is_dynamic_graph()) {
+    reduction = RemoveTailReduction(graph, node, "Out");
+  }
+  bool append_identity_loss =
+      is_dynamic_graph() && IsLastVarNode(GetOutputVarNode("Out", node));
+
+  auto logits = GetInputVarNode("X", node);
+  auto label = GetInputVarNode("Label", node);
+  // sigmoid_cross_entropy_with_logits uses float label as input.
+  auto ignore_index_value =
+      static_cast<float>(PADDLE_GET_CONST(int, op->GetAttr("ignore_index")));
+  auto normalize = PADDLE_GET_CONST(bool, op->GetAttr("normalize"));
+
+  // const
+  auto one = CreateConst(
+                 graph, node, std::vector<float>{1.0}, {1}, GetVarDType(logits))
+                 ->outputs.front();
+  auto zero =
+      CreateConst(
+          graph, node, std::vector<float>{0.0}, {1}, GetVarDType(logits))
+          ->outputs.front();
+  auto ignore_index = CreateConst(graph,
+                                  node,
+                                  std::vector<float>{ignore_index_value},
+                                  {1},
+                                  GetVarDType(label))
+                          ->outputs.front();
+  // max(logits, 0)
+  auto max_zero =
+      CreateBaseOp(graph, node, "popart_max", {logits, zero}, {}, {})
+          ->outputs.front();
+
+  // logits * label
+  auto mul = CreateBaseOp(graph, node, "popart_mul", {logits, label}, {}, {})
+                 ->outputs.front();
+
+  // abs(logits)
+  auto abs = CreateBaseOp(graph, node, "popart_abs", {logits}, {}, {})
+                 ->outputs.front();
+  // -abs(logits)
+  auto neg_abs =
+      CreateBaseOp(graph, node, "popart_neg", {abs}, {}, {})->outputs.front();
+  // exp(-abs(logits))
+  auto exp_neg_abs = CreateBaseOp(graph, node, "popart_exp", {neg_abs}, {}, {})
+                         ->outputs.front();
+  // 1+exp(-abs(logits))
+  auto log_term =
+      CreateBaseOp(graph, node, "popart_add", {exp_neg_abs, one}, {}, {})
+          ->outputs.front();
+  // log(1+exp(-abs(logits)))
+  auto log = CreateBaseOp(graph, node, "popart_log", {log_term}, {}, {})
+                 ->outputs.front();
+
+  // max(logits, 0) - logits * label
+  auto sub = CreateBaseOp(graph, node, "popart_sub", {max_zero, mul}, {}, {})
+                 ->outputs.front();
+  // max(logits, 0) - logits * label + log(1 + exp(-abs(logits)))
+  auto loss = CreateBaseOp(graph, node, "popart_add", {sub, log}, {}, {})
+                  ->outputs.front();
+
+  // label == ignore_index ? 0 : loss
+  auto equal_cond =
+      CreateBaseOp(graph, node, "popart_equal", {label, ignore_index}, {}, {})
+          ->outputs.front();
+  loss = CreateBaseOp(graph,
+                      node,
+                      "popart_where",
+                      {equal_cond, zero, loss},
+                      append_identity_loss || normalize
+                          ? std::vector<Node *>{}
+                          : std::vector<Node *>{GetOutputVarNode("Out", node)},
+                      {});
+
+  if (normalize) {
+    // normalize the output as: loss = loss / sum(label != ignore_index)
+    auto not_equal =
+        CreateBaseOp(graph, node, "popart_logical_not", {equal_cond}, {}, {})
+            ->outputs.front();
+    auto mask =
+        CreateCast(graph, node, {not_equal}, {}, logits->Var()->GetDataType())
+            ->outputs.front();
+    auto sum = CreateBaseOp(graph,
+                            node,
+                            "popart_reducesum",
+                            {mask},
+                            {},
+                            {{"keepdims", int64_t{0}}})
+                   ->outputs.front();
+    auto eps =
+        CreateConst(
+            graph, node, std::vector<float>{1e-5}, {1}, GetVarDType(logits))
+            ->outputs.front();
+    // avoid division by zero
+    auto add_eps = CreateBaseOp(graph, node, "popart_add", {sum, eps}, {}, {})
+                       ->outputs.front();
+    loss =
+        CreateBaseOp(graph,
+                     node,
+                     "popart_div",
+                     {loss->outputs[0], add_eps},
+                     append_identity_loss
+                         ? std::vector<Node *>{}
+                         : std::vector<Node *>{GetOutputVarNode("Out", node)},
+                     {});
+  }
+
+  if (append_identity_loss) {
+    loss = CreateIdentityLossOp(
+        graph, node, loss->outputs, {GetOutputVarNode("Out", node)}, reduction);
+  }
+  return loss;
+}
+
 Node *binary_cross_entropy_handler(Graph *graph, Node *node) {
   // Out = -1 * weight * (label * log(x) + (1 - label) * log(1 - x))
   int reduction = 2;
@@ -493,6 +610,97 @@ Node *warpctc_handler(Graph *graph, Node *node) {
   return loss;
 }
 
+Node *rank_loss_handler(Graph *graph, Node *node) {
+  // (1.0f + (left - right).exp()).log() - label * (left - right)
+  auto label = GetInputVarNode("Label", node);
+  auto left = GetInputVarNode("Left", node);
+  auto right = GetInputVarNode("Right", node);
+  auto output = GetOutputVarNode("Out", node);
+  int reduction = 2;
+  if (is_dynamic_graph()) {
+    reduction = RemoveTailReduction(graph, node, "Out");
+  }
+  bool append_identity_loss = is_dynamic_graph() && IsLastVarNode(output);
+
+  auto sub = CreateBaseOp(graph, node, "popart_sub", {left, right}, {}, {})
+                 ->outputs.front();
+  auto mul = CreateBaseOp(graph, node, "popart_mul", {label, sub}, {}, {})
+                 ->outputs.front();
+  // const
+  auto one =
+      CreateConst(graph, node, std::vector<float>{1.0}, {1}, GetVarDType(label))
+          ->outputs.front();
+  auto exp =
+      CreateBaseOp(graph, node, "popart_exp", {sub}, {}, {})->outputs.front();
+  auto add = CreateBaseOp(graph, node, "popart_add", {one, exp}, {}, {})
+                 ->outputs.front();
+  auto log =
+      CreateBaseOp(graph, node, "popart_log", {add}, {}, {})->outputs.front();
+  auto loss = CreateBaseOp(graph,
+                           node,
+                           "popart_sub",
+                           {log, mul},
+                           append_identity_loss ? std::vector<Node *>{}
+                                                : std::vector<Node *>{output},
+                           {})
+                  ->outputs.front();
+  if (append_identity_loss) {
+    loss =
+        CreateIdentityLossOp(graph, node, loss->outputs, {output}, reduction);
+  }
+  return loss;
+}
+
+Node *margin_rank_loss_handler(Graph *graph, Node *node) {
+  // rank_loss = max(0, -label * (left - right) + margin)
+  auto *op = node->Op();
+  auto label = GetInputVarNode("Label", node);
+  auto left = GetInputVarNode("X1", node);
+  auto right = GetInputVarNode("X2", node);
+  auto output = GetOutputVarNode("Out", node);
+  auto margin_value = PADDLE_GET_CONST(float, op->GetAttr("margin"));
+  int reduction = 2;
+  if (is_dynamic_graph()) {
+    reduction = RemoveTailReduction(graph, node, "Out");
+  }
+  bool append_identity_loss = is_dynamic_graph() && IsLastVarNode(output);
+
+  // -(left - right)
+  auto sub = CreateBaseOp(graph, node, "popart_sub", {right, left}, {}, {})
+                 ->outputs.front();
+  // -label * (left - right)
+  auto mul = CreateBaseOp(graph, node, "popart_mul", {label, sub}, {}, {})
+                 ->outputs.front();
+  // const
+  auto zero =
+      CreateConst(graph, node, std::vector<float>{0.0}, {1}, GetVarDType(label))
+          ->outputs.front();
+  auto margin = CreateConst(graph,
+                            node,
+                            std::vector<float>{margin_value},
+                            {1},
+                            GetVarDType(label))
+                    ->outputs.front();
+  auto margin_add =
+      CreateBaseOp(graph, node, "popart_add", {mul, margin}, {}, {})
+          ->outputs.front();
+
+  // max(0, term)
+  auto loss = CreateBaseOp(graph,
+                           node,
+                           "popart_max",
+                           {zero, margin_add},
+                           append_identity_loss ? std::vector<Node *>{}
+                                                : std::vector<Node *>{output},
+                           {})
+                  ->outputs.front();
+  if (append_identity_loss) {
+    loss =
+        CreateIdentityLossOp(graph, node, loss->outputs, {output}, reduction);
+  }
+  return loss;
+}
+
 }  // namespace
 }  // namespace ipu
 }  // namespace platform
@@ -502,7 +710,11 @@ REGISTER_HANDLER(identity_loss, identity_loss_handler);
 REGISTER_HANDLER(softmax_with_cross_entropy,
                  softmax_with_cross_entropy_handler);
 REGISTER_HANDLER(cross_entropy2, cross_entropy2_handler);
+REGISTER_HANDLER(sigmoid_cross_entropy_with_logits,
+                 sigmoid_cross_entropy_with_logits_handler);
 REGISTER_HANDLER(kldiv_loss, kldiv_loss_handler);
 REGISTER_HANDLER(bce_loss, binary_cross_entropy_handler);
 REGISTER_HANDLER(huber_loss, huber_loss_handler);
 REGISTER_HANDLER(warpctc, warpctc_handler);
+REGISTER_HANDLER(rank_loss, rank_loss_handler);
+REGISTER_HANDLER(margin_rank_loss, margin_rank_loss_handler);
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py
index 7b581de222819..8b3e0104c29cc 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py
@@ -70,8 +70,8 @@ def set_op_attrs(self):
         self.loss_op = paddle.fluid.layers.cross_entropy
 
     def set_data_feed(self):
-        self.data = paddle.uniform((32, 3, 10, 10), dtype='float32')
-        self.label = paddle.randint(0, 10, shape=[32], dtype='int64')
+        self.data = paddle.uniform((8, 3, 10, 10), dtype='float32')
+        self.label = paddle.randint(0, 10, shape=[8], dtype='int64')
 
     def create_model(self, use_ipu=False):
         return SimpleLayer(loss_op=self.loss_op,
@@ -215,8 +215,8 @@ def set_op_attrs(self):
         self.loss_op = paddle.fluid.layers.softmax_with_cross_entropy
 
     def set_data_feed(self):
-        self.data = paddle.uniform((32, 3, 10, 10), dtype='float32')
-        self.label = paddle.randint(0, 10, shape=[32, 1], dtype='int64')
+        self.data = paddle.uniform((8, 3, 10, 10), dtype='float32')
+        self.label = paddle.randint(0, 10, shape=[8, 1], dtype='int64')
 
     def create_model(self, use_ipu=False):
         return SimpleLayer(loss_op=self.loss_op,
@@ -231,8 +231,41 @@ def set_op_attrs(self):
         self.loss_op = partial(paddle.fluid.layers.kldiv_loss, reduction="none")
 
     def set_data_feed(self):
-        self.data = paddle.uniform((32, 3, 10, 10), dtype='float32')
-        self.label = paddle.rand(shape=[32, 81], dtype='float32')
+        self.data = paddle.uniform((8, 3, 10, 10), dtype='float32')
+        self.label = paddle.rand(shape=[8, 81], dtype='float32')
+
+    def create_model(self, use_ipu=False):
+        return SimpleLayer(loss_op=self.loss_op,
+                           use_softmax=True,
+                           use_reduction=True,
+                           use_identity_loss=False)
+
+
+class TestWithoutIdentityLoss4(TestBase):
+
+    def set_op_attrs(self):
+        self.loss_op = paddle.nn.functional.binary_cross_entropy
+
+    def set_data_feed(self):
+        self.data = paddle.uniform((8, 3, 10, 10), dtype='float32')
+        self.label = paddle.rand(shape=[8, 81], dtype='float32')
+
+    def create_model(self, use_ipu=False):
+        return SimpleLayer(loss_op=self.loss_op,
+                           use_softmax=True,
+                           use_reduction=False,
+                           use_identity_loss=False)
+
+
+class TestWithoutIdentityLoss5(TestBase):
+
+    def set_op_attrs(self):
+        self.loss_op = paddle.fluid.layers.sigmoid_cross_entropy_with_logits
+
+    def set_data_feed(self):
+        self.data = paddle.uniform((8, 3, 10, 10), dtype='float32')
+        self.label = paddle.randint(0, 10, shape=[8, 81],
+                                    dtype='int64').astype('float32')
 
     def create_model(self, use_ipu=False):
         return SimpleLayer(loss_op=self.loss_op,
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_margin_rank_loss_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_margin_rank_loss_op_ipu.py
new file mode 100644
index 0000000000000..e9964156a128b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/test_margin_rank_loss_op_ipu.py
@@ -0,0 +1,90 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import paddle
+import paddle.static
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest
+
+
+class TestBase(IPUOpTest):
+
+    def setUp(self):
+        self.set_atol()
+        self.set_training()
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_op_attrs()
+
+    def set_data_feed(self):
+        label = np.random.uniform(size=[3, 1])
+        left = np.random.uniform(size=[3, 1])
+        right = np.random.uniform(size=[3, 1])
+        self.feed_fp32 = {
+            "label": label.astype(np.float32),
+            "left": left.astype(np.float32),
+            "right": right.astype(np.float32),
+        }
+        self.feed_fp16 = {
+            "label": label.astype(np.float16),
+            "left": left.astype(np.float16),
+            "right": right.astype(np.float16),
+        }
+
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+
+    def set_op_attrs(self):
+        self.attrs = {
+            'margin': 0.1,
+        }
+
+    @IPUOpTest.static_graph
+    def build_model(self, on_ipu):
+        label = paddle.static.data(name=self.feed_list[0],
+                                   shape=self.feed_shape[0],
+                                   dtype="float32")
+        left = paddle.static.data(name=self.feed_list[1],
+                                  shape=self.feed_shape[1],
+                                  dtype='float32')
+        right = paddle.static.data(name=self.feed_list[2],
+                                   shape=self.feed_shape[2],
+                                   dtype='float32')
+        out = paddle.fluid.layers.margin_rank_loss(label, left, right)
+        self.fetch_list = [out.name]
+
+    def run_model(self, exec_mode):
+        self.run_op_test(exec_mode)
+
+    def test(self):
+        for m in IPUOpTest.ExecutionMode:
+            if not self.skip_mode(m):
+                self.build_model(self.is_ipu_mode(m))
+                self.run_model(m)
+        self.check()
+
+
+class TestCase1(TestBase):
+
+    def set_op_attrs(self):
+        self.attrs = {
+            'margin': 0.5,
+        }
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_rank_loss_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_rank_loss_op_ipu.py
new file mode 100644
index 0000000000000..ad3bbde11923a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/test_rank_loss_op_ipu.py
@@ -0,0 +1,76 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import paddle
+import paddle.static
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest
+
+
+class TestBase(IPUOpTest):
+
+    def setUp(self):
+        self.set_atol()
+        self.set_training()
+        self.set_data_feed()
+        self.set_feed_attr()
+
+    def set_data_feed(self):
+        label = np.random.uniform(size=[3, 1])
+        left = np.random.uniform(size=[3, 1])
+        right = np.random.uniform(size=[3, 1])
+        self.feed_fp32 = {
+            "label": label.astype(np.float32),
+            "left": left.astype(np.float32),
+            "right": right.astype(np.float32),
+        }
+        self.feed_fp16 = {
+            "label": label.astype(np.float16),
+            "left": left.astype(np.float16),
+            "right": right.astype(np.float16),
+        }
+
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+
+    @IPUOpTest.static_graph
+    def build_model(self, on_ipu):
+        label = paddle.static.data(name=self.feed_list[0],
+                                   shape=self.feed_shape[0],
+                                   dtype="float32")
+        left = paddle.static.data(name=self.feed_list[1],
+                                  shape=self.feed_shape[1],
+                                  dtype='float32')
+        right = paddle.static.data(name=self.feed_list[2],
+                                   shape=self.feed_shape[2],
+                                   dtype='float32')
+        out = paddle.fluid.layers.rank_loss(label, left, right)
+        self.fetch_list = [out.name]
+
+    def run_model(self, exec_mode):
+        self.run_op_test(exec_mode)
+
+    def test(self):
+        for m in IPUOpTest.ExecutionMode:
+            if not self.skip_mode(m):
+                self.build_model(self.is_ipu_mode(m))
+                self.run_model(m)
+        self.check()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_sigmoid_cross_entropy_with_logits_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_sigmoid_cross_entropy_with_logits_op_ipu.py
new file mode 100644
index 0000000000000..997ae46ec82cb
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ipu/test_sigmoid_cross_entropy_with_logits_op_ipu.py
@@ -0,0 +1,102 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import paddle
+import paddle.static
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest
+import paddle.nn.functional as F
+
+
+class TestBase(IPUOpTest):
+
+    def setUp(self):
+        self.set_atol()
+        self.set_training()
+        self.set_data_feed()
+        self.set_feed_attr()
+        self.set_op_attrs()
+
+    def set_data_feed(self):
+        x = np.random.uniform(size=[10])
+        label = np.arange(10).reshape([10])
+        self.feed_fp32 = {
+            "x": x.astype(np.float32),
+            "label": label.astype(np.float32)
+        }
+        self.feed_fp16 = {
+            "x": x.astype(np.float16),
+            "label": label.astype(np.float16)
+        }
+
+    def set_feed_attr(self):
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
+
+    def set_op_attrs(self):
+        self.attrs = {
+            'ignore_index': -100,
+        }
+
+    @IPUOpTest.static_graph
+    def build_model(self, on_ipu):
+        x = paddle.static.data(name=self.feed_list[0],
+                               shape=self.feed_shape[0],
+                               dtype="float32")
+        label = paddle.static.data(name=self.feed_list[1],
+                                   shape=self.feed_shape[1],
+                                   dtype='float32')
+        out = paddle.fluid.layers.sigmoid_cross_entropy_with_logits(
+            x, label, **self.attrs)
+        self.fetch_list = [out.name]
+
+    def run_model(self, exec_mode):
+        self.run_op_test(exec_mode)
+
+    def test(self):
+        for m in IPUOpTest.ExecutionMode:
+            if not self.skip_mode(m):
+                self.build_model(self.is_ipu_mode(m))
+                self.run_model(m)
+        self.check()
+
+
+class TestCase1(TestBase):
+
+    def set_op_attrs(self):
+        self.attrs = {
+            'ignore_index': 1,
+        }
+
+
+class TestCase2(TestBase):
+
+    def set_atol(self):
+        # epsilon is added when normalize is True, use larger atol.
+        self.atol = 1e-6
+        self.rtol = 1e-5
+        self.atol_fp16 = 1e-3
+        self.rtol_fp16 = 1e-3
+
+    def set_op_attrs(self):
+        self.attrs = {
+            'ignore_index': 1,
+            'normalize': True,
+        }
+
+
+if __name__ == "__main__":
+    unittest.main()

From a6d05771c29de11e4e3165c57d1cad4738cb494a Mon Sep 17 00:00:00 2001
From: Allen Guo <alleng@graphcore.ai>
Date: Wed, 27 Jul 2022 16:46:08 +0800
Subject: [PATCH 22/28] add g_ipuplace_pytype (#44648)

---
 paddle/fluid/pybind/place.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc
index 217edf96f1343..63d98c3474240 100644
--- a/paddle/fluid/pybind/place.cc
+++ b/paddle/fluid/pybind/place.cc
@@ -200,6 +200,7 @@ PyTypeObject *g_xpuplace_pytype = nullptr;
 PyTypeObject *g_npuplace_pytype = nullptr;
 PyTypeObject *g_cudapinnedplace_pytype = nullptr;
 PyTypeObject *g_mluplace_pytype = nullptr;
+PyTypeObject *g_ipuplace_pytype = nullptr;
 
 template <typename PlaceType>
 static inline int PlaceIndex(const PlaceType &p) {  // NOLINT
@@ -611,7 +612,7 @@ void BindPlace(pybind11::module &m) {  // NOLINT
       .def("__str__", string::to_string<const platform::NPUPlace &>);
 
   // IPUPlace
-  py::class_<platform::IPUPlace>(m, "IPUPlace", R"DOC(
+  py::class_<platform::IPUPlace> ipuplace(m, "IPUPlace", R"DOC(
     IPUPlace is a descriptor of a device.
     It represents a IPU device on which a tensor will be allocated and a model will run.
 
@@ -623,7 +624,9 @@ void BindPlace(pybind11::module &m) {  // NOLINT
 
           ipu_place = paddle.IPUPlace()
 
-        )DOC")
+        )DOC");
+  g_ipuplace_pytype = reinterpret_cast<PyTypeObject *>(ipuplace.ptr());
+  ipuplace
       .def("__init__",
            [](platform::IPUPlace &self) {
 #ifdef PADDLE_WITH_IPU

From 84d595faecddb9a3e53b007f63bde5b4aac6c390 Mon Sep 17 00:00:00 2001
From: ming1753 <61511741+ming1753@users.noreply.github.com>
Date: Wed, 27 Jul 2022 17:03:45 +0800
Subject: [PATCH 23/28] Strided slice fp16 (#44653)

---
 paddle/phi/kernels/gpu/strided_slice_kernel.cu | 2 ++
 paddle/phi/kernels/strided_slice_kernel.cc     | 1 +
 2 files changed, 3 insertions(+)

diff --git a/paddle/phi/kernels/gpu/strided_slice_kernel.cu b/paddle/phi/kernels/gpu/strided_slice_kernel.cu
index 716150ff47dea..786ccb287c271 100644
--- a/paddle/phi/kernels/gpu/strided_slice_kernel.cu
+++ b/paddle/phi/kernels/gpu/strided_slice_kernel.cu
@@ -28,6 +28,7 @@ PD_REGISTER_KERNEL(strided_slice_raw,
                    int64_t,
                    float,
                    double,
+                   phi::dtype::float16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
 
@@ -40,5 +41,6 @@ PD_REGISTER_KERNEL(strided_slice_array,
                    int64_t,
                    float,
                    double,
+                   phi::dtype::float16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/strided_slice_kernel.cc b/paddle/phi/kernels/strided_slice_kernel.cc
index 547d574cd78d0..037abf461a306 100644
--- a/paddle/phi/kernels/strided_slice_kernel.cc
+++ b/paddle/phi/kernels/strided_slice_kernel.cc
@@ -55,6 +55,7 @@ PD_REGISTER_KERNEL(strided_slice,
                    int64_t,
                    float,
                    double,
+                   phi::dtype::float16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
 #endif

From f49b0cb9f26df7966881e4d6953d735ba443f719 Mon Sep 17 00:00:00 2001
From: qipengh <huangqipeng@cambricon.com>
Date: Wed, 27 Jul 2022 17:16:53 +0800
Subject: [PATCH 24/28] [MLU]fix sync_batch_norm and concat_grad op (#44586)

---
 paddle/fluid/operators/concat_op_mlu.cc       |  9 +++++++--
 .../fluid/operators/sync_batch_norm_op_mlu.cc |  9 ++++++++-
 .../unittests/mlu/sync_batch_norm_op_mlu.py   |  4 ++--
 .../mlu/test_sync_batch_norm_base_mlu.py      |  4 ++--
 .../mlu/test_sync_batch_norm_op_mlu.sh        | 19 +++++++++++++++++++
 .../test_sync_batch_norm_op_mlu_baseline.py   |  2 +-
 .../mlu/test_sync_batch_norm_op_mlu_extra.py  |  5 +++--
 7 files changed, 42 insertions(+), 10 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu.sh

diff --git a/paddle/fluid/operators/concat_op_mlu.cc b/paddle/fluid/operators/concat_op_mlu.cc
index f254228f8a097..a4cc1c37db0cf 100644
--- a/paddle/fluid/operators/concat_op_mlu.cc
+++ b/paddle/fluid/operators/concat_op_mlu.cc
@@ -121,6 +121,7 @@ class ConcatGradMLUKernel : public framework::OpKernel<T> {
             out_grad->dims().size()));
     // get output tensor that the name is not kEmptyVarName
     std::vector<void*> outputs_vec;
+    std::vector<Tensor> tmp_outputs_vec;
     std::vector<MLUCnnlTensorDesc> output_descs;
     std::vector<cnnlTensorDescriptor_t> descs_vec;
     for (size_t j = 0; j < outs.size(); ++j) {
@@ -128,11 +129,15 @@ class ConcatGradMLUKernel : public framework::OpKernel<T> {
           outs[j]->numel() != 0UL) {
         outs[j]->mutable_data<T>(ctx.GetPlace());
         output_descs.emplace_back(MLUCnnlTensorDesc(*outs[j]));
-        descs_vec.push_back(output_descs.back().get());
         outputs_vec.push_back(GetBasePtr(outs[j]));
       } else {
-        outputs_vec.push_back(nullptr);
+        Tensor tmp_tensor;
+        tmp_tensor.mutable_data<T>(ins[j]->dims(), ctx.GetPlace());
+        tmp_outputs_vec.push_back(tmp_tensor);
+        output_descs.emplace_back(MLUCnnlTensorDesc(*ins[j]));
+        outputs_vec.push_back(GetBasePtr(&(tmp_outputs_vec.back())));
       }
+      descs_vec.push_back(output_descs.back().get());
     }
 
     MLUCnnlTensorDesc out_grad_desc(*out_grad);
diff --git a/paddle/fluid/operators/sync_batch_norm_op_mlu.cc b/paddle/fluid/operators/sync_batch_norm_op_mlu.cc
index ce511a12bbfdb..a2091aa10a73b 100644
--- a/paddle/fluid/operators/sync_batch_norm_op_mlu.cc
+++ b/paddle/fluid/operators/sync_batch_norm_op_mlu.cc
@@ -23,7 +23,9 @@ limitations under the Licnse. */
 namespace paddle {
 namespace operators {
 
+#define NO_USE_CNCL 0
 #define GET_LAYOUT_OFFSET 2
+
 using Tensor = framework::Tensor;
 static std::vector<cnnlTensorLayout_t> supported_input_layout = {
     CNNL_LAYOUT_NC, CNNL_LAYOUT_NLC, CNNL_LAYOUT_NHWC, CNNL_LAYOUT_NDHWC};
@@ -165,6 +167,7 @@ class SyncBatchNormMLUKernel : public framework::OpKernel<T> {
       Tensor mean_all(mean->dtype());
       Tensor invstd_all(variance->dtype());
 
+#ifdef PADDLE_WITH_CNCL
       auto &dev_ctx =
           ctx.template device_context<paddle::platform::MLUDeviceContext>();
       auto stream = dev_ctx.stream();
@@ -205,7 +208,9 @@ class SyncBatchNormMLUKernel : public framework::OpKernel<T> {
                                                  cncl_dtype,
                                                  comm,
                                                  stream));
-
+#else
+      if (NO_USE_CNCL) {
+#endif
       } else {
         count_all = input_count;
         mean_all.ShareDataWith(local_mean);
@@ -404,6 +409,7 @@ class SyncBatchNormMLUGradKernel : public framework::OpKernel<T> {
     FillMLUTensorWithHostValue<int32_t>(
         ctx, static_cast<int32_t>(x->numel() / C), &numel_count);
 
+#ifdef PADDLE_WITH_CNCL
     auto &dev_ctx =
         ctx.template device_context<paddle::platform::MLUDeviceContext>();
     auto stream = dev_ctx.stream();
@@ -440,6 +446,7 @@ class SyncBatchNormMLUGradKernel : public framework::OpKernel<T> {
                                                comm,
                                                stream));
     }
+#endif
 
     if (d_x) {
       MLUCnnlTensorDesc desc_count(numel_count);
diff --git a/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py
index 4f80523a18254..5c1b8b602f269 100644
--- a/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py
@@ -35,9 +35,9 @@
 import paddle.fluid.layers as layers
 from functools import reduce
 from test_sync_batch_norm_base_mlu import TestSyncBatchNormRunnerBase, runtime_main
-from paddle.fluid.tests.unittests.op_test import OpTest, _set_use_system_allocator
+from op_test import OpTest, _set_use_system_allocator
 
-from paddle.fluid.tests.unittests.test_sync_batch_norm_op import create_or_get_tensor
+from test_sync_batch_norm_op import create_or_get_tensor
 
 _set_use_system_allocator(False)
 paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_base_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_base_mlu.py
index 3081ee9d38754..3c774e47010f9 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_base_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_base_mlu.py
@@ -33,9 +33,9 @@
 from six import string_types
 import paddle
 
-from paddle.fluid.tests.unittests.op_test import OpTest, _set_use_system_allocator
+from op_test import OpTest, _set_use_system_allocator
 
-from paddle.fluid.tests.unittests.test_sync_batch_norm_op import create_or_get_tensor
+from test_sync_batch_norm_op import create_or_get_tensor
 
 _set_use_system_allocator(False)
 paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu.sh b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu.sh
new file mode 100644
index 0000000000000..1417acb4be516
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+MLU_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch test_sync_batch_norm_op_mlu_baseline.py
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_baseline.py b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_baseline.py
index ac3f686cb8fe2..f524e47b54a92 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_baseline.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_baseline.py
@@ -20,7 +20,7 @@
 import sys
 
 sys.path.append("..")
-from paddle.fluid.tests.unittests.op_test import OpTest, _set_use_system_allocator
+from op_test import OpTest, _set_use_system_allocator
 
 from test_sync_batch_norm_base_mlu import TestDistBase
 
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_extra.py b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_extra.py
index 955d9a122a292..400d2f4afed46 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_extra.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_extra.py
@@ -29,8 +29,9 @@
 import paddle.nn as nn
 from paddle.fluid import Program, program_guard
 
-from paddle.fluid.tests.unittests.op_test import OpTest, _set_use_system_allocator
-from paddle.fluid.tests.unittests.test_dist_base import TestDistBase
+sys.path.append("..")
+from op_test import OpTest, _set_use_system_allocator
+from test_dist_base import TestDistBase
 
 paddle.enable_static()
 

From 5be7a1ffc82fa805214217fa7610661c22259402 Mon Sep 17 00:00:00 2001
From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com>
Date: Wed, 27 Jul 2022 17:29:09 +0800
Subject: [PATCH 25/28] retain dist op returns (#44634)

---
 .../paddle/distributed/auto_parallel/dist_op.py  |  4 +---
 .../tests/unittests/auto_parallel/engine_api.py  |  4 ++--
 .../tests/unittests/auto_parallel_gpt_model.py   | 16 ++++++++--------
 .../unittests/test_auto_parallel_reshard_mppp.py |  8 +-------
 4 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/python/paddle/distributed/auto_parallel/dist_op.py b/python/paddle/distributed/auto_parallel/dist_op.py
index d48804b71fc3e..b6a77b778885f 100644
--- a/python/paddle/distributed/auto_parallel/dist_op.py
+++ b/python/paddle/distributed/auto_parallel/dist_op.py
@@ -267,6 +267,4 @@ def __call__(self, *args, **kwargs):
             dist_op = DistributedOperator(op, self._dist_attr)
             dist_op.dist_attr.mark_annotated_as(self._dist_attr)
             default_dist_ctx.add_dist_op_for_program(dist_op)
-        if isinstance(output, Variable):
-            output = [output]
-        return list(output)
+        return output
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py
index ec757c03478de..9335d7d9d2e03 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py
@@ -89,11 +89,11 @@ def __init__(self,
 
     def forward(self, input):
         out = auto.shard_op(self.norm, dist_attr={"process_mesh":
-                                                  PP_MESH_0})(input)[0]
+                                                  PP_MESH_0})(input)
         out = self.linear0(out)
         out = F.gelu(out, approximate=True)
         out = auto.shard_op(self.linear1, dist_attr={"process_mesh":
-                                                     PP_MESH_1})(out)[0]
+                                                     PP_MESH_1})(out)
         out = self.dropout(out)
         out = self.linear2(out)
         self.out = out
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py b/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py
index 4695f6a4a9425..87c746ab5d3b5 100644
--- a/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py
+++ b/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py
@@ -391,7 +391,7 @@ def forward(self,
                             mod,
                             dist_attr={
                                 "process_mesh": PP_MESH_LIST[mod.mesh_idx]
-                            })(output, memory, tgt_mask, use_cache, cache)[0]
+                            })(output, memory, tgt_mask, use_cache, cache)
                         auto.shard_tensor(
                             output,
                             dist_attr={
@@ -405,7 +405,7 @@ def forward(self,
                             mod,
                             dist_attr={
                                 "process_mesh": DPPP_MESH_LIST[mod.mesh_idx]
-                            })(output, memory, tgt_mask, use_cache, cache)[0]
+                            })(output, memory, tgt_mask, use_cache, cache)
                         auto.shard_tensor(
                             output,
                             dist_attr={
@@ -419,7 +419,7 @@ def forward(self,
                             mod,
                             dist_attr={
                                 "process_mesh": MPPP_MESH_LIST[mod.mesh_idx]
-                            })(output, memory, tgt_mask, use_cache, cache)[0]
+                            })(output, memory, tgt_mask, use_cache, cache)
                         auto.shard_tensor(
                             output,
                             dist_attr={
@@ -433,7 +433,7 @@ def forward(self,
                             mod,
                             dist_attr={
                                 "process_mesh": DPMPPP_MESH_LIST[mod.mesh_idx]
-                            })(output, memory, tgt_mask, use_cache, cache)[0]
+                            })(output, memory, tgt_mask, use_cache, cache)
                         auto.shard_tensor(
                             output,
                             dist_attr={
@@ -456,7 +456,7 @@ def forward(self,
                                                    "process_mesh":
                                                    PP_MESH_LIST[mod.mesh_idx]
                                                })(output, memory, tgt_mask,
-                                                  use_cache, cache)[0]
+                                                  use_cache, cache)
                         auto.shard_tensor(
                             output,
                             dist_attr={
@@ -471,7 +471,7 @@ def forward(self,
                                                    "process_mesh":
                                                    DPPP_MESH_LIST[mod.mesh_idx]
                                                })(output, memory, tgt_mask,
-                                                  use_cache, cache)[0]
+                                                  use_cache, cache)
                         auto.shard_tensor(
                             output,
                             dist_attr={
@@ -486,7 +486,7 @@ def forward(self,
                                                    "process_mesh":
                                                    MPPP_MESH_LIST[mod.mesh_idx]
                                                })(output, memory, tgt_mask,
-                                                  use_cache, cache)[0]
+                                                  use_cache, cache)
                         auto.shard_tensor(
                             output,
                             dist_attr={
@@ -500,7 +500,7 @@ def forward(self,
                             mod,
                             dist_attr={
                                 "process_mesh": DPMPPP_MESH_LIST[mod.mesh_idx]
-                            })(output, memory, tgt_mask, use_cache, cache)[0]
+                            })(output, memory, tgt_mask, use_cache, cache)
                         auto.shard_tensor(
                             output,
                             dist_attr={
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py
index 0e647a3db5b64..dfb314796a9ff 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py
@@ -255,12 +255,6 @@ def test_allgather(self):
                                       "dims_mapping": [-1, -1]
                                   })
 
-            # y = paddle.distributed.shard_op(paddle.matmul, process_mesh, {
-            #     x.name: [-1, -1],
-            #     w.name: [-1, -1]
-            # }, **{"x": x,
-            #       "y": w})[0]
-
             y = paddle.distributed.shard_op(paddle.matmul,
                                             dist_attr={
                                                 "process_mesh": process_mesh,
@@ -270,7 +264,7 @@ def test_allgather(self):
                                                 w: {
                                                     "dims_mapping": [-1, -1]
                                                 }
-                                            })(x, w)[0]
+                                            })(x, w)
 
         rank_id = 0
         dist_context = DistributedContext()

From ae25ab5629b2229e740047589ba818e2e88e4a44 Mon Sep 17 00:00:00 2001
From: ykkk2333 <77383312+ykkk2333@users.noreply.github.com>
Date: Wed, 27 Jul 2022 17:36:54 +0800
Subject: [PATCH 26/28] xpu unittest grad compute supports more types,
 *test=kunlun (#44606)

---
 .../fluid/tests/unittests/op_test_xpu.py      | 83 ++++++++++++++++++-
 .../unittests/xpu/test_flatten2_op_xpu.py     |  4 +-
 .../test_flatten_contiguous_range_op_xpu.py   |  4 +-
 .../unittests/xpu/test_flatten_op_xpu.py      |  4 +-
 4 files changed, 84 insertions(+), 11 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/op_test_xpu.py b/python/paddle/fluid/tests/unittests/op_test_xpu.py
index cabfec949fe1e..48c8893b4cea9 100644
--- a/python/paddle/fluid/tests/unittests/op_test_xpu.py
+++ b/python/paddle/fluid/tests/unittests/op_test_xpu.py
@@ -32,13 +32,13 @@
 from paddle.fluid.backward import append_backward
 from paddle.fluid.op import Operator
 from paddle.fluid.executor import Executor
-from paddle.fluid.framework import Program, OpProtoHolder, Variable
+from paddle.fluid.framework import Program, OpProtoHolder, Variable, convert_np_dtype_to_dtype_
 from testsuite import create_op, set_input, append_input_output, append_loss_ops
 from paddle.fluid import unique_name
 from white_list import op_accuracy_white_list, check_shape_white_list, compile_vs_runtime_white_list, no_check_set_white_list
 from white_list import op_threshold_white_list, no_grad_set_white_list
 from op_test import OpTest, _set_use_system_allocator, get_numeric_gradient
-from xpu.get_test_cover_info import is_empty_grad_op_type
+from xpu.get_test_cover_info import is_empty_grad_op_type, get_xpu_op_support_types, type_dict_str_to_numpy
 
 
 class XPUOpTest(OpTest):
@@ -66,6 +66,10 @@ def is_empty_grad_op(op_type):
             place = paddle.XPUPlace(0)
             if core.is_float16_supported(place) == False:
                 return
+
+        if cls.dtype == np.float64:
+            return
+
         super().tearDownClass()
 
     def _get_places(self):
@@ -144,6 +148,14 @@ def check_grad_with_place(self,
                 self._check_grad_helper()
                 return
 
+        cast_grad_op_types = get_xpu_op_support_types('cast')
+        cast_grad_op_types_np = []
+        for ctype in cast_grad_op_types:
+            cast_grad_op_types_np.append(type_dict_str_to_numpy[ctype])
+
+        if (self.dtype not in cast_grad_op_types_np):
+            return
+
         if self.dtype == np.float64:
             return
 
@@ -212,6 +224,11 @@ def get_grad_with_place(self,
             op_attrs["use_mkldnn"] = False
             use_onednn = True
 
+        mean_grad_op_types = get_xpu_op_support_types('mean')
+        mean_grad_op_types_np = []
+        for mtype in mean_grad_op_types:
+            mean_grad_op_types_np.append(type_dict_str_to_numpy[mtype])
+
         self.op = create_op(self.scope,
                             self.op_type,
                             op_inputs,
@@ -238,6 +255,68 @@ def get_grad_with_place(self,
         if not type(output_names) is list:
             output_names = [output_names]
 
+        if (self.dtype not in mean_grad_op_types_np):
+
+            prog = Program()
+            block = prog.global_block()
+            scope = core.Scope()
+            self._append_ops(block)
+
+            inputs = self._get_inputs(block)
+            outputs = self._get_outputs(block)
+            feed_dict = self.feed_var(inputs, place)
+            cast_inputs = list(map(block.var, output_names))
+            cast_outputs = block.create_var(dtype="float32",
+                                            shape=cast_inputs[0].shape)
+            cast_op = block.append_op(type="cast",
+                                      inputs={"X": cast_inputs},
+                                      outputs={"Out": cast_outputs},
+                                      attrs={
+                                          "in_dtype":
+                                          convert_np_dtype_to_dtype_(
+                                              self.dtype),
+                                          "out_dtype":
+                                          core.VarDesc.VarType.FP32
+                                      })
+            cast_op.desc.infer_var_type(block.desc)
+            cast_op.desc.infer_shape(block.desc)
+
+            output_names = [cast_outputs.name]
+
+            loss = append_loss_ops(block, output_names)
+            loss_names = [loss.name]
+            recast_inputs = list(map(block.var, loss_names))
+            recast_loss = block.create_var(dtype=self.dtype,
+                                           shape=recast_inputs[0].shape)
+
+            recast_op = block.append_op(type="cast",
+                                        inputs={"X": recast_inputs},
+                                        outputs={"Out": recast_loss},
+                                        attrs={
+                                            "in_dtype":
+                                            core.VarDesc.VarType.FP32,
+                                            "out_dtype":
+                                            convert_np_dtype_to_dtype_(
+                                                self.dtype)
+                                        })
+            recast_op.desc.infer_var_type(block.desc)
+            recast_op.desc.infer_shape(block.desc)
+
+            param_grad_list = append_backward(loss=recast_loss,
+                                              parameter_list=[input_to_check],
+                                              no_grad_set=no_grad_set)
+            fetch_list = [g for p, g in param_grad_list]
+
+            executor = fluid.Executor(place)
+            return list(
+                map(
+                    np.array,
+                    executor.run(prog,
+                                 feed_dict,
+                                 fetch_list,
+                                 scope=scope,
+                                 return_numpy=False)))
+
         analytic_grads = self._get_gradient(
             inputs_to_check,
             place,
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py
index 392eed198ff95..c1b54c247d63a 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py
@@ -93,10 +93,8 @@ def init_test_case(self):
 
 
 support_types = get_xpu_op_support_types('flatten2')
-support_types_for_grad = get_xpu_op_support_types('mean')
 for stype in support_types:
-    if stype in support_types_for_grad:
-        create_test_class(globals(), XPUTestFlatten2Op, stype)
+    create_test_class(globals(), XPUTestFlatten2Op, stype)
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py
index c9426f54b1cf6..3b347f22e1efc 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py
@@ -337,10 +337,8 @@ def test_Negative():
 
 
 support_types = get_xpu_op_support_types('flatten_contiguous_range')
-support_types_for_grad = get_xpu_op_support_types('mean')
 for stype in support_types:
-    if stype in support_types_for_grad:
-        create_test_class(globals(), XPUTestFlattenOp, stype)
+    create_test_class(globals(), XPUTestFlattenOp, stype)
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py
index c3c732fa77177..cd86c9781ad17 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py
@@ -87,10 +87,8 @@ def init_test_case(self):
 
 
 support_types = get_xpu_op_support_types('flatten')
-support_types_for_grad = get_xpu_op_support_types('mean')
 for stype in support_types:
-    if stype in support_types_for_grad:
-        create_test_class(globals(), XPUTestFlattenOp, stype)
+    create_test_class(globals(), XPUTestFlattenOp, stype)
 
 if __name__ == "__main__":
     unittest.main()

From ea91ca2ff6d0f80fd372dba5bf7f3956826bdad5 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Wed, 27 Jul 2022 19:24:06 +0800
Subject: [PATCH 27/28] [Eager] Add hierarchical_sigmoid yaml (#44638)

---
 paddle/phi/api/yaml/generator/api_base.py     |  2 +-
 paddle/phi/api/yaml/legacy_api.yaml           | 12 +++++
 paddle/phi/api/yaml/legacy_backward.yaml      | 13 +++++-
 .../fluid/tests/unittests/test_hsigmoid_op.py | 45 ++++++++++++++++---
 python/paddle/nn/functional/loss.py           |  6 ++-
 5 files changed, 68 insertions(+), 10 deletions(-)

diff --git a/paddle/phi/api/yaml/generator/api_base.py b/paddle/phi/api/yaml/generator/api_base.py
index 2659d80615f2d..833eadcf9d110 100644
--- a/paddle/phi/api/yaml/generator/api_base.py
+++ b/paddle/phi/api/yaml/generator/api_base.py
@@ -135,7 +135,7 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]):
             'double': 'double',
             'bool': 'bool',
             'str': 'const std::string&',
-            'str[] ': 'const std::vector<std::string>&',
+            'str[]': 'const std::vector<std::string>&',
             'Place': 'const Place&',
             'DataLayout': 'DataLayout',
             'DataType': 'DataType',
diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml
index 6a4afd3d0626b..53514ca56691f 100644
--- a/paddle/phi/api/yaml/legacy_api.yaml
+++ b/paddle/phi/api/yaml/legacy_api.yaml
@@ -1038,6 +1038,18 @@
     func : hard_swish
   backward : hard_swish_grad
 
+# hierarchical_sigmoid
+- api : hierarchical_sigmoid
+  args : (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, int num_classes, bool remote_prefetch, int trainer_id, int64_t[] height_sections, str[] epmap, str[] table_names, bool is_sparse)
+  output : Tensor(out), Tensor(pre_out), Tensor(w_out)
+  infer_meta :
+    func : HierarchicalSigmoidInferMeta
+  optional: path, code, bias
+  kernel :
+    func : hierarchical_sigmoid
+    data_type : x
+  backward : hierarchical_sigmoid_grad
+
 # histogram
 - api : histogram
   args : (Tensor x, int64_t bins, int min, int max)
diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml
index 9d73c044dbac9..bbcb6e347e516 100644
--- a/paddle/phi/api/yaml/legacy_backward.yaml
+++ b/paddle/phi/api/yaml/legacy_backward.yaml
@@ -935,6 +935,17 @@
     func : hard_swish_grad
   inplace : (out_grad -> x_grad)
 
+- backward_api : hierarchical_sigmoid_grad
+  forward : hierarchical_sigmoid (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, int num_classes, bool remote_prefetch, int trainer_id, int64_t[] height_sections, str[] epmap, str[] table_names, bool is_sparse) -> Tensor(out), Tensor(pre_out), Tensor(w_out)
+  args : (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, Tensor pre_out, Tensor out_grad, int num_classes, bool remote_prefetch, int trainer_id, int64_t[] height_sections, str[] epmap, str[] table_names, bool is_sparse)
+  output : Tensor(x_grad), Tensor(w_grad), Tensor(bias_grad)
+  infer_meta :
+    func : GeneralTernaryGradInferMeta
+    param : [x ,w, bias]
+  optional: path, code, bias
+  kernel :
+    func : hierarchical_sigmoid_grad
+
 - backward_api : huber_loss_grad
   forward : huber_loss (Tensor input, Tensor label, float delta) -> Tensor(out), Tensor(residual)
   args : (Tensor residual, Tensor out_grad, float delta)
@@ -2396,7 +2407,7 @@
   backward : unsqueeze_double_grad
 
 - backward_api : warpctc_grad
-  forward : warpctc (Tensor logits, Tensor label, Tensor logits_length, Tensor labels_length, int blank, bool norm_by_times) -> Tensor(loss), Tensor(warpctcgrad) 
+  forward : warpctc (Tensor logits, Tensor label, Tensor logits_length, Tensor labels_length, int blank, bool norm_by_times) -> Tensor(loss), Tensor(warpctcgrad)
   args : (Tensor logits, Tensor logits_length, Tensor warpctcgrad, Tensor loss_grad, int blank, bool norm_by_times)
   output : Tensor(logits_grad)
   infer_meta :
diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
index fc8b0d114d5ac..5c5c15cc3c433 100644
--- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
@@ -172,10 +172,30 @@ def hsigmoidWithCustomTree(x, w, path_table, path_code, label, bias,
     return pre_output, out
 
 
+def python_api(input,
+               weight,
+               label,
+               path_table=None,
+               path_code=None,
+               bias=None,
+               num_classes=-1,
+               is_sparse=False,
+               remote_prefetch=False):
+    assert is_sparse == remote_prefetch, "is_sparse is equal to remote_prefetch in dygraph."
+    return paddle.nn.functional.hsigmoid_loss(input, label, num_classes, weight,
+                                              bias, path_table, path_code,
+                                              is_sparse)
+
+
+python_out_sig = ["Out"]
+
+
 class TestHSigmoidOp(OpTest):
 
     def setUp(self):
         self.op_type = "hierarchical_sigmoid"
+        self.python_api = python_api
+        self.python_out_sig = python_out_sig
         num_classes = 101
         feature_size = 5
         batch_size = 20
@@ -193,11 +213,12 @@ def setUp(self):
         self.user_grads = hsigmoid_grad(x, w, label, bias, num_classes)
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=True)
 
     def test_check_grad(self):
         self.check_grad(['X', 'W', 'Bias'], ['Out'],
-                        user_defined_grads=self.user_grads)
+                        user_defined_grads=self.user_grads,
+                        check_eager=True)
 
 
 @skip_check_grad_ci(
@@ -208,6 +229,8 @@ class TestHSigmoidOpSparse(OpTest):
 
     def setUp(self):
         self.op_type = "hierarchical_sigmoid"
+        self.python_api = python_api
+        self.python_out_sig = python_out_sig
         num_classes = 6  #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample
         feature_size = 8
         batch_size = 4
@@ -237,7 +260,7 @@ def setUp(self):
         self.outputs = {'PreOut': pre_output, 'Out': out}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=True)
 
 
 class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
@@ -318,6 +341,8 @@ class TestHSigmoidOpWithCostumTree(OpTest):
 
     def setUp(self):
         self.op_type = "hierarchical_sigmoid"
+        self.python_api = python_api
+        self.python_out_sig = python_out_sig
         num_classes = 6  #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample
         feature_size = 8
         batch_size = 4
@@ -347,10 +372,12 @@ def setUp(self):
         self.outputs = {'PreOut': pre_output, 'Out': out}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=True)
 
     def test_check_grad(self):
-        self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label'))
+        self.check_grad(['Bias', 'X', 'W'], ['Out'],
+                        no_grad_set=set('Label'),
+                        check_eager=True)
 
 
 @skip_check_grad_ci(
@@ -361,6 +388,8 @@ class TestHSigmoidOpWithCostumTreeWithoutBias(OpTest):
 
     def setUp(self):
         self.op_type = "hierarchical_sigmoid"
+        self.python_api = python_api
+        self.python_out_sig = python_out_sig
         num_classes = 6  #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample
         feature_size = 8
         batch_size = 4
@@ -394,10 +423,12 @@ def setUp(self):
         self.outputs = {'PreOut': pre_output, 'Out': out}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=True)
 
     def test_check_grad(self):
-        self.check_grad(['X', 'W'], ['Out'], no_grad_set=set('Label'))
+        self.check_grad(['X', 'W'], ['Out'],
+                        no_grad_set=set('Label'),
+                        check_eager=True)
 
 
 class TestHSigmoidLossAPI(unittest.TestCase):
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index f661d7f9dbc93..9ebc5c03ef00b 100755
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -920,7 +920,11 @@ def hsigmoid_loss(input,
             #  [2.11009121]
             #  [1.92374969]]
     """
-
+    if in_dygraph_mode():
+        out, _, _ = _C_ops.final_state_hierarchical_sigmoid(
+            input, weight, label, path_table, path_code, bias, num_classes,
+            is_sparse, 0, [], [], [], is_sparse)
+        return out
     if _non_static_mode():
         out, _, _ = _C_ops.hierarchical_sigmoid(input, weight, label,
                                                 path_table, path_code, bias,

From 8fc1cf6097062373db6874f6343bf27ab0f2780f Mon Sep 17 00:00:00 2001
From: shangliang Xu <ghostxsl@users.noreply.github.com>
Date: Wed, 27 Jul 2022 21:53:34 +0800
Subject: [PATCH 28/28] add matrix_nms in python/paddle/vision/ops.py (#44357)

---
 .../tests/unittests/test_matrix_nms_op.py     |  62 +++++--
 .../fluid/tests/unittests/test_ops_nms.py     |  16 ++
 python/paddle/vision/ops.py                   | 153 ++++++++++++++++--
 3 files changed, 201 insertions(+), 30 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_matrix_nms_op.py b/python/paddle/fluid/tests/unittests/test_matrix_nms_op.py
index 2e73e4d782d0b..c85b715b0cadf 100644
--- a/python/paddle/fluid/tests/unittests/test_matrix_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_matrix_nms_op.py
@@ -19,6 +19,7 @@
 from op_test import OpTest
 import paddle.fluid as fluid
 from paddle.fluid import Program, program_guard
+import paddle
 
 
 def softmax(x):
@@ -237,22 +238,22 @@ def set_argument(self):
 class TestMatrixNMSError(unittest.TestCase):
 
     def test_errors(self):
-        with program_guard(Program(), Program()):
-            M = 1200
-            N = 7
-            C = 21
-            BOX_SIZE = 4
-            nms_top_k = 400
-            keep_top_k = 200
-            score_threshold = 0.01
-            post_threshold = 0.
-
-            boxes_np = np.random.random((M, C, BOX_SIZE)).astype('float32')
-            scores = np.random.random((N * M, C)).astype('float32')
-            scores = np.apply_along_axis(softmax, 1, scores)
-            scores = np.reshape(scores, (N, M, C))
-            scores_np = np.transpose(scores, (0, 2, 1))
+        M = 1200
+        N = 7
+        C = 21
+        BOX_SIZE = 4
+        nms_top_k = 400
+        keep_top_k = 200
+        score_threshold = 0.01
+        post_threshold = 0.
 
+        boxes_np = np.random.random((M, C, BOX_SIZE)).astype('float32')
+        scores = np.random.random((N * M, C)).astype('float32')
+        scores = np.apply_along_axis(softmax, 1, scores)
+        scores = np.reshape(scores, (N, M, C))
+        scores_np = np.transpose(scores, (0, 2, 1))
+
+        with program_guard(Program(), Program()):
             boxes_data = fluid.data(name='bboxes',
                                     shape=[M, C, BOX_SIZE],
                                     dtype='float32')
@@ -268,6 +269,12 @@ def test_bboxes_Variable():
                                         keep_top_k=keep_top_k,
                                         score_threshold=score_threshold,
                                         post_threshold=post_threshold)
+                paddle.vision.ops.matrix_nms(bboxes=boxes_np,
+                                             scores=scores_data,
+                                             nms_top_k=nms_top_k,
+                                             keep_top_k=keep_top_k,
+                                             score_threshold=score_threshold,
+                                             post_threshold=post_threshold)
 
             def test_scores_Variable():
                 # the scores type must be Variable
@@ -277,6 +284,12 @@ def test_scores_Variable():
                                         keep_top_k=keep_top_k,
                                         score_threshold=score_threshold,
                                         post_threshold=post_threshold)
+                paddle.vision.ops.matrix_nms(bboxes=boxes_data,
+                                             scores=scores_np,
+                                             nms_top_k=nms_top_k,
+                                             keep_top_k=keep_top_k,
+                                             score_threshold=score_threshold,
+                                             post_threshold=post_threshold)
 
             def test_empty():
                 # when all score are lower than threshold
@@ -289,6 +302,15 @@ def test_empty():
                                             post_threshold=post_threshold)
                 except Exception as e:
                     self.fail(e)
+                try:
+                    paddle.vision.ops.matrix_nms(bboxes=boxes_data,
+                                                 scores=scores_data,
+                                                 nms_top_k=nms_top_k,
+                                                 keep_top_k=keep_top_k,
+                                                 score_threshold=10.,
+                                                 post_threshold=post_threshold)
+                except Exception as e:
+                    self.fail(e)
 
             def test_coverage():
                 # cover correct workflow
@@ -301,6 +323,16 @@ def test_coverage():
                                             post_threshold=post_threshold)
                 except Exception as e:
                     self.fail(e)
+                try:
+                    paddle.vision.ops.matrix_nms(
+                        bboxes=boxes_data,
+                        scores=scores_data,
+                        nms_top_k=nms_top_k,
+                        keep_top_k=keep_top_k,
+                        score_threshold=score_threshold,
+                        post_threshold=post_threshold)
+                except Exception as e:
+                    self.fail(e)
 
             self.assertRaises(TypeError, test_bboxes_Variable)
             self.assertRaises(TypeError, test_scores_Variable)
diff --git a/python/paddle/fluid/tests/unittests/test_ops_nms.py b/python/paddle/fluid/tests/unittests/test_ops_nms.py
index c775a47bd2472..3d6f2b717f261 100644
--- a/python/paddle/fluid/tests/unittests/test_ops_nms.py
+++ b/python/paddle/fluid/tests/unittests/test_ops_nms.py
@@ -197,6 +197,22 @@ def fun(x):
                     "origin out: {}\n inference model out: {}\n".format(
                         origin, res))
 
+    def test_matrix_nms_dynamic(self):
+        for device in self.devices:
+            for dtype in self.dtypes:
+                boxes, scores, category_idxs, categories = gen_args(
+                    self.num_boxes, dtype)
+                scores = np.random.rand(1, 4, self.num_boxes).astype(dtype)
+                paddle.set_device(device)
+                out = paddle.vision.ops.matrix_nms(
+                    paddle.to_tensor(boxes).unsqueeze(0),
+                    paddle.to_tensor(scores),
+                    self.threshold,
+                    post_threshold=0.,
+                    nms_top_k=400,
+                    keep_top_k=100,
+                )
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py
index cdb8417b6b9c2..aef90bb140d2b 100644
--- a/python/paddle/vision/ops.py
+++ b/python/paddle/vision/ops.py
@@ -24,21 +24,10 @@
 from paddle import _C_ops
 
 __all__ = [  #noqa
-    'yolo_loss',
-    'yolo_box',
-    'deform_conv2d',
-    'DeformConv2D',
-    'distribute_fpn_proposals',
-    'generate_proposals',
-    'read_file',
-    'decode_jpeg',
-    'roi_pool',
-    'RoIPool',
-    'psroi_pool',
-    'PSRoIPool',
-    'roi_align',
-    'RoIAlign',
-    'nms',
+    'yolo_loss', 'yolo_box', 'deform_conv2d', 'DeformConv2D',
+    'distribute_fpn_proposals', 'generate_proposals', 'read_file',
+    'decode_jpeg', 'roi_pool', 'RoIPool', 'psroi_pool', 'PSRoIPool',
+    'roi_align', 'RoIAlign', 'nms', 'matrix_nms'
 ]
 
 
@@ -1802,3 +1791,137 @@ def generate_proposals(scores,
         rpn_rois_num = None
 
     return rpn_rois, rpn_roi_probs, rpn_rois_num
+
+
+def matrix_nms(bboxes,
+               scores,
+               score_threshold,
+               post_threshold,
+               nms_top_k,
+               keep_top_k,
+               use_gaussian=False,
+               gaussian_sigma=2.,
+               background_label=0,
+               normalized=True,
+               return_index=False,
+               return_rois_num=True,
+               name=None):
+    """
+    This operator does matrix non maximum suppression (NMS).
+    First selects a subset of candidate bounding boxes that have higher scores
+    than score_threshold (if provided), then the top k candidate is selected if
+    nms_top_k is larger than -1. Score of the remaining candidate are then
+    decayed according to the Matrix NMS scheme.
+    Aftern NMS step, at most keep_top_k number of total bboxes are to be kept
+    per image if keep_top_k is larger than -1.
+    Args:
+        bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the
+                           predicted locations of M bounding bboxes,
+                           N is the batch size. Each bounding box has four
+                           coordinate values and the layout is
+                           [xmin, ymin, xmax, ymax], when box size equals to 4.
+                           The data type is float32 or float64.
+        scores (Tensor): A 3-D Tensor with shape [N, C, M]
+                           represents the predicted confidence predictions.
+                           N is the batch size, C is the class number, M is
+                           number of bounding boxes. For each category there
+                           are total M scores which corresponding M bounding
+                           boxes. Please note, M is equal to the 2nd dimension
+                           of BBoxes. The data type is float32 or float64.
+        score_threshold (float): Threshold to filter out bounding boxes with
+                                 low confidence score.
+        post_threshold (float): Threshold to filter out bounding boxes with
+                                low confidence score AFTER decaying.
+        nms_top_k (int): Maximum number of detections to be kept according to
+                         the confidences after the filtering detections based
+                         on score_threshold.
+        keep_top_k (int): Number of total bboxes to be kept per image after NMS
+                          step. -1 means keeping all bboxes after NMS step.
+        use_gaussian (bool): Use Gaussian as the decay function. Default: False
+        gaussian_sigma (float): Sigma for Gaussian decay function. Default: 2.0
+        background_label (int): The index of background label, the background
+                                label will be ignored. If set to -1, then all
+                                categories will be considered. Default: 0
+        normalized (bool): Whether detections are normalized. Default: True
+        return_index(bool): Whether return selected index. Default: False
+        return_rois_num(bool): whether return rois_num. Default: True
+        name(str): Name of the matrix nms op. Default: None.
+    Returns:
+        A tuple with three Tensor: (Out, Index, RoisNum) if return_index is True,
+        otherwise, a tuple with two Tensor (Out, RoisNum) is returned.
+        Out (Tensor): A 2-D Tensor with shape [No, 6] containing the
+             detection results.
+             Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax]
+        Index (Tensor): A 2-D Tensor with shape [No, 1] containing the
+            selected indices, which are absolute values cross batches.
+        rois_num (Tensor): A 1-D Tensor with shape [N] containing
+            the number of detected boxes in each image.
+    Examples:
+        .. code-block:: python
+            import paddle
+            from paddle.vision.ops import matrix_nms
+            boxes = paddle.rand([4, 1, 4])
+            boxes[..., 2] = boxes[..., 0] + boxes[..., 2]
+            boxes[..., 3] = boxes[..., 1] + boxes[..., 3]
+            scores = paddle.rand([4, 80, 1])
+            out = matrix_nms(bboxes=boxes, scores=scores, background_label=0,
+                                 score_threshold=0.5, post_threshold=0.1,
+                                 nms_top_k=400, keep_top_k=200, normalized=False)
+    """
+    check_variable_and_dtype(bboxes, 'BBoxes', ['float32', 'float64'],
+                             'matrix_nms')
+    check_variable_and_dtype(scores, 'Scores', ['float32', 'float64'],
+                             'matrix_nms')
+    check_type(score_threshold, 'score_threshold', float, 'matrix_nms')
+    check_type(post_threshold, 'post_threshold', float, 'matrix_nms')
+    check_type(nms_top_k, 'nums_top_k', int, 'matrix_nms')
+    check_type(keep_top_k, 'keep_top_k', int, 'matrix_nms')
+    check_type(normalized, 'normalized', bool, 'matrix_nms')
+    check_type(use_gaussian, 'use_gaussian', bool, 'matrix_nms')
+    check_type(gaussian_sigma, 'gaussian_sigma', float, 'matrix_nms')
+    check_type(background_label, 'background_label', int, 'matrix_nms')
+
+    if in_dygraph_mode():
+        attrs = ('background_label', background_label, 'score_threshold',
+                 score_threshold, 'post_threshold', post_threshold, 'nms_top_k',
+                 nms_top_k, 'gaussian_sigma', gaussian_sigma, 'use_gaussian',
+                 use_gaussian, 'keep_top_k', keep_top_k, 'normalized',
+                 normalized)
+        out, index, rois_num = _C_ops.matrix_nms(bboxes, scores, *attrs)
+        if not return_index:
+            index = None
+        if not return_rois_num:
+            rois_num = None
+        return out, rois_num, index
+    else:
+        helper = LayerHelper('matrix_nms', **locals())
+        output = helper.create_variable_for_type_inference(dtype=bboxes.dtype)
+        index = helper.create_variable_for_type_inference(dtype='int32')
+        outputs = {'Out': output, 'Index': index}
+        if return_rois_num:
+            rois_num = helper.create_variable_for_type_inference(dtype='int32')
+            outputs['RoisNum'] = rois_num
+
+        helper.append_op(type="matrix_nms",
+                         inputs={
+                             'BBoxes': bboxes,
+                             'Scores': scores
+                         },
+                         attrs={
+                             'background_label': background_label,
+                             'score_threshold': score_threshold,
+                             'post_threshold': post_threshold,
+                             'nms_top_k': nms_top_k,
+                             'gaussian_sigma': gaussian_sigma,
+                             'use_gaussian': use_gaussian,
+                             'keep_top_k': keep_top_k,
+                             'normalized': normalized
+                         },
+                         outputs=outputs)
+        output.stop_gradient = True
+
+        if not return_index:
+            index = None
+        if not return_rois_num:
+            rois_num = None
+        return output, rois_num, index