From c239f15a8779c52840462f586b5f9f392f2bfd10 Mon Sep 17 00:00:00 2001 From: zhiboniu <31800336+zhiboniu@users.noreply.github.com> Date: Wed, 13 Apr 2022 18:20:54 +0800 Subject: [PATCH 01/19] tensor fluid code transfer part2 (#41096) --- python/paddle/fft.py | 3 +- .../tests/unittests/test_crop_tensor_op.py | 18 +- .../fluid/tests/unittests/test_slice_op.py | 8 +- .../tests/unittests/test_strided_slice_op.py | 8 +- python/paddle/tensor/attribute.py | 129 +- python/paddle/tensor/creation.py | 303 +++- python/paddle/tensor/linalg.py | 54 +- python/paddle/tensor/manipulation.py | 1338 ++++++++++++++++- python/paddle/tensor/random.py | 2 +- python/paddle/tensor/search.py | 2 +- python/paddle/tensor/stat.py | 2 +- 11 files changed, 1756 insertions(+), 111 deletions(-) diff --git a/python/paddle/fft.py b/python/paddle/fft.py index 975e632558feb..10d637ff8b9ba 100644 --- a/python/paddle/fft.py +++ b/python/paddle/fft.py @@ -15,7 +15,8 @@ from typing import Sequence import numpy as np import paddle -from .tensor.attribute import is_complex, is_floating_point, is_integer, _real_to_complex_dtype, _complex_to_real_dtype +from .tensor.attribute import is_complex, is_floating_point, is_integer +from .tensor.creation import _real_to_complex_dtype, _complex_to_real_dtype from .fluid.framework import _non_static_mode from . import _C_ops from .fluid.data_feeder import check_variable_and_dtype diff --git a/python/paddle/fluid/tests/unittests/test_crop_tensor_op.py b/python/paddle/fluid/tests/unittests/test_crop_tensor_op.py index a4552c8f5ddbb..04e47bd30ce24 100644 --- a/python/paddle/fluid/tests/unittests/test_crop_tensor_op.py +++ b/python/paddle/fluid/tests/unittests/test_crop_tensor_op.py @@ -17,6 +17,7 @@ import unittest import numpy as np from op_test import OpTest +import paddle import paddle.fluid as fluid @@ -225,31 +226,30 @@ def test_exception(self): offset = fluid.data(name='offset', shape=[1], dtype='int32') def attr_shape_type(): - out = fluid.layers.crop_tensor(input1, shape=3) + out = paddle.crop(input1, shape=3) def attr_shape_dtype(): - out = fluid.layers.crop_tensor(input1, shape=[2, 2.0, 3, 3]) + out = paddle.crop(input1, shape=[2, 2.0, 3, 3]) def attr_shape_value1(): - out = fluid.layers.crop_tensor(input1, shape=[2, -2, dim, 3]) + out = paddle.crop(input1, shape=[2, -2, dim, 3]) def attr_shape_value2(): - out = fluid.layers.crop_tensor(input1, shape=[2, 0, dim, 3]) + out = paddle.crop(input1, shape=[2, 0, dim, 3]) def attr_offsets_type(): - out = fluid.layers.crop_tensor( - input1, shape=[2, 2, 3, 3], offsets=0) + out = paddle.crop(input1, shape=[2, 2, 3, 3], offsets=0) def attr_offsets_dtype(): - out = fluid.layers.crop_tensor( + out = paddle.crop( input1, shape=[2, 2, 3, 3], offsets=[0, 1.0, 0, 0]) def attr_offsets_value(): - out = fluid.layers.crop_tensor( + out = paddle.crop( input1, shape=[2, 2, 3, 3], offsets=[0, -1, offset, 0]) def input_dtype(): - out = fluid.layers.crop_tensor(input2, shape=[2, 2, 3, 3]) + out = paddle.crop(input2, shape=[2, 2, 3, 3]) self.assertRaises(TypeError, attr_shape_type) self.assertRaises(TypeError, attr_shape_dtype) diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py index a565bba304184..34f296c4b6354 100644 --- a/python/paddle/fluid/tests/unittests/test_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_slice_op.py @@ -534,13 +534,13 @@ def test_1(self): # value_int64 is greater than 2147483647 which is the max of int32 value_int64 = fluid.layers.fill_constant([1], 
"int64", 2147483648) - out_1 = fluid.layers.slice( + out_1 = paddle.slice( x, axes=[0, 1, 2], starts=[-3, 0, 2], ends=[value_int64, 100, -1]) - out_2 = fluid.layers.slice( + out_2 = paddle.slice( x, axes=[0, 1, 3], starts=[minus_3, 0, 2], ends=[3, 100, -1]) - out_3 = fluid.layers.slice( + out_3 = paddle.slice( x, axes=[0, 1, 3], starts=[minus_3, 0, 2], ends=[3, 100, minus_1]) - out_4 = fluid.layers.slice(x, axes=[0, 1, 2], starts=starts, ends=ends) + out_4 = paddle.slice(x, axes=[0, 1, 2], starts=starts, ends=ends) out_5 = x[-3:3, 0:100, 2:-1] out_6 = x[minus_3:3, 0:100, :, 2:-1] diff --git a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py index ae17cb9b1b57c..4954cfc97e4e2 100644 --- a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py @@ -534,25 +534,25 @@ def test_1(self): shape=[3, 4, 5, 6], append_batch_size=False, dtype="float64") - out_1 = fluid.layers.strided_slice( + out_1 = paddle.strided_slice( x, axes=[0, 1, 2], starts=[-3, 0, 2], ends=[3, 100, -1], strides=[1, 1, 1]) - out_2 = fluid.layers.strided_slice( + out_2 = paddle.strided_slice( x, axes=[0, 1, 3], starts=[minus_3, 0, 2], ends=[3, 100, -1], strides=[1, 1, 1]) - out_3 = fluid.layers.strided_slice( + out_3 = paddle.strided_slice( x, axes=[0, 1, 3], starts=[minus_3, 0, 2], ends=[3, 100, minus_1], strides=[1, 1, 1]) - out_4 = fluid.layers.strided_slice( + out_4 = paddle.strided_slice( x, axes=[0, 1, 2], starts=starts, ends=ends, strides=strides) out_5 = x[-3:3, 0:100:2, -1:2:-1] diff --git a/python/paddle/tensor/attribute.py b/python/paddle/tensor/attribute.py index 07db7794b6d98..757b93dd88078 100644 --- a/python/paddle/tensor/attribute.py +++ b/python/paddle/tensor/attribute.py @@ -14,37 +14,128 @@ from __future__ import print_function -from ..framework import core -from ..fluid.layer_helper import LayerHelper +from ..framework import core, _non_static_mode +from ..framework import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype +from ..fluid.data_feeder import check_type + +from .creation import assign +from .creation import _complex_to_real_dtype # TODO: define functions to get tensor attributes -from ..fluid.layers import rank # noqa: F401 -from ..fluid.layers import shape # noqa: F401 import paddle from paddle import _C_ops -from paddle.static import Variable +from ..static import Variable from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode +import numpy as np + __all__ = [] -def _complex_to_real_dtype(dtype): - if dtype == core.VarDesc.VarType.COMPLEX64: - return core.VarDesc.VarType.FP32 - elif dtype == core.VarDesc.VarType.COMPLEX128: - return core.VarDesc.VarType.FP64 - else: - return dtype +def rank(input): + """ + + The OP returns the number of dimensions for a tensor, which is a 0-D int32 Tensor. + + Args: + input (Tensor): The input N-D tensor with shape of :math:`[N_1, N_2, ..., N_k]`, the data type is arbitrary. + + Returns: + Tensor, the output data type is int32.: The 0-D tensor with the dimensions of the input Tensor. + + Examples: + .. 
code-block:: python + + import paddle + + input = paddle.rand((3, 100, 100)) + rank = paddle.rank(input) + print(rank) + # 3 + """ + check_type(input, 'input', (Variable), 'input') + ndims = len(input.shape) + out = assign(np.array(ndims, 'int32')) + + return out + + +def shape(input): + """ + :alias_main: paddle.shape + :alias: paddle.shape,paddle.tensor.shape,paddle.tensor.attribute.shape + :old_api: paddle.fluid.layers.shape + + **Shape Layer** + + Get the shape of the input. + + .. code-block:: text + + Case1: + Given N-D Tensor: + input = [ [1, 2, 3, 4], [5, 6, 7, 8] ] + Then: + input.shape = [2, 4] + + Case2: + Given SelectedRows: + input.rows = [0, 4, 19] + input.height = 20 + input.value = [ [1, 2], [3, 4], [5, 6] ] # inner tensor + Then: + input.shape = [3, 2] + + Args: + input (Variable): The input can be N-D Tensor or SelectedRows with data type bool, float16, float32, float64, int32, int64. + If input variable is type of SelectedRows, returns the shape of it's inner tensor. + + Returns: + Variable (Tensor): The shape of the input variable. + + Examples: + .. code-block:: python -def _real_to_complex_dtype(dtype): - if dtype == core.VarDesc.VarType.FP32: - return core.VarDesc.VarType.COMPLEX64 - elif dtype == core.VarDesc.VarType.FP64: - return core.VarDesc.VarType.COMPLEX128 - else: - return dtype + import paddle.fluid as fluid + import numpy as np + import paddle + paddle.enable_static() + + inputs = fluid.data(name="x", shape=[3, 100, 100], dtype="float32") + output = fluid.layers.shape(inputs) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + + img = np.ones((3, 100, 100)).astype(np.float32) + + res = exe.run(fluid.default_main_program(), feed={'x':img}, fetch_list=[output]) + print(res) # [array([ 3, 100, 100], dtype=int32)] + """ + if in_dygraph_mode(): + out = _C_ops.final_state_shape(input) + out.stop_gradient = True + return out + if _in_legacy_dygraph(): + out = _C_ops.shape(input) + out.stop_gradient = True + return out + + check_variable_and_dtype(input, 'input', [ + 'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', + 'complex128' + ], 'shape') + helper = LayerHelper('shape', **locals()) + out = helper.create_variable_for_type_inference(dtype='int32') + helper.append_op( + type='shape', + inputs={'Input': input}, + outputs={'Out': out}, + stop_gradient=True) + + return out def is_complex(x): diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 95f145cf447b5..f4f1e7a3d5067 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -14,27 +14,138 @@ from __future__ import print_function import numpy as np +import math from paddle.common_ops_import import fill_constant from ..fluid.layers import utils - -from ..fluid.layers import tensor from ..static import Variable, device_guard from ..framework import _current_expected_place, _get_paddle_place from ..framework import dygraph_only from ..framework import core -from ..fluid.layer_helper import LayerHelper +from ..framework import in_dygraph_mode, _non_static_mode +from ..framework import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype from ..framework import convert_np_dtype_to_dtype_, _varbase_creator, OpProtoHolder -from paddle.tensor.attribute import _complex_to_real_dtype, _real_to_complex_dtype # TODO: define functions to get create a tensor -from ..fluid.layers import linspace # noqa: F401 import paddle from paddle import _C_ops 
-from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _in_eager_without_dygraph_check +from ..fluid.framework import _in_legacy_dygraph, _in_eager_without_dygraph_check +import warnings __all__ = [] +def _complex_to_real_dtype(dtype): + if dtype == core.VarDesc.VarType.COMPLEX64: + return core.VarDesc.VarType.FP32 + elif dtype == core.VarDesc.VarType.COMPLEX128: + return core.VarDesc.VarType.FP64 + else: + return dtype + + +def _real_to_complex_dtype(dtype): + if dtype == core.VarDesc.VarType.FP32: + return core.VarDesc.VarType.COMPLEX64 + elif dtype == core.VarDesc.VarType.FP64: + return core.VarDesc.VarType.COMPLEX128 + else: + return dtype + + +def linspace(start, stop, num, dtype=None, name=None): + r""" + This OP return fixed number of evenly spaced values within a given interval. + + Args: + start(int|float|Tensor): The input :attr:`start` is start variable of range. It is a scalar, \ + or a Tensor of shape [1] with input data type int32, int64, float32 or float64. + stop(int|float|Tensor): The input :attr:`stop` is start variable of range. It is a scalar, \ + or a Tensor of shape [1] with input data type int32, int64, float32 or float64. + num(int|Tensor): The input :attr:`num` is given num of the sequence. It is an int scalar, \ + or a Tensor of shape [1] with data type int32. + dtype(np.dtype|str, optional): The data type of output tensor, it could be + int32, int64, float32 and float64. Default: if None, the data type is float32. + name(str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`.Default: None. + + Returns: + Tensor: the output data type will be float32, float64. The 1-D tensor with fixed number of evenly spaced values, \ + the data shape of this tensor is :math:`[num]` . If the :attr:`num` is set 1, the output tensor just has \ + the value with input :attr:`start`. + + Examples: + .. 
code-block:: python + + import paddle + data = paddle.linspace(0, 10, 5, 'float32') # [0.0, 2.5, 5.0, 7.5, 10.0] + data = paddle.linspace(0, 10, 1, 'float32') # [0.0] + + """ + if dtype is None: + dtype = 'float32' + tensor_num = num + tensor_start = start + tensor_stop = stop + if not isinstance(num, Variable): + check_type(num, 'num', (int), 'linspace') + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + if not isinstance(start, Variable): + with device_guard("cpu"): + tensor_start = fill_constant([1], dtype, start) + if not isinstance(stop, Variable): + with device_guard("cpu"): + tensor_stop = fill_constant([1], dtype, stop) + if not isinstance(num, Variable): + with device_guard("cpu"): + tensor_num = fill_constant([1], 'int32', num) + if _non_static_mode(): + return _C_ops.linspace(tensor_start, tensor_stop, tensor_num, 'dtype', + dtype) + + helper = LayerHelper("linspace", **locals()) + + start_dtype = convert_dtype(tensor_start.dtype) + stop_dtype = convert_dtype(tensor_stop.dtype) + out_dtype = convert_dtype(dtype) + if isinstance(start, Variable): + check_dtype(start.dtype, 'start', + ['float32', 'float64', 'int32', 'int64'], 'linspace') + else: + check_type(start, 'start', (int, float), 'linspace') + + if isinstance(stop, Variable): + check_dtype(stop.dtype, 'stop', + ['float32', 'float64', 'int32', 'int64'], 'linspace') + else: + check_type(stop, 'stop', (int, float), 'linspace') + if isinstance(num, Variable): + check_dtype(num.dtype, 'num', ['int32'], 'linspace') + check_dtype(dtype, 'dtype', ['int32', 'int64', 'float32', 'float64'], + 'linspace') + if ((stop_dtype == "float64" or start_dtype == "float64") and + out_dtype in ["float32", "int32"]) or ((stop_dtype == "int64" or + start_dtype == "int64") and + out_dtype == "int32"): + raise ValueError( + "The dtype of start/stop is {}/{} but the attr(dtype) of linspace is {}, " + "which may cause data type overflows. Please reset attr(dtype) of linspace." + .format(start_dtype, stop_dtype, dtype)) + + out = helper.create_variable_for_type_inference(dtype=dtype) + + helper.append_op( + type='linspace', + inputs={'Start': tensor_start, + 'Stop': tensor_stop, + 'Num': tensor_num}, + attrs={'dtype': dtype}, + outputs={'Out': [out]}) + if isinstance(num, int): + out.desc.set_shape((num, )) + return out + + @dygraph_only def to_tensor(data, dtype=None, place=None, stop_gradient=True): r""" @@ -60,7 +171,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): Tensor: A Tensor constructed from ``data`` . Raises: - TypeError: If the data type of ``data`` is not scalar, list, tuple, numpy.ndarray, paddle.Tensor + TypeError: If the data type of ``data`` is not scalar, list, tuple, np.ndarray, paddle.Tensor ValueError: If ``data`` is tuple|list, it can't contain nested tuple|list with different lengths , such as: [[1, 2], [3, 4, 5]] TypeError: If ``dtype`` is not bool, float16, float32, float64, int8, int16, int32, int64, uint8, complex64, complex128 ValueError: If ``place`` is not paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace or specified pattern string. @@ -152,7 +263,7 @@ def _handle_dtype(data, dtype): return data else: raise TypeError( - "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|numpy.ndarray|paddle.Tensor". + "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|np.ndarray|paddle.Tensor". 
format(type(data))) if not dtype: if data.dtype in [ @@ -439,11 +550,39 @@ def eye(num_rows, num_columns=None, dtype=None, name=None): dtype = 'float32' if num_columns is None: num_columns = num_rows - return paddle.fluid.layers.eye(num_rows=num_rows, - num_columns=num_columns, - batch_shape=None, - dtype=dtype, - name=name) + + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + if num_columns is not None: + if not isinstance(num_columns, int) or num_columns < 0: + raise TypeError("num_columns should be a non-negative int") + else: + num_columns = num_rows + + if _non_static_mode(): + out = _C_ops.eye('dtype', dtype, 'num_rows', num_rows, 'num_columns', + num_columns) + + else: + helper = LayerHelper("eye", **locals()) + check_dtype(dtype, 'dtype', + ['float16', 'float32', 'float64', 'int32', 'int64'], 'eye') + if not isinstance(num_rows, int) or num_rows < 0: + raise TypeError("num_rows should be a non-negative int") + out = helper.create_variable_for_type_inference(dtype=dtype) + helper.append_op( + type='eye', + inputs={}, + outputs={'Out': [out]}, + attrs={ + 'num_rows': num_rows, + 'num_columns': num_columns, + 'dtype': dtype + }, + stop_gradient=True) + + out.stop_gradient = True + return out def full(shape, fill_value, dtype=None, name=None): @@ -564,7 +703,53 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): end = start start = 0 - return paddle.fluid.layers.range(start, end, step, dtype, name) + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + + if not isinstance(start, Variable): + with device_guard("cpu"): + start = fill_constant([1], dtype, start, force_cpu=True) + elif start.dtype != dtype: + start = paddle.cast(start, dtype) + + if not isinstance(end, Variable): + with device_guard("cpu"): + end = fill_constant([1], dtype, end, force_cpu=True) + elif end.dtype != dtype: + end = paddle.cast(end, dtype) + + if not isinstance(step, Variable): + with device_guard("cpu"): + step = fill_constant([1], dtype, step, force_cpu=True) + elif step.dtype != dtype: + step = paddle.cast(step, dtype) + + if in_dygraph_mode(): + return _C_ops.final_state_arange(start, end, step, dtype, + _current_expected_place()) + + if _in_legacy_dygraph(): + out = _C_ops.range(start, end, step) + out.stop_gradient = True + return out + + out_shape = None + if not isinstance(start, Variable) and not isinstance( + end, Variable) and not isinstance(step, Variable): + out_shape = [int(math.ceil((end - start) / step))] + + check_dtype(dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'], + 'range/arange') + helper = LayerHelper('range', **locals()) + out = helper.create_variable_for_type_inference(dtype, shape=out_shape) + helper.append_op( + type='range', + inputs={'Start': start, + 'End': end, + 'Step': step}, + outputs={'Out': out}) + out.stop_gradient = True + return out def _tril_triu_op(helper): @@ -1187,7 +1372,7 @@ def assign(x, output=None): The OP copies the :attr:`x` to the :attr:`output`. Parameters: - x (Tensor|numpy.ndarray|list|tuple|scalar): A tensor, numpy ndarray, tuple/list of scalar, + x (Tensor|np.ndarray|list|tuple|scalar): A tensor, numpy ndarray, tuple/list of scalar, or scalar. Its data type supports float16, float32, float64, int32, int64, and bool. Note: the float64 data will be converted to float32 because of current platform protobuf data limitation. 
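# A minimal sketch of the shape arithmetic used by the static-graph branch of
# `arange` above: when start, end and step are all plain Python scalars, the
# inferred output length is ceil((end - start) / step). The helper name
# `_arange_out_len` is illustrative only and not part of the patch.
import math

def _arange_out_len(start, end, step):
    # Mirrors out_shape = [int(math.ceil((end - start) / step))] from above.
    return int(math.ceil((end - start) / step))

assert _arange_out_len(0, 10, 3) == 4   # values 0, 3, 6, 9
assert _arange_out_len(0, 10, 2) == 5   # values 0, 2, 4, 6, 8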
@@ -1211,9 +1396,91 @@ def assign(x, output=None): result2 = paddle.assign(data) # result2 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] result3 = paddle.assign(np.array([[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]], dtype='float32')) # result3 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] """ - check_type(x, 'x', (Variable, np.ndarray, list, tuple, float, int, bool), - 'assign') - return tensor.assign(x, output) + input = x + helper = LayerHelper('assign', **locals()) + check_type(input, 'input', (Variable, np.ndarray, list, tuple, float, int, + bool), 'assign') + is_inplace = True if output is not None else False + + if np.isscalar(input) and not isinstance(input, str): + input = np.array([input]) + elif isinstance(input, (list, tuple)): + input = np.array(input) + # NOTE(Aurelius84): Why we judge core.VarBase? + # In case of @to_static, a VarBase can be as input of `assign`, + # but _non_static_mode()==False under @to_static, which means + # isinstance(VarBase, Variable) == False. It will cause return None + # after this api. + if isinstance(input, (Variable, core.VarBase)): + if _non_static_mode(): + if output is None: + if _in_legacy_dygraph(): + output = core.VarBase() + else: + output = core.eager.Tensor() + _C_ops.assign(input, output) + else: + check_dtype(input.dtype, 'input', [ + 'float16', 'uint16', 'float32', 'float64', 'int32', 'int64', + 'uint8', 'bool' + ], 'assign', '(When the type of input in assign is Variable.)') + if output is None: + output = helper.create_variable_for_type_inference( + dtype=input.dtype) + helper.append_op( + type='assign', inputs={'X': [input]}, + outputs={'Out': [output]}) + elif isinstance(input, np.ndarray): + # Not support [var, var, ...] currently. + if len(input.shape) > 0 and any(isinstance(x, Variable) for x in input): + raise TypeError( + "Required type(input) numpy.ndarray, but found `list(Variable)` in input." + ) + dtype = convert_np_dtype_to_dtype_(input.dtype) + if dtype == core.VarDesc.VarType.FP64: + # Setting FP64 numpy data is not supported in Paddle, so we + # use FP32 here + warnings.warn( + "paddle.assign doesn't support float64 input now due " + "to current platform protobuf data limitation, we convert " + "it to float32") + dtype = core.VarDesc.VarType.FP32 + if dtype == core.VarDesc.VarType.BOOL: + value_name = "bool_values" + values = [int(v) for v in input.flat] + elif dtype == core.VarDesc.VarType.FP32: + value_name = "fp32_values" + values = [float(v) for v in input.flat] + elif dtype == core.VarDesc.VarType.INT32: + value_name = "int32_values" + values = [int(v) for v in input.flat] + elif dtype == core.VarDesc.VarType.INT64: + value_name = "int64_values" + values = [int(v) for v in input.flat] + else: + raise TypeError( + "When the type of 'input' in assign is numpy.ndarray, " + "the data type of 'input' must be bool, float32, int32 or int64, but " + "received %s." % convert_dtype(dtype)) + if input.size > 1024 * 1024: + raise ValueError("The size of input is too big. 
Please consider " + "saving it to file and 'load_op' to load it") + if output is None: + output = helper.create_variable_for_type_inference( + dtype=input.dtype) + helper.append_op( + type='assign_value', + outputs={'Out': [output]}, + attrs={ + 'dtype': dtype, + 'shape': list(input.shape), + value_name: values + }) + + if is_inplace and _non_static_mode(): + output._bump_inplace_version() + + return output def clone(x, name=None): diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index a00ae8046ed68..4af4ac52209ef 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -13,14 +13,16 @@ # limitations under the License. import numpy as np -from ..fluid.layer_helper import LayerHelper +from ..framework import LayerHelper from ..framework import _varbase_creator, _dygraph_tracer, in_dygraph_mode, _non_static_mode from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype from ..static import Variable from ..fluid.framework import _in_legacy_dygraph from .manipulation import cast +from .math import multiply, add +from .logic import logical_not +from .creation import full -from ..fluid import layers import paddle from paddle.common_ops_import import core from paddle.common_ops_import import VarDesc @@ -2532,11 +2534,11 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): y = paddle.to_tensor(y, dtype=x.dtype) condition = s > cutoff - cond_int = layers.cast(condition, s.dtype) - cond_not_int = layers.cast(layers.logical_not(condition), s.dtype) - out1 = layers.elementwise_mul(1 / s, cond_int) - out2 = layers.elementwise_mul(1 / y, cond_not_int) - singular = layers.elementwise_add(out1, out2) + cond_int = cast(condition, s.dtype) + cond_not_int = cast(logical_not(condition), s.dtype) + out1 = multiply(1 / s, cond_int) + out2 = multiply(1 / y, cond_not_int) + singular = add(out1, out2) st, _ = _C_ops.unsqueeze2(singular, 'axes', [-2]) dims = list(range(len(vt.shape))) @@ -2559,11 +2561,11 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): y = paddle.to_tensor(y, dtype=s.dtype) condition = s_abs > cutoff - cond_int = layers.cast(condition, s.dtype) - cond_not_int = layers.cast(layers.logical_not(condition), s.dtype) - out1 = layers.elementwise_mul(1 / s, cond_int) - out2 = layers.elementwise_mul(1 / y, cond_not_int) - singular = layers.elementwise_add(out1, out2) + cond_int = cast(condition, s.dtype) + cond_not_int = cast(logical_not(condition), s.dtype) + out1 = multiply(1 / s, cond_int) + out2 = multiply(1 / y, cond_not_int) + singular = add(out1, out2) st, _ = _C_ops.unsqueeze2(singular, 'axes', [-2]) out_1 = u * st @@ -2597,17 +2599,17 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): 'keep_dim': True, 'reduce_all': False}) - rcond = layers.fill_constant(shape=[1], value=rcond, dtype=dtype) + rcond = full(shape=[1], fill_value=rcond, dtype=dtype) cutoff = rcond * max_singular_val y = float('inf') - y = layers.fill_constant(shape=[1], value=y, dtype=dtype) + y = full(shape=[1], fill_value=y, dtype=dtype) condition = s > cutoff - cond_int = layers.cast(condition, dtype) - cond_not_int = layers.cast(layers.logical_not(condition), dtype) - out1 = layers.elementwise_mul(1 / s, cond_int) - out2 = layers.elementwise_mul(1 / y, cond_not_int) - singular = layers.elementwise_add(out1, out2) + cond_int = cast(condition, dtype) + cond_not_int = cast(logical_not(condition), dtype) + out1 = multiply(1 / s, cond_int) + out2 = multiply(1 / y, cond_not_int) + singular = add(out1, out2) st = 
helper.create_variable_for_type_inference(dtype=dtype) st_shape = helper.create_variable_for_type_inference(dtype=dtype) @@ -2682,17 +2684,17 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): 'keep_dim': True, 'reduce_all': False}) - rcond = layers.fill_constant(shape=[1], value=rcond, dtype=s_type) + rcond = full(shape=[1], fill_value=rcond, dtype=s_type) cutoff = rcond * max_singular_val y = float('inf') - y = layers.fill_constant(shape=[1], value=y, dtype=s_type) + y = full(shape=[1], fill_value=y, dtype=s_type) condition = s_abs > cutoff - cond_int = layers.cast(condition, s_type) - cond_not_int = layers.cast(layers.logical_not(condition), s_type) - out1 = layers.elementwise_mul(1 / s, cond_int) - out2 = layers.elementwise_mul(1 / y, cond_not_int) - singular = layers.elementwise_add(out1, out2) + cond_int = cast(condition, s_type) + cond_not_int = cast(logical_not(condition), s_type) + out1 = multiply(1 / s, cond_int) + out2 = multiply(1 / y, cond_not_int) + singular = add(out1, out2) st = helper.create_variable_for_type_inference(dtype=s_type) st_shape = helper.create_variable_for_type_inference(dtype=s_type) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 3a79abd2dc06e..b0e0082c6d9c4 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -16,32 +16,723 @@ from collections import Counter from ..static import Variable, device_guard -from ..framework import core -from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _in_eager_without_dygraph_check, _non_static_mode -from ..fluid.layer_helper import LayerHelper +from ..framework import core, in_dygraph_mode +from ..fluid.framework import _in_legacy_dygraph, _in_eager_without_dygraph_check, _non_static_mode +from ..framework import LayerHelper from ..framework import OpProtoHolder, convert_np_dtype_to_dtype_, dygraph_only from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from ..fluid.layers import utils import numpy as np # TODO: define functions to manipulate a tensor -from ..fluid.layers import cast # noqa: F401 -from ..fluid.layers import slice # noqa: F401 -from ..fluid.layers import transpose # noqa: F401 -from ..fluid.layers import unstack # noqa: F401 - -from ..fluid.layers import scatter_nd # noqa: F401 -from ..fluid.layers import shard_index # noqa: F401 -from ..fluid.layers import crop_tensor as crop # noqa: F401 from ..fluid.layers.nn import _elementwise_op_in_dygraph -from ..fluid import layers from ..fluid.dygraph.inplace_utils import inplace_apis_in_dygraph_only import paddle from paddle import _C_ops -from paddle.tensor.attribute import _complex_to_real_dtype, _real_to_complex_dtype +from ..common_ops_import import dygraph_utils, fill_constant, _varbase_creator +import warnings +from .creation import zeros +from .creation import _complex_to_real_dtype +from .creation import _real_to_complex_dtype __all__ = [] +def cast(x, dtype): + """ + + This OP takes in the Tensor :attr:`x` with :attr:`x.dtype` and casts it + to the output with :attr:`dtype`. It's meaningless if the output dtype + equals the input dtype, but it's fine if you do so. + + Args: + x(Tensor): An input N-D Tensor with data type bool, float16, + float32, float64, int32, int64, uint8. + dtype(np.dtype|str): Data type of the output: + bool, float16, float32, float64, int8, int32, int64, uint8. + + Returns: + Tensor: A Tensor with the same shape as input's. + + Examples: + .. 
code-block:: python + + import paddle + + x = paddle.to_tensor([2, 3, 4], 'float64') + y = paddle.cast(x, 'uint8') + """ + if in_dygraph_mode(): + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + return _C_ops.final_state_cast(x, dtype) + + if _non_static_mode(): + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) + out = _C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) + return out + + check_variable_and_dtype(x, 'x', [ + 'bool', 'float16', 'float32', 'float64', 'int16', 'int32', 'int64', + 'uint8', 'uint16' + ], 'cast') + check_dtype(dtype, 'dtype', [ + 'bool', 'float16', 'float32', 'float64', 'int8', 'int16', 'int32', + 'int64', 'uint8', 'uint16' + ], 'cast') + + helper = LayerHelper('cast', **locals()) + out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=x.stop_gradient) + helper.append_op( + type='cast', + inputs={'X': [x]}, + outputs={'Out': [out]}, + attrs={'in_dtype': x.dtype, + 'out_dtype': out.dtype}) + return out + + +def slice(input, axes, starts, ends): + """ + This operator produces a slice of ``input`` along multiple axes. Similar to numpy: + https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html + Slice uses ``axes``, ``starts`` and ``ends`` attributes to specify the start and + end dimension for each axis in the list of axes and Slice uses this information + to slice the input data tensor. If a negative value is passed to + ``starts`` or ``ends`` such as :math:`-i`, it represents the reverse position of the + axis :math:`i-1` (here 0 is the initial position). + If the value passed to ``starts`` or ``ends`` is greater than n + (the number of elements in this dimension), it represents n. + For slicing to the end of a dimension with unknown size, it is recommended + to pass in INT_MAX. The size of ``axes`` must be equal to ``starts`` and ``ends``. + Following examples will explain how slice works: + + .. code-block:: text + + Case1: + Given: + data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] + axes = [0, 1] + starts = [1, 0] + ends = [2, 3] + Then: + result = [ [5, 6, 7], ] + + Case2: + Given: + data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] + axes = [0, 1] + starts = [0, 1] + ends = [-1, 1000] # -1 denotes the reverse 0th position of dimension 0. + Then: + result = [ [2, 3, 4], ] # result = data[0:1, 1:4] + + Args: + input (Tensor): A ``Tensor`` . The data type is ``float16``, ``float32``, ``float64``, ``int32`` or ``int64``. + axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to . + starts (list|tuple|Tensor): The data type is ``int32`` . If ``starts`` is a list or tuple, the elements of + it should be integers or Tensors with shape [1]. If ``starts`` is an Tensor, it should be an 1-D Tensor. + It represents starting indices of corresponding axis in ``axes``. + ends (list|tuple|Tensor): The data type is ``int32`` . If ``ends`` is a list or tuple, the elements of + it should be integers or Tensors with shape [1]. If ``ends`` is an Tensor, it should be an 1-D Tensor . + It represents ending indices of corresponding axis in ``axes``. + + Returns: + Tensor: A ``Tensor``. The data type is same as ``input``. + + Raises: + TypeError: The type of ``starts`` must be list, tuple or Tensor. + TypeError: The type of ``ends`` must be list, tuple or Tensor. + + Examples: + .. code-block:: python + + import paddle + + input = paddle.rand(shape=[4, 5, 6], dtype='float32') + # example 1: + # attr starts is a list which doesn't contain tensor. 
+ axes = [0, 1, 2] + starts = [-3, 0, 2] + ends = [3, 2, 4] + sliced_1 = paddle.slice(input, axes=axes, starts=starts, ends=ends) + # sliced_1 is input[0:3, 0:2, 2:4]. + + # example 2: + # attr starts is a list which contain tensor. + minus_3 = paddle.full([1], -3, "int32") + sliced_2 = paddle.slice(input, axes=axes, starts=[minus_3, 0, 2], ends=ends) + # sliced_2 is input[0:3, 0:2, 2:4]. + """ + if in_dygraph_mode(): + attrs = () + starts_tensor = None + ends_tensor = None + + if isinstance(axes, (list, tuple)): + axes = list(axes) + if len(axes) == 0: + raise ValueError( + "Input axes should not be an empty list/tuple.") + for i in range(len(axes)): + if axes[i] < 0: + axes[i] = max(0, axes[i] + len(input.shape)) + else: + axes[i] = min(len(input.shape) - 1, axes[i]) + + else: + raise ValueError( + "Input axes must be a python list or tuple, but reveived {}". + format(type(axes))) + + infer_flags = list(1 for i in range(len(axes))) + + tmp_tensor_type = core.eager.Tensor + + if isinstance(starts, (list, tuple)): + starts = [ + item.numpy().item(0) + if isinstance(item, tmp_tensor_type) else item + for item in starts + ] + attrs += ('starts', starts) + elif isinstance(starts, tmp_tensor_type): + starts_tensor = starts + starts.stop_gradient = True + infer_flags = list(-1 for i in range(len(axes))) + + if isinstance(ends, (list, tuple)): + ends = [ + item.numpy().item(0) + if isinstance(item, tmp_tensor_type) else item for item in ends + ] + attrs += ('ends', ends) + elif isinstance(ends, tmp_tensor_type): + ends_tensor = ends + ends_tensor.stop_gradient = True + infer_flags = list(-1 for i in range(len(axes))) + return _C_ops.slice(input, starts_tensor, ends_tensor, None, None, + 'axes', axes, 'infer_flags', infer_flags, *attrs) + else: + if _in_legacy_dygraph(): + attrs = () + starts_tensor = None + ends_tensor = None + + if isinstance(axes, (list, tuple)): + axes = list(axes) + if len(axes) == 0: + raise ValueError( + "Input axes should not be an empty list/tuple.") + for i in range(len(axes)): + if axes[i] < 0: + axes[i] = max(0, axes[i] + len(input.shape)) + else: + axes[i] = min(len(input.shape) - 1, axes[i]) + + else: + raise ValueError( + "Input axes must be a python list or tuple, but reveived {}". 
+ format(type(axes))) + + infer_flags = list(1 for i in range(len(axes))) + + tmp_tensor_type = Variable + + if isinstance(starts, (list, tuple)): + starts = [ + item.numpy().item(0) + if isinstance(item, tmp_tensor_type) else item + for item in starts + ] + attrs += ('starts', starts) + elif isinstance(starts, tmp_tensor_type): + starts_tensor = starts + starts.stop_gradient = True + infer_flags = list(-1 for i in range(len(axes))) + + if isinstance(ends, (list, tuple)): + ends = [ + item.numpy().item(0) + if isinstance(item, tmp_tensor_type) else item + for item in ends + ] + attrs += ('ends', ends) + elif isinstance(ends, tmp_tensor_type): + ends_tensor = ends + ends_tensor.stop_gradient = True + infer_flags = list(-1 for i in range(len(axes))) + + return _C_ops.slice(input, starts_tensor, ends_tensor, None, None, + 'axes', axes, 'infer_flags', infer_flags, + *attrs) + + if not isinstance(starts, (list, tuple, Variable)): + raise ValueError( + "Input starts must be an Variable, python list or tuple.") + if not isinstance(ends, (list, tuple, Variable)): + raise ValueError( + "Input ends must be an Variable, python list or tuple.") + + helper = LayerHelper('slice', **locals()) + + inputs = {'Input': input} + attrs = {'axes': axes} + infer_flags = list(1 for i in range(len(axes))) + + # starts + if isinstance(starts, Variable): + starts.stop_gradient = True + inputs['StartsTensor'] = starts + infer_flags = list(-1 for i in range(len(axes))) + elif isinstance(starts, (list, tuple)): + attrs['starts'] = [] + if utils._contain_var(starts): + inputs['StartsTensorList'] = utils._convert_to_tensor_list(starts) + for i, dim in enumerate(starts): + if isinstance(dim, Variable): + attrs['starts'].append(-1) + infer_flags[i] = -1 + else: + attrs['starts'].append(dim) + else: + attrs['starts'] = starts + + # ends + if isinstance(ends, Variable): + ends.stop_gradient = True + inputs['EndsTensor'] = ends + infer_flags = list(-1 for i in range(len(axes))) + elif isinstance(ends, (list, tuple)): + attrs['ends'] = [] + if utils._contain_var(ends): + inputs['EndsTensorList'] = utils._convert_to_tensor_list(ends) + for i, dim in enumerate(ends): + if isinstance(dim, Variable): + attrs['ends'].append(-1) + infer_flags[i] = -1 + else: + attrs['ends'].append(dim) + else: + attrs['ends'] = ends + + # infer_flags + attrs['infer_flags'] = infer_flags + out = helper.create_variable_for_type_inference( + dtype=helper.input_dtype('input')) + helper.append_op( + type='slice', inputs=inputs, attrs=attrs, outputs={'Out': out}) + + return out + + +def transpose(x, perm, name=None): + """ + Permute the data dimensions of `input` according to `perm`. + + The `i`-th dimension of the returned tensor will correspond to the + perm[i]-th dimension of `input`. + + Args: + x (Tensor): The input Tensor. It is a N-D Tensor of data types bool, float32, float64, int32. + perm (list|tuple): Permute the input according to the data of perm. + name (str): The name of this layer. It is optional. + + Returns: + Tensor: A transposed n-D Tensor, with data type being bool, float32, float64, int32, int64. + + For Example: + + .. 
code-block:: text + + x = [[[ 1 2 3 4] [ 5 6 7 8] [ 9 10 11 12]] + [[13 14 15 16] [17 18 19 20] [21 22 23 24]]] + shape(x) = [2,3,4] + + # Example 1 + perm0 = [1,0,2] + y_perm0 = [[[ 1 2 3 4] [13 14 15 16]] + [[ 5 6 7 8] [17 18 19 20]] + [[ 9 10 11 12] [21 22 23 24]]] + shape(y_perm0) = [3,2,4] + + # Example 2 + perm1 = [2,1,0] + y_perm1 = [[[ 1 13] [ 5 17] [ 9 21]] + [[ 2 14] [ 6 18] [10 22]] + [[ 3 15] [ 7 19] [11 23]] + [[ 4 16] [ 8 20] [12 24]]] + shape(y_perm1) = [4,3,2] + + Examples: + + .. code-block:: python + + import paddle + + x = paddle.randn([2, 3, 4]) + x_transposed = paddle.transpose(x, perm=[1, 0, 2]) + print(x_transposed.shape) + # [3L, 2L, 4L] + + """ + if in_dygraph_mode(): + return _C_ops.final_state_transpose(x, perm) + else: + if _in_legacy_dygraph(): + out, _ = _C_ops.transpose2(x, 'axis', perm) + return out + + check_variable_and_dtype(x, 'x', [ + 'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', + 'complex128' + ], 'transpose') + check_type(perm, 'perm', (list, tuple), 'transpose') + if isinstance(perm, tuple): + perm = list(perm) + if len(perm) != len(x.shape): + raise ValueError( + "Input(perm) is the permutation of dimensions of Input(x), " + "its length should be equal to dimensions of Input(x), " + "but received dimension of Input(x) is %s, " + "the length of Input(perm) is %s." % (len(x.shape), len(perm))) + for idx, dim in enumerate(perm): + if dim >= len(x.shape): + raise ValueError( + "Each element in Input(perm) should be less than Input(x)'s dimension, " + "but %d-th element in Input(perm) is %d which exceeds Input(x)'s " + "dimension %d." % (idx, perm[idx], len(x.shape))) + + helper = LayerHelper('transpose', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + x_shape = helper.create_variable_for_type_inference(x.dtype) + helper.append_op( + type='transpose2', + inputs={'X': [x]}, + outputs={'Out': [out], + 'XShape': [x_shape]}, + attrs={'axis': perm}) + return out + + +def unstack(x, axis=0, num=None): + """ + :alias_main: paddle.unstack + :alias: paddle.unstack,paddle.tensor.unstack,paddle.tensor.manipulation.unstack + :old_api: paddle.fluid.layers.unstack + + **UnStack Layer** + + This layer unstacks input Tensor :code:`x` into several Tensors along :code:`axis`. + + If :code:`axis` < 0, it would be replaced with :code:`axis+rank(x)`. + If :code:`num` is None, it would be inferred from :code:`x.shape[axis]`, + and if :code:`x.shape[axis]` <= 0 or is unknown, :code:`ValueError` is + raised. + + Args: + x (Tensor): Input Tensor. It is a N-D Tensors of data types float32, float64, int32, int64. + axis (int): The axis along which the input is unstacked. + num (int|None): The number of output variables. + + Returns: + list(Tensor): The unstacked Tensors list. The list elements are N-D Tensors of data types float32, float64, int32, int64. + + Raises: + ValueError: If x.shape[axis] <= 0 or axis is not in range [-D, D). + + Examples: + .. 
code-block:: python + + import paddle + x = paddle.ones(name='x', shape=[2, 3, 5], dtype='float32') # create a tensor with shape=[2, 3, 5] + y = paddle.unstack(x, axis=1) # unstack with second axis, which results 3 tensors with shape=[2, 5] + + """ + if _non_static_mode(): + if num == None: + num = x.shape[axis] + if num == 0: + return [] + return _C_ops.unstack(x, num, 'axis', int(axis), 'num', num) + + helper = LayerHelper('unstack', **locals()) + if num is None: + if axis is None or x.shape[axis] <= 0: + raise ValueError('unknown unstack number') + else: + num = x.shape[axis] + + outs = [] + for _ in range(num): + outs.append(helper.create_variable_for_type_inference(x.dtype)) + + helper.append_op( + type='unstack', + inputs={'X': [x]}, + outputs={'Y': outs}, + attrs={'axis': axis, + 'num': num}) + return outs + + +def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): + """ + Reset the values of `input` according to the shard it beloning to. + Every value in `input` must be a non-negative integer, and + the parameter `index_num` represents the integer above the maximum + value of `input`. Thus, all values in `input` must be in the range + [0, index_num) and each value can be regarded as the offset to the beginning + of the range. The range is further split into multiple shards. Specifically, + we first compute the `shard_size` according to the following formula, + which represents the number of integers each shard can hold. So for the + i'th shard, it can hold values in the range [i*shard_size, (i+1)*shard_size). + :: + + shard_size = (index_num + nshards - 1) // nshards + + For each value `v` in `input`, we reset it to a new value according to the + following formula: + :: + + v = v - shard_id * shard_size if shard_id * shard_size <= v < (shard_id+1) * shard_size else ignore_value + + That is, the value `v` is set to the new offset within the range represented by the shard `shard_id` + if it in the range. Otherwise, we reset it to be `ignore_value`. + + Args: + input (Tensor): Input tensor with data type int64 or int32. It's last dimension must be 1. + index_num (int): An integer represents the integer above the maximum value of `input`. + nshards (int): The number of shards. + shard_id (int): The index of the current shard. + ignore_value (int): An integer value out of sharded index range. + + Returns: + Tensor. + + Examples: + .. code-block:: python + + import paddle + label = paddle.to_tensor([[16], [1]], "int64") + shard_label = paddle.shard_index(input=label, + index_num=20, + nshards=2, + shard_id=0) + print(shard_label) + # [[-1], [1]] + """ + if in_dygraph_mode(): + return _C_ops.final_state_shard_index(input, index_num, nshards, + shard_id, ignore_value) + + check_variable_and_dtype(input, 'input', ['int64', 'int32'], 'shard_index') + op_type = 'shard_index' + helper = LayerHelper(op_type, **locals()) + if shard_id < 0 or shard_id >= nshards: + raise ValueError('The shard_id(%d) should be in [0, %d)' % + (shard_id, nshards)) + + out = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type=op_type, + inputs={'X': [input]}, + outputs={'Out': out}, + attrs={ + 'index_num': index_num, + 'nshards': nshards, + 'shard_id': shard_id, + 'ignore_value': ignore_value + }, + stop_gradient=True) + return out + + +def crop(x, shape=None, offsets=None, name=None): + """ + Crop input into output, as specified by offsets and shape. + + .. 
code-block:: text + + * Case 1 (input is a 2-D Tensor): + Input: + X.shape = [3, 5] + X.data = [[0, 1, 2, 0, 0], + [0, 3, 4, 0, 0], + [0, 0, 0, 0, 0]] + Parameters: + shape = [2, 2] + offsets = [0, 1] + Output: + Out.shape = [2, 2] + Out.data = [[1, 2], + [3, 4]] + * Case 2 (input is a 3-D Tensor): + Input: + X.shape = [2, 3, 4] + X.data = [[[0, 1, 2, 3], + [0, 5, 6, 7], + [0, 0, 0, 0]], + [[0, 3, 4, 5], + [0, 6, 7, 8], + [0, 0, 0, 0]]] + Parameters: + shape = [2, 2, -1] + offsets = [0, 0, 1] + Output: + Out.shape = [2, 2, 3] + Out.data = [[[1, 2, 3], + [5, 6, 7]], + [[3, 4, 5], + [6, 7, 8]]] + + Parameters: + x (Tensor): 1-D to 6-D Tensor, the data type is float32, float64, int32 or int64. + shape (list|tuple|Tensor): The output shape is specified + by `shape`. Its data type is int32. If a list/tuple, it's length must be + the same as the dimension size of `x`. If a Tensor, it should be a 1-D Tensor. + When it is a list, each element can be an integer or a Tensor of shape: [1]. + If Variable contained, it is suitable for the case that the shape may + be changed each iteration. + offsets (list|tuple|Variable, optional): Specifies the cropping + offsets at each dimension. Its data type is int32. If a list/tuple, it's length + must be the same as the dimension size of `x`. If a Tensor, it should be a 1-D + Tensor. When it is a list, each element can be an integer or a Tensor of shape: [1]. + If Variable contained, it is suitable for the case that the offsets may be changed + each iteration. Default: None, the offsets are 0 at each dimension. + name(str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name` . + + Returns: + Tensor: The cropped Tensor has same data type with `x`. + + Examples: + + .. code-block:: python + :name: code-example1 + + import paddle + x = paddle.to_tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + # x.shape = [3, 3] + # x = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] + + # shape can be a 1-D Tensor or list or tuple. + shape = paddle.to_tensor([2, 2], dtype='int32') + # shape = [2, 2] + # shape = (2, 2) + out = paddle.crop(x, shape) + # out.shape = [2, 2] + # out = [[1,2], [4,5]] + + # offsets can be a 1-D Tensor or list or tuple. + offsets = paddle.to_tensor([0, 1], dtype='int32') + # offsets = [1, 0] + # offsets = (1, 1) + out = paddle.crop(x, shape, offsets) + # out.shape = [2, 2] + # if offsets = [0, 0], out = [[1,2], [4,5]] + # if offsets = [0, 1], out = [[2,3], [5,6]] + # if offsets = [1, 0], out = [[4,5], [7,8]] + # if offsets = [1, 1], out = [[5,6], [8,9]] + + """ + helper = LayerHelper('crop_tensor', **locals()) + check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], + 'crop_tensor') + check_type(shape, 'shape', (list, tuple, Variable), 'crop_tensor') + check_type(offsets, 'offsets', (list, tuple, Variable, type(None)), + 'crop_tensor') + + if offsets is None: + offsets = [0] * len(x.shape) + + out = helper.create_variable_for_type_inference(x.dtype) + ipts = {'X': x} + attrs = {} + + def _attr_shape_check(shape_val): + if not isinstance(shape_val, int): + raise TypeError( + "Attr(shape)'s dtype of Op(crop_tensor) should be int32, but received: %s." + % type(shape_val)) + if shape_val == 0: + raise ValueError( + "Attr(shape) of Op(crop_tensor) should not be zero, but received: %s." + % str(shape_val)) + if shape_val < -1: + raise ValueError( + "When the element in Attr(shape) of Op(crop_tensor) is negative, only -1 is supported, but received: %s." 
+ % str(shape_val)) + + def _attr_offsets_check(offset_val): + if not isinstance(offset_val, int): + raise TypeError( + "Attr(offsets)'s dtype of Op(crop_tensor) should be int32, but received: %s." + % type(offset_val)) + if offset_val < 0: + raise ValueError( + "Attr(offsets) of Op(crop_tensor) should be greater or equal to zero, but received: %s." + % str(offset_val)) + + if isinstance(offsets, Variable): + offsets.stop_gradient = True + ipts['Offsets'] = offsets + attrs['offsets'] = [-1] * len(x.shape) + elif utils._contain_var(offsets): + new_offsets_tensor = [] + offsets_attr = [] + for dim in offsets: + if isinstance(dim, Variable): + dim.stop_gradient = True + new_offsets_tensor.append(dim) + offsets_attr.append(-1) + else: + _attr_offsets_check(dim) + temp_out = helper.create_variable_for_type_inference('int32') + fill_constant([1], 'int32', dim, force_cpu=True, out=temp_out) + new_offsets_tensor.append(temp_out) + offsets_attr.append(dim) + ipts['OffsetsTensor'] = new_offsets_tensor + attrs['offsets'] = offsets_attr + else: + for offset in offsets: + _attr_offsets_check(offset) + attrs['offsets'] = offsets + + if isinstance(shape, Variable): + shape.stop_gradient = True + ipts['Shape'] = shape + elif utils._contain_var(shape): + new_shape_tensor = [] + shape_attr = [] + for dim_size in shape: + if isinstance(dim_size, Variable): + dim_size.stop_gradient = True + new_shape_tensor.append(dim_size) + shape_attr.append(0) + else: + _attr_shape_check(dim_size) + temp_out = helper.create_variable_for_type_inference('int32') + fill_constant( + [1], 'int32', dim_size, force_cpu=True, out=temp_out) + new_shape_tensor.append(temp_out) + shape_attr.append(dim_size) + ipts['ShapeTensor'] = new_shape_tensor + attrs['shape'] = shape_attr + else: + for dim_size in shape: + _attr_shape_check(dim_size) + attrs['shape'] = shape + + helper.append_op( + type='crop_tensor', + inputs=ipts, + outputs={'Out': out}, + attrs=None if len(attrs) == 0 else attrs) + return out + + @dygraph_only def fill_(x, value): """ @@ -328,7 +1019,74 @@ def concat(x, axis=0, name=None): # [11 12 13] # [14 15 16]] """ - return paddle.fluid.layers.concat(input=x, axis=axis, name=name) + input = x + if in_dygraph_mode(): + if isinstance(axis, Variable): + axis = axis.numpy() + axis = axis.item(0) + if not isinstance(input, Variable): + input = [t for t in input if t.shape.count(0) == 0] + return _C_ops.final_state_concat(input, axis) + + if _in_legacy_dygraph(): + if isinstance(axis, Variable): + axis = axis.numpy() + axis = axis.item(0) + if not isinstance(input, Variable): + input = [t for t in input if t.shape.count(0) == 0] + out = _varbase_creator() + _C_ops.concat(input, out, 'axis', axis) + return out + + check_type(input, 'input', (list, tuple, Variable), 'concat') + if not isinstance(input, Variable): + for id, x in enumerate(input): + check_variable_and_dtype( + x, 'input[' + str(id) + ']', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'concat') + if x.dtype != input[0].dtype: + raise TypeError( + "All the Tensors in the input must have the same data type.") + else: + input = [input] + check_type(axis, 'axis', (int, Variable), 'concat') + + if isinstance(axis, Variable): + check_dtype( + axis.dtype, 'axis', ['int32', 'int64'], 'concat', + "The data type of axis must be int32 or int64 when axis is a Tensor") + + helper = LayerHelper('concat', **locals()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + + if input[0].desc.type() == 
core.VarDesc.VarType.LOD_TENSOR_ARRAY: + # NOTE(liym27): Don't remove this if branch! + # This feature is supported for Dynamic-to-Static, because after transformed, the type of inputs[0] + # is LOD_TENSOR_ARRAY in some scenarios. And this feature can be used in static mode. + + assert len(input) == 1, "If the elements of 'input' in concat are Variable(LoDTensorArray), " \ + "number of the elements must be 1, but received %s." % len(input) + out_index = helper.create_variable_for_type_inference(dtype="int32") + helper.append_op( + type='tensor_array_to_tensor', + inputs={'X': input[0]}, + outputs={'Out': [out], + 'OutIndex': [out_index]}, + attrs={'axis': axis, + 'use_stack': False}) + else: + inputs = {'X': input} + attrs = {} + if isinstance(axis, Variable): + axis.stop_gradient = True + inputs['AxisTensor'] = axis + else: + attrs['axis'] = axis + + helper.append_op( + type='concat', inputs=inputs, outputs={'Out': [out]}, attrs=attrs) + return out def broadcast_tensors(input, name=None): @@ -900,7 +1658,53 @@ def stack(x, axis=0, name=None): # [3., 4.], # [5., 6.]]] """ - return layers.stack(x, axis, name) + axis = 0 if axis is None else axis + + if in_dygraph_mode(): + return _C_ops.final_state_stack(x, axis) + + if _in_legacy_dygraph(): + return _C_ops.stack(x, 'axis', axis) + + if not isinstance(x, list) and not isinstance(x, tuple): + # NOTE:(zhiqiu) Only support Variable as input if the Variable is a LOD_TENSOR_ARRAY create by create_array, array_write, array_read, etc. + # In that case, Variable is array of tensors indeed. + if isinstance(x, Variable) and x.desc.type( + ) == core.VarDesc.VarType.LOD_TENSOR_ARRAY: + x = [x] + else: + raise TypeError("The type of '%s' in %s must be %s, but received %s" + % ('x', 'stack', + 'list[Tensor], tuple[Tensor] or TensorArray', + type(x))) + + helper = LayerHelper('stack', **locals()) + + out = helper.create_variable_for_type_inference(x[0].dtype) + if x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY: + assert len(x) == 1, "If the elements of 'x' in stack are Variable(LoDTensorArray), " \ + "number of the elements must be 1, but received %s." 
% len(x) + out_index = helper.create_variable_for_type_inference(dtype="int32") + + for i in x: + check_variable_and_dtype(i, 'x', \ + ['float16', 'float32', 'float64', 'int32', 'int64'], 'stack') + + helper.append_op( + type='tensor_array_to_tensor', + inputs={'X': x[0]}, + outputs={'Out': [out], + 'OutIndex': [out_index]}, + attrs={'axis': axis, + 'use_stack': True}) + else: + helper.append_op( + type='stack', + inputs={'X': x}, + outputs={'Y': out}, + attrs={'axis': axis}) + + return out def split(x, num_or_sections, axis=0, name=None): @@ -951,8 +1755,110 @@ def split(x, num_or_sections, axis=0, name=None): print(out1.shape) # [3, 3, 5] print(out2.shape) # [3, 3, 5] """ - return paddle.fluid.layers.split( - input=x, num_or_sections=num_or_sections, dim=axis, name=name) + input = x + dim = axis + if _non_static_mode(): + num = None + attrs = () + + if isinstance(dim, Variable): + dim = dim.numpy() + dim = dim.item(0) + assert len(input.shape) + dim >= 0, "(rank(x) + axis) must >= 0" + dim = (len(input.shape) + dim) if dim < 0 else dim + attrs += ('axis', dim) + + if isinstance(num_or_sections, int): + num = num_or_sections + attrs += ('num', num_or_sections) + elif isinstance(num_or_sections, (list, tuple)): + num = len(num_or_sections) + if utils._contain_var(num_or_sections): + for index, item in enumerate(num_or_sections): + if isinstance(item, Variable): + num_or_sections[index] = num_or_sections[index].numpy()[ + 0] + attrs += ('sections', list(num_or_sections)) + else: + attrs += ('sections', list(num_or_sections)) + else: + raise TypeError( + "The type of 'num_or_sections' in split must be int, list or tuple in imperative mode, but " + "received %s." % (type(num_or_sections))) + out = [_varbase_creator() for n in range(num)] + _C_ops.split(input, out, *attrs) + return out + + check_variable_and_dtype( + input, 'input', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], 'split') + check_type(num_or_sections, 'num_or_sections', (list, int, tuple), 'split') + check_type(dim, 'dim', (int, Variable), 'split') + if isinstance(dim, Variable): + check_dtype(dim.dtype, 'dim', ['int32', 'int64'], 'split') + + helper = LayerHelper('split', **locals()) + + input_shape = input.shape + inputs = {'X': input} + attrs = {'num': num_or_sections if isinstance(num_or_sections, int) else 0} + + def _get_SectionsTensorList(one_list): + tensor_list = [] + unk_dim_idx = -1 + for idx, dim_size in enumerate(one_list): + if isinstance(dim_size, Variable): + dim_size.stop_gradient = True + tensor_list.append(dim_size) + else: + assert (isinstance(dim_size, int)) + if dim_size == -1: + assert unk_dim_idx == -1, ( + "Only one value of 'num_or_section' in split can " + "be -1. But received num_or_section[%d] is also -1." % + idx) + unk_dim_idx = idx + temp_out = helper.create_variable_for_type_inference('int32') + fill_constant( + [1], 'int32', dim_size, force_cpu=True, out=temp_out) + tensor_list.append(temp_out) + return tensor_list + + if isinstance(dim, Variable): + dim.stop_gradient = True + inputs['AxisTensor'] = dim + else: + assert len(input.shape) + dim >= 0, "(rank(x) + axis) must >= 0" + dim = (len(input_shape) + dim) if dim < 0 else dim + attrs['axis'] = dim + + if isinstance(num_or_sections, int): + assert num_or_sections > 1, 'num_or_sections must be more than 1.' + if isinstance(dim, int) and input_shape[dim] > 0: + assert input_shape[dim] % num_or_sections ==0, \ + "The input's size along the split dimension " \ + "must be evenly divisible by Attr(num_or_sections). 
" \ + "But %d is not evenly divisible by %d. " % (num_or_sections,input_shape[dim]) + num = num_or_sections + else: + if isinstance(dim, int) and input_shape[dim] > 0: + assert len(num_or_sections) <= input_shape[ + dim], 'len(num_or_sections) must not be more than input.shape[dim].' + num = len(num_or_sections) + attrs['sections'] = list( + map(lambda ele: -1 if isinstance(ele, Variable) else ele, + num_or_sections)) + if utils._contain_var(num_or_sections): + inputs['SectionsTensorList'] = _get_SectionsTensorList( + num_or_sections) + + outs = [ + helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + for i in range(num) + ] + helper.append_op( + type='split', inputs=inputs, outputs={'Out': outs}, attrs=attrs) + return outs def squeeze(x, axis=None, name=None): @@ -1035,7 +1941,30 @@ def squeeze(x, axis=None, name=None): elif isinstance(axis, tuple): axis = list(axis) - return layers.squeeze(x, axis, name) + input = x + axes = axis + if in_dygraph_mode(): + return _C_ops.final_state_squeeze(input, axes)[1] + if _in_legacy_dygraph(): + out, _ = _C_ops.squeeze2(input, 'axes', axes) + return out + + helper = LayerHelper("squeeze", **locals()) + check_variable_and_dtype(input, 'input', [ + 'float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64', + 'complex64', 'complex128' + ], 'squeeze') + check_type(axes, 'axis/axes', (list, tuple), 'squeeze') + out = helper.create_variable_for_type_inference(dtype=input.dtype) + x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type="squeeze2", + inputs={"X": input}, + attrs={"axes": axes}, + outputs={"Out": out, + "XShape": x_shape}) + + return out @inplace_apis_in_dygraph_only @@ -1335,8 +2264,61 @@ def unsqueeze(x, axis, name=None): print(out3[0, 0, 0, 0, 0]) # [10.] 
""" + input = x + axes = axis + if _non_static_mode(): + if isinstance(axes, int): + axes = [axes] + elif isinstance(axes, Variable): + axes = axes.numpy().tolist() + elif isinstance(axes, (list, tuple)): + axes = [ + item.numpy().item(0) if isinstance(item, Variable) else item + for item in axes + ] + if _in_legacy_dygraph(): + out, _ = _C_ops.unsqueeze2(input, 'axes', axes) + return out + return _C_ops.final_state_unsqueeze(input, axes)[1] + + check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze') + check_variable_and_dtype(input, 'input', [ + 'float16', + 'float32', + 'float64', + 'bool', + 'int8', + 'int16', + 'int32', + 'int64', + 'complex64', + 'complex128', + ], 'unsqueeze') + helper = LayerHelper("unsqueeze2", **locals()) + inputs = {"X": input} + attrs = {} + + if isinstance(axes, int): + axes = [axes] + if isinstance(axes, Variable): + axes.stop_gradient = True + inputs["AxesTensor"] = axes + elif isinstance(axes, (list, tuple)): + if utils._contain_var(axes): + inputs["AxesTensorList"] = utils._convert_to_tensor_list(axes) + else: + attrs["axes"] = axes + + out = helper.create_variable_for_type_inference(dtype=input.dtype) + x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type="unsqueeze2", + inputs=inputs, + attrs=attrs, + outputs={"Out": out, + "XShape": x_shape}) - return layers.unsqueeze(x, axis, name) + return out @inplace_apis_in_dygraph_only @@ -1680,7 +2662,70 @@ def scatter_nd_add(x, index, updates, name=None): index = paddle.to_tensor(index_data) output = paddle.scatter_nd_add(x, index, updates) """ - return layers.scatter_nd_add(x, index, updates, name=None) + if in_dygraph_mode(): + op = getattr(_C_ops, 'scatter_nd_add') + return op(x, index, updates) + else: + if _in_legacy_dygraph(): + op = getattr(_C_ops, 'scatter_nd_add') + return op(x, index, updates) + else: + if x.dtype != updates.dtype: + raise ValueError("x and updates must have same data type.") + + helper = LayerHelper('scatter_nd_add', **locals()) + dtype = helper.input_dtype(input_param_name='x') + output = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type="scatter_nd_add", + inputs={"X": x, + "Index": index, + "Updates": updates}, + outputs={"Out": output}) + return output + + +def scatter_nd(index, updates, shape, name=None): + """ + **Scatter_nd Layer** + + Output is obtained by scattering the :attr:`updates` in a new tensor according + to :attr:`index` . This op is similar to :code:`scatter_nd_add`, except the + tensor of :attr:`shape` is zero-initialized. Correspondingly, :code:`scatter_nd(index, updates, shape)` + is equal to :code:`scatter_nd_add(paddle.zeros(shape, updates.dtype), index, updates)` . + If :attr:`index` has repeated elements, then the corresponding updates are accumulated. + Because of the numerical approximation issues, the different order of repeated elements + in :attr:`index` may cause different results. The specific calculation method can be + seen :code:`scatter_nd_add` . This op is the inverse of the :code:`gather_nd` op. + + Args: + index (Tensor): The index input with ndim > 1 and index.shape[-1] <= len(shape). + Its dtype should be int32 or int64 as it is used as indexes. + updates (Tensor): The updated value of scatter_nd op. Its dtype should be float32, float64. + It must have the shape index.shape[:-1] + shape[index.shape[-1]:] + shape(tuple|list): Shape of output tensor. + name (str|None): The output Tensor name. If set None, the layer will be named automatically. 
+ + Returns: + output (Tensor): The output is a tensor with the same type as :attr:`updates` . + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + + index_data = np.array([[1, 1], + [0, 1], + [1, 3]]).astype(np.int64) + index = paddle.to_tensor(index_data) + updates = paddle.rand(shape=[3, 9, 10], dtype='float32') + shape = [3, 5, 9, 10] + + output = paddle.scatter_nd(index, updates, shape) + """ + return scatter_nd_add(zeros(shape, updates.dtype), index, updates, name) def chunk(x, chunks, axis=0, name=None): @@ -1722,8 +2767,7 @@ def chunk(x, chunks, axis=0, name=None): # out2.shape [3, 3, 5] """ check_type(chunks, 'chunks', (int), 'chunk') - return paddle.fluid.layers.split( - input=x, num_or_sections=chunks, dim=axis, name=name) + return split(x, num_or_sections=chunks, axis=axis, name=name) def tile(x, repeat_times, name=None): @@ -2136,7 +3180,124 @@ def reshape(x, shape, name=None): # the value is [10.] """ - return paddle.fluid.layers.reshape(x=x, shape=shape, name=name) + actual_shape = None + act = None + inplace = False + + if in_dygraph_mode(): + tmp_tensor_type = core.eager.Tensor + #TODO(zhiqiu): enable inplace in dygraph mode. + if inplace: + warnings.warn( + "Inplace on reshape is not allowed and will be discarded in dygraph mode currently." + ) + if isinstance(shape, (list, tuple)): + shape = [ + item.numpy().item(0) if isinstance(item, Variable) else item + for item in shape + ] + out, _ = _C_ops.reshape2(x, None, 'shape', shape) + elif isinstance(shape, tmp_tensor_type): + shape.stop_gradient = True + out, _ = _C_ops.reshape2(x, shape) + else: + raise ValueError( + "shape must be an instance of `list`, `tuple` or `Variable`," + " got '{}.'".format(type(shape))) + + return dygraph_utils._append_activation_in_dygraph(out, act) + else: + if _in_legacy_dygraph(): + tmp_tensor_type = Variable + if inplace: + warnings.warn( + "Inplace on reshape is not allowed and will be discarded in dygraph mode currently." + ) + if isinstance(shape, (list, tuple)): + shape = [ + item.numpy().item(0) if isinstance(item, Variable) else item + for item in shape + ] + out, _ = _C_ops.reshape2(x, None, 'shape', shape) + elif isinstance(shape, tmp_tensor_type): + shape.stop_gradient = True + out, _ = _C_ops.reshape2(x, shape) + else: + raise ValueError( + "shape must be an instance of `list`, `tuple` or `Variable`," + " got '{}.'".format(type(shape))) + + return dygraph_utils._append_activation_in_dygraph(out, act) + + check_variable_and_dtype(x, 'x', [ + 'float16', 'float32', 'float64', 'int16', 'int32', 'int64', 'bool', + 'uint16' + ], 'reshape') + check_type(shape, 'shape', (list, tuple, Variable), 'reshape') + check_type(actual_shape, 'actual_shape', (Variable, type(None)), 'reshape') + + helper = LayerHelper("reshape2", **locals()) + + def get_attr_shape(list_shape): + unk_dim_idx = -1 + attrs_shape = [] + for dim_idx, dim_size in enumerate(list_shape): + if isinstance(dim_size, Variable): + attrs_shape.append(-1) + else: + attrs_shape.append(dim_size) + if dim_size == -1: + assert unk_dim_idx == -1, ( + "Only one dimension value of 'shape' in reshape can " + "be -1. But received shape[%d] is also -1.\n" + "\n\t# N = x.shape()[2]\t\t# N is an int. " + "(NOT recommend under @to_static)\n\tN = paddle.shape(x)[2]\t\t" + "# N is a Tensor. (Recommend)\n\tz = paddle.reshape([N, -1, 4])" + "\t# z.shape is [-1, -1, 4]\n\n" + " If your target shape in Reshape represents dynamic shape, " + "please turn it into a Tensor under @to_static. See above example for details." 
+ % dim_idx) + unk_dim_idx = dim_idx + elif dim_size == 0: + assert dim_idx < len(x.shape), ( + "The index of 0 in `shape` must be less than " + "the input tensor X's dimensions. " + "But received shape[%d] = 0, X's dimensions = %d." % + (dim_idx, len(x.shape))) + else: + assert dim_size > 0, ( + "Each dimension value of 'shape' in reshape must not " + "be negative except one unknown dimension. " + "But received shape[%d] = %s." % + (dim_idx, str(dim_size))) + return attrs_shape + + inputs = {"X": x} + attrs = {} + if isinstance(shape, Variable): + shape.stop_gradient = True + inputs["Shape"] = shape + elif isinstance(shape, (list, tuple)): + assert len(shape) > 0, ("The size of 'shape' in reshape can't be zero, " + "but received %s." % len(shape)) + attrs["shape"] = get_attr_shape(shape) + if utils._contain_var(shape): + inputs['ShapeTensor'] = utils._convert_to_tensor_list(shape) + elif isinstance(actual_shape, Variable): + actual_shape.stop_gradient = True + inputs["Shape"] = actual_shape + + out = x if inplace else helper.create_variable_for_type_inference( + dtype=x.dtype) + x_shape = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type="reshape2", + inputs=inputs, + attrs=attrs, + outputs={"Out": out, + "XShape": x_shape}) + + return helper.append_activation(out) @inplace_apis_in_dygraph_only @@ -2231,8 +3392,24 @@ def gather_nd(x, index, name=None): output = paddle.gather_nd(x, index) #[[3, 4]] """ - - return paddle.fluid.layers.gather_nd(input=x, index=index, name=name) + if in_dygraph_mode(): + return _C_ops.final_state_gather_nd(x, index) + else: + if _in_legacy_dygraph(): + return _C_ops.gather_nd(x, index) + check_variable_and_dtype( + x, 'x', ['bool', 'float32', 'float64', 'int16', 'int32', 'int64'], + 'gather_np') + check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather_np') + helper = LayerHelper('gather_nd', **locals()) + dtype = helper.input_dtype() + output = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type="gather_nd", + inputs={"X": x, + "Index": index}, + outputs={"Out": output}) + return output def strided_slice(x, axes, starts, ends, strides, name=None): @@ -2318,8 +3495,115 @@ def strided_slice(x, axes, starts, ends, strides, name=None): # sliced_2 is x[:, 1:3:1, 0:2:1, 2:4:2]. 
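                # An additional sketch, assuming the same 4-D x used in this example:
                # a negative stride walks the chosen axis in reverse, mirroring
                # Python's extended slicing semantics.
                sliced_3 = paddle.strided_slice(x, axes=[1], starts=[2], ends=[0], strides=[-1])
                # sliced_3 is x[:, 2:0:-1, :, :].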
""" - return paddle.fluid.layers.strided_slice( - input=x, axes=axes, starts=starts, ends=ends, strides=strides) + helper = LayerHelper('strided_slice', **locals()) + + check_variable_and_dtype(x, 'x', + ['bool', 'float32', 'float64', 'int32', 'int64'], + 'strided_slice') + check_type(axes, 'axes', (list, tuple), 'strided_slice') + check_type(starts, 'starts', (list, tuple, Variable), 'strided_slice') + check_type(ends, 'ends', (list, tuple, Variable), 'strided_slice') + check_type(strides, 'strides', (list, tuple, Variable), 'strided_slice') + + def check_list_elements_dtype(list_input, input_name): + if isinstance(list_input, Variable): + check_dtype(list_input.dtype, input_name, ['int32'], + 'strided_slice') + else: + for i, var in enumerate(list_input): + var_name = input_name + '[' + str(i) + ']' + if isinstance(var, Variable): + check_dtype(var.dtype, var_name, ['int32'], 'strided_slice') + + check_list_elements_dtype(axes, 'axes') + check_list_elements_dtype(starts, 'starts') + check_list_elements_dtype(ends, 'ends') + check_list_elements_dtype(strides, 'strides') + + def get_new_list_tensor(old_list): + new_list_tensor = [] + for dim in old_list: + if isinstance(dim, Variable): + dim.stop_gradient = True + new_list_tensor.append(dim) + else: + assert (isinstance(dim, int)) + temp_out = helper.create_variable_for_type_inference('int32') + fill_constant([1], 'int32', dim, force_cpu=True, out=temp_out) + new_list_tensor.append(temp_out) + return new_list_tensor + + inputs = {'Input': x} + attrs = {'axes': axes} + infer_flags = list(1 for i in range(len(axes))) + + if _non_static_mode(): + inputs = {'Input': x} + attrs = { + 'axes': axes, + 'starts': starts, + 'ends': ends, + 'strides': strides, + 'infer_flags': infer_flags + } + else: + # starts + if isinstance(starts, Variable): + starts.stop_gradient = True + inputs['StartsTensor'] = starts + elif isinstance(starts, (list, tuple)): + attrs['starts'] = [] + if utils._contain_var(starts): + inputs['StartsTensorList'] = get_new_list_tensor(starts) + for i, dim in enumerate(starts): + if isinstance(dim, Variable): + attrs['starts'].append(-1) + infer_flags[i] = -1 + else: + attrs['starts'].append(dim) + else: + attrs['starts'] = starts + + # ends + if isinstance(ends, Variable): + ends.stop_gradient = True + inputs['EndsTensor'] = ends + elif isinstance(ends, (list, tuple)): + attrs['ends'] = [] + if utils._contain_var(ends): + inputs['EndsTensorList'] = get_new_list_tensor(ends) + for i, dim in enumerate(ends): + if isinstance(dim, Variable): + attrs['ends'].append(-1) + infer_flags[i] = -1 + else: + attrs['ends'].append(dim) + else: + attrs['ends'] = ends + + # strides + if isinstance(strides, Variable): + strides.stop_gradient = True + inputs['StridesTensor'] = strides + elif isinstance(strides, (list, tuple)): + attrs['strides'] = [] + if utils._contain_var(strides): + inputs['StridesTensorList'] = get_new_list_tensor(strides) + for i, dim in enumerate(strides): + if isinstance(dim, Variable): + attrs['strides'].append(-1) + infer_flags[i] = -1 + else: + attrs['strides'].append(dim) + else: + attrs['strides'] = strides + attrs['infer_flags'] = infer_flags + out = helper.create_variable_for_type_inference( + dtype=helper.input_dtype('x')) + helper.append_op( + type='strided_slice', inputs=inputs, attrs=attrs, outputs={'Out': out}) + + return out def tensordot(x, y, axes=2, name=None): diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 3d0617e40d6b6..b82f58ea3d087 100644 --- 
a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -16,7 +16,7 @@ from ..framework import core from ..framework import convert_np_dtype_to_dtype_, dygraph_only -from ..fluid.layer_helper import LayerHelper +from ..framework import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, check_shape from ..fluid.layers import utils import paddle diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index b2fb9d6c37ff2..6855b8f0f7061 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -14,7 +14,7 @@ from __future__ import print_function import numpy as np import paddle -from ..fluid.layer_helper import LayerHelper +from ..framework import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype from ..fluid import layers from ..framework import core, in_dygraph_mode, _non_static_mode diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 89462e2a8721f..9863abe1becbb 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -16,7 +16,7 @@ import numpy as np from ..static import Variable -from ..fluid.layer_helper import LayerHelper +from ..framework import LayerHelper from ..framework import core from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode from .search import where From ca4aea2c5cc79b593d3d8ec2d6a585f7a48ce208 Mon Sep 17 00:00:00 2001 From: chenjian Date: Wed, 13 Apr 2022 18:46:34 +0800 Subject: [PATCH 02/19] fix new dygraph record event (#41715) * fix new dygraph record event * refine name * fix * fix * fix according to review --- paddle/fluid/eager/auto_code_generator/eager_generator.cc | 2 +- .../final_state_generator/codegen_utils.py | 2 +- .../auto_code_generator/final_state_generator/eager_gen.py | 4 ++-- paddle/fluid/eager/backward.cc | 2 +- python/paddle/utils/code_gen/api_base.py | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 3ed17b67b842a..726e049e61150 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -2480,7 +2480,7 @@ static std::string GenerateGradNodeHeaderContents( "%s\n" " SetIsTensorWrappersCleared(true);\n" " }\n" - " std::string name() override { return \" GradNode%s \"; } \n " + " std::string name() override { return \"GradNode%sMid\"; } \n " "\n" "std::shared_ptr Copy() const override {{\n " " auto copied_node = std::shared_ptr(new " diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index ea7b4a21a2c54..6219ecee17f30 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -137,7 +137,7 @@ def RemoveConstAndReference(string): def GetGradNodeName(string): - return f"FinalGradNode{string}" + return f"GradNode{string}Final" def GetDygraphForwardFunctionName(string): diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index d6505ebaa1e68..bd31de520750d 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ 
b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -120,7 +120,7 @@ class {} : public egr::GradNodeBase {{ virtual std::vector> operator()( std::vector>& grads, bool create_graph = false) override; - std::string name() override {{ return \" {} \"; }} + std::string name() override {{ return \"{}\"; }} void ClearTensorWrappers() override {{ {} @@ -804,7 +804,7 @@ def GenerateNodeCreationCodes(self): set_retain_grad_str = "\n".join(set_retain_grad_list) node_event_name = forward_api_name + " node_creation" - node_creation_event_str = f"{indent}paddle::platform::RecordEvent node_creation_record_event(\"{node_event_name}\", paddle::platform::TracerEventType::Operator, 1);\n" + node_creation_event_str = f"{indent}paddle::platform::RecordEvent node_creation_record_event(\"{node_event_name}\", paddle::platform::TracerEventType::OperatorInner, 1);\n" self.node_creation_str = FORWARD_BODY_TEMPLATE.format( node_creation_event_str, pass_stop_gradient_args_str, diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 974acb8646ca5..3b555eda8fff7 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -643,7 +643,7 @@ std::vector RunBackward( VLOG(6) << "Running GradNode:" << node->name(); paddle::platform::RecordEvent node_record_event( - std::string(typeid(*node).name()) + " grad_node", + std::string((*node).name()) + " grad_node", paddle::platform::TracerEventType::Operator, 1); if (queue.size() > 1 && node_in_degree_map[node] != 0) { diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 4325807746e7c..9aa3fc9eafe33 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -744,7 +744,7 @@ def gen_dense_tensor_kernel_code(self, code_indent, inplace_flag=False): {code_indent} using kernel_signature = {kernel_signature}; {code_indent} auto* kernel_fn = kernel.GetVariadicKernelFn(); {code_indent} {{ -{code_indent} paddle::platform::RecordEvent kernel_record_event(\"{api_func_name} compute\", paddle::platform::TracerEventType::Operator, 1); +{code_indent} paddle::platform::RecordEvent kernel_record_event(\"{api_func_name} compute\", paddle::platform::TracerEventType::OperatorInner, 1); {code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); {code_indent} }} @@ -771,7 +771,7 @@ def gen_selected_rows_kernel_code(self, code_indent, inplace_flag=False): {code_indent} using kernel_signature = {kernel_signature}; {code_indent} auto* kernel_fn = kernel.GetVariadicKernelFn(); {code_indent} {{ -{code_indent} paddle::platform::RecordEvent kernel_record_event(\"{api_func_name} compute\", paddle::platform::TracerEventType::Operator, 1); +{code_indent} paddle::platform::RecordEvent kernel_record_event(\"{api_func_name} compute\", paddle::platform::TracerEventType::OperatorInner, 1); {code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); {code_indent} }} From c9c03e7b41254f3fe267b2140b21da62739e713f Mon Sep 17 00:00:00 2001 From: zmxdream Date: Wed, 13 Apr 2022 19:53:40 +0800 Subject: [PATCH 03/19] [XPUPS]add support for kunlun2 (#40985) [XPUPS]add support for kunlun2 Co-authored-by: WorgenZhang --- paddle/fluid/framework/fleet/heter_context.h | 2 +- .../framework/fleet/heter_ps/CMakeLists.txt | 4 +- .../framework/fleet/heter_ps/feature_value.h | 24 +- .../framework/fleet/heter_ps/hashtable.h | 98 ++- .../{hashtable_inl.h => hashtable_kernel.cu} | 82 +- .../fleet/heter_ps/hashtable_kernel.kps | 346 ++++++++ 
.../framework/fleet/heter_ps/heter_comm.h | 100 ++- .../framework/fleet/heter_ps/heter_comm_inl.h | 764 +++++++++++------- .../fleet/heter_ps/heter_comm_kernel.cu | 269 ++++++ .../fleet/heter_ps/heter_comm_kernel.h | 86 ++ .../fleet/heter_ps/heter_comm_kernel.kps | 351 ++++++++ .../framework/fleet/heter_ps/heter_ps.cu | 8 + .../fluid/framework/fleet/heter_ps/heter_ps.h | 8 + .../framework/fleet/heter_ps/heter_ps_base.h | 2 + .../fleet/heter_ps/heter_resource.cc | 65 +- .../framework/fleet/heter_ps/heter_resource.h | 62 +- .../fluid/framework/fleet/heter_ps/mem_pool.h | 2 + .../framework/fleet/heter_ps/optimizer.cuh.h | 20 +- .../framework/fleet/heter_ps/optimizer_conf.h | 30 +- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 6 +- 20 files changed, 1938 insertions(+), 391 deletions(-) mode change 100755 => 100644 paddle/fluid/framework/fleet/heter_ps/hashtable.h rename paddle/fluid/framework/fleet/heter_ps/{hashtable_inl.h => hashtable_kernel.cu} (75%) create mode 100644 paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps create mode 100644 paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu create mode 100644 paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h create mode 100644 paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index 6d3a4c5d9c0b9..3fdcf2379cb54 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -23,7 +23,7 @@ limitations under the License. */ #include #ifdef PADDLE_WITH_PSLIB -#include "common_value.h" // NOLINT +#include "common/common_value.h" // NOLINT #endif #ifdef PADDLE_WITH_PSCORE diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 983208c0608ae..cac366d6b22a1 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -7,7 +7,9 @@ IF(WITH_GPU) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS}) endif() - nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS}) + nv_library(heter_comm_kernel SRCS heter_comm_kernel.cu feature_value.h DEPS ${HETERPS_DEPS}) + nv_library(hashtable_kernel SRCS hashtable_kernel.cu feature_value.h DEPS ${HETERPS_DEPS}) + nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h mem_pool.h DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel) nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) if(WITH_PSCORE) diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index db11fca109bc3..b633394e7a811 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -52,18 +52,18 @@ struct FeaturePushValue { float lr_g; float mf_g[MF_DIM]; - __device__ __forceinline__ FeaturePushValue - operator+(const FeaturePushValue& a) const { - FeaturePushValue out; - out.slot = a.slot; - out.show = a.show + show; - out.clk = a.clk + clk; - out.lr_g = a.lr_g + lr_g; - for (int i = 0; i < MF_DIM; ++i) { - out.mf_g[i] = a.mf_g[i] + mf_g[i]; - } - return out; - } + // __device__ __forceinline__ FeaturePushValue + // operator+(const FeaturePushValue& a) const { + // 
FeaturePushValue out; + // out.slot = a.slot; + // out.show = a.show + show; + // out.clk = a.clk + clk; + // out.lr_g = a.lr_g + lr_g; + // for (int i = 0; i < MF_DIM; ++i) { + // out.mf_g[i] = a.mf_g[i] + mf_g[i]; + // } + // return out; + // } }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h old mode 100755 new mode 100644 index e8eb91f6f6b14..6a51713d74c19 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -13,28 +13,38 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#ifdef PADDLE_WITH_HETERPS #include #include #include #include + #ifdef PADDLE_WITH_PSLIB #include "common_value.h" // NOLINT #endif -#ifdef PADDLE_WITH_PSCORE + +#if defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/ps/table/depends/feature_value.h" #endif +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/phi/core/utils/rw_lock.h" -#include "thrust/pair.h" -// #include "cudf/concurrent_unordered_map.cuh.h" + +#if defined(PADDLE_WITH_CUDA) #include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h" -#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/mem_pool.h" -#ifdef PADDLE_WITH_HETERPS #include "paddle/fluid/platform/device/gpu/gpu_types.h" +#include "thrust/pair.h" +#elif defined(__xpu__) +#include +#include "xpu/kernel/cluster_header.h" +#include "xpu/kernel/math.h" +#include "xpu/kernel/simd.h" +#endif namespace paddle { namespace framework { +#if defined(PADDLE_WITH_CUDA) template class TableContainer : public concurrent_unordered_map::max()>( capacity, ValType()) {} }; +#elif defined(PADDLE_WITH_XPU_KP) + +template +class XPUCacheArray { + public: + explicit XPUCacheArray(size_t capacity) : capacity_(capacity), size_(0) { + xpu_malloc(reinterpret_cast(&keys), capacity_ * sizeof(KeyType)); + xpu_malloc(reinterpret_cast(&vals), capacity_ * sizeof(ValType)); + } + + virtual ~XPUCacheArray() { + xpu_free(keys); + xpu_free(vals); + } + + void print() {} + // ValType* find(const KeyType& key) { return NULL; } + // bool insert(const KeyType& key, const ValType& val) { return true; } + + int prefetch(const int dev_id, XPUStream stream = NULL) {} + size_t size() { return size_; } + + private: + long long capacity_; + long long size_; + KeyType* keys; + ValType* vals; +}; +#endif template class HashTable { public: - HashTable(size_t capacity); + explicit HashTable(size_t capacity); virtual ~HashTable(); HashTable(const HashTable&) = delete; HashTable& operator=(const HashTable&) = delete; + + template void insert(const KeyType* d_keys, const ValType* d_vals, size_t len, - gpuStream_t stream); + StreamType stream); + + template void insert(const KeyType* d_keys, size_t len, char* pool, size_t start_index, - gpuStream_t stream); + StreamType stream); + + template void get(const KeyType* d_keys, ValType* d_vals, size_t len, - gpuStream_t stream); - void get(const KeyType* d_keys, char* d_vals, size_t len, gpuStream_t stream); + StreamType stream); + + template + void get(const KeyType* d_keys, char* d_vals, size_t len, StreamType stream); + void show(); - void dump_to_cpu(int devid, cudaStream_t stream); - template + template + void dump_to_cpu(int devid, StreamType stream); + +#if defined(PADDLE_WITH_CUDA) + + template void update(const KeyType* d_keys, const 
GradType* d_grads, size_t len, - Sgd sgd, gpuStream_t stream); + Sgd sgd, StreamType stream); - template + template void update(const KeyType* d_keys, const char* d_grads, size_t len, Sgd sgd, - gpuStream_t stream); + StreamType stream); + +#elif defined(PADDLE_WITH_XPU_KP) + template + void update(const KeyType* d_keys, const GradType* d_grads, size_t len, + StreamType stream); + + template + void update(const KeyType* d_keys, const char* d_grads, size_t len, + StreamType stream); + +#endif int size() { return container_->size(); } @@ -84,7 +147,11 @@ class HashTable { std::unique_ptr rwlock_{nullptr}; private: +#if defined(PADDLE_WITH_CUDA) TableContainer* container_; +#elif defined(PADDLE_WITH_XPU_KP) + XPUCacheArray* container_; +#endif int BLOCK_SIZE_{256}; float LOAD_FACTOR{0.75f}; size_t capacity_; @@ -94,5 +161,4 @@ class HashTable { }; } // end namespace framework } // end namespace paddle -#include "hashtable_inl.h" #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu similarity index 75% rename from paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h rename to paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 0297e71c35e27..cac1b9c17e077 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,10 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #ifdef PADDLE_WITH_HETERPS +#include +#include "paddle/fluid/framework/fleet/heter_ps/hashtable.h" +#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" namespace paddle { namespace framework { +#if defined(PADDLE_WITH_CUDA) + template struct ReplaceOp { __host__ __device__ value_type operator()(value_type new_value, @@ -87,6 +92,7 @@ __global__ void dy_mf_search_kernel(Table* table, } } } + template __global__ void update_kernel(Table* table, const typename Table::key_type* const keys, @@ -135,8 +141,9 @@ void HashTable::show() { } template +template void HashTable::get(const KeyType* d_keys, ValType* d_vals, - size_t len, gpuStream_t stream) { + size_t len, StreamType stream) { if (len == 0) { return; } @@ -146,8 +153,9 @@ void HashTable::get(const KeyType* d_keys, ValType* d_vals, } template +template void HashTable::get(const KeyType* d_keys, char* d_vals, - size_t len, gpuStream_t stream) { + size_t len, StreamType stream) { if (len == 0) { return; } @@ -157,9 +165,10 @@ void HashTable::get(const KeyType* d_keys, char* d_vals, } template +template void HashTable::insert(const KeyType* d_keys, const ValType* d_vals, size_t len, - gpuStream_t stream) { + StreamType stream) { if (len == 0) { return; } @@ -169,22 +178,24 @@ void HashTable::insert(const KeyType* d_keys, } template +template void HashTable::insert(const KeyType* d_keys, size_t len, char* pool, size_t start_index, - gpuStream_t stream) { + StreamType stream) { if (len == 0) { return; } - const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; if (pool == NULL) { return; } + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; insert_kernel<<>>(container_, d_keys, len, pool, start_index); } template -void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { +template +void HashTable::dump_to_cpu(int devid, StreamType stream) { container_->prefetch(cudaCpuDeviceId, stream); std::vector threads; size_t num = container_->size(); @@ -260,10 +271,10 @@ void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { } template -template +template void HashTable::update(const KeyType* d_keys, const GradType* d_grads, size_t len, - Sgd sgd, gpuStream_t stream) { + Sgd sgd, StreamType stream) { if (len == 0) { return; } @@ -273,19 +284,66 @@ void HashTable::update(const KeyType* d_keys, } template -template +template void HashTable::update(const KeyType* d_keys, const char* d_grads, size_t len, - Sgd sgd, gpuStream_t stream) { + Sgd sgd, StreamType stream) { if (len == 0) { return; } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; - dy_mf_update_kernel<<>>( container_, d_keys, d_grads, len, sgd, push_grad_value_size_); } +template class HashTable; + +template void HashTable::get< + cudaStream_t>(const unsigned long* d_keys, + paddle::framework::FeatureValue* d_vals, size_t len, + cudaStream_t stream); + +// template void +// HashTable::get( +// const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t +// stream); + +template void HashTable::insert< + cudaStream_t>(const unsigned long* d_keys, + const paddle::framework::FeatureValue* d_vals, size_t len, + cudaStream_t stream); + +// template void HashTable::insert< +// cudaStream_t>(const unsigned long* d_keys, size_t len, char* pool, +// size_t start_index, cudaStream_t stream); + +template void HashTable:: + dump_to_cpu(int devid, cudaStream_t stream); + +template void HashTable::update< + paddle::framework::FeaturePushValue, + Optimizer, + cudaStream_t>(const unsigned long* d_keys, + const paddle::framework::FeaturePushValue* d_grads, + size_t len, Optimizer 
+ sgd, + cudaStream_t stream); + +// template void HashTable::update< +// Optimizer, +// cudaStream_t>(const unsigned long* d_keys, const char* d_grads, size_t +// len, +// Optimizer +// sgd, +// cudaStream_t stream); + +#endif } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps new file mode 100644 index 0000000000000..9d2a20a361e31 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps @@ -0,0 +1,346 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/heter_ps/hashtable.h" + +namespace optimizer_config { +extern _global_ptr_ float* nonclk_coeff; +extern _global_ptr_ float* clk_coeff; + +extern _global_ptr_ float* min_bound; +extern _global_ptr_ float* max_bound; +extern _global_ptr_ float* learning_rate; +extern _global_ptr_ float* initial_g2sum; +extern _global_ptr_ float* initial_range; + +extern _global_ptr_ float* mf_create_thresholds; +extern _global_ptr_ float* mf_learning_rate; +extern _global_ptr_ float* mf_initial_g2sum; +extern _global_ptr_ float* mf_initial_range; +extern _global_ptr_ float* mf_min_bound; +extern _global_ptr_ float* mf_max_bound; +} + +namespace paddle { +namespace framework { + +#if defined(PADDLE_WITH_XPU_KP) + +__device__ void update_lr(float* w, float* g2sum, float g, // NOLINT + float scale) { + __local__ float local_learning_rate; + __local__ float local_initial_g2sum; + __local__ float local_min_bound; + __local__ float local_max_bound; + + GM2LM(optimizer_config::learning_rate, &local_learning_rate, sizeof(float)); + GM2LM(optimizer_config::initial_g2sum, &local_initial_g2sum, sizeof(float)); + GM2LM(optimizer_config::min_bound, &local_min_bound, sizeof(float)); + GM2LM(optimizr_config::max_bound, &local_max_bound, sizeof(float)); + + double add_g2sum = 0; + double ratio = local_learning_rate * + sqrt(local_initial_g2sum / (local_initial_g2sum + g2sum)); + double scaled_grad = g / scale; + + (*w) += scaled_grad * ratio; + + if (w < local_min_bound) w = local_min_bound; + if (w > local_max_bound) w = local_max_bound; + + add_g2sum += scaled_grad * scaled_grad; + + (*g2sum) += add_g2sum; +} + +__device__ void update_mf(int n, float* w, float* g2sum, const float* g, + float scale) { + __local__ float local_mf_learning_rate; + __local__ float local_mf_initial_g2sum; + __local__ float local_mf_min_bound; + __local__ float local_mf_max_bound; + + GM2LM(optimizer_config::mf_learning_rate, &local_mf_learning_rate, + sizeof(float)); + GM2LM(optimizer_config::mf_initial_g2sum, &local_mf_initial_g2sum, + sizeof(float)); + GM2LM(optimizer_config::mf_min_bound, &local_mf_min_bound, sizeof(float)); + GM2LM(optimizer_config::mf_max_bound, &local_mf_max_bound, sizeof(float)); + + double add_g2sum = 0; + double ratio = + local_mf_learning_rate * + sqrt(local_mf_initial_g2sum / 
(local_mf_initial_g2sum + g2sum)); + for (int i = 0; i < n; ++i) { + double scaled_grad = g[i] / scale; + w[i] += scaled_grad * ratio; + + if (w[i] < local_mf_min_bound) w[i] = local_mf_min_bound; + if (w[i] > local_mf_max_bound) w[i] = local_mf_max_bound; + add_g2sum += scaled_grad * scaled_grad; + } + + (*g2sum) += add_g2sum / n; +} + +__device__ float xpu_rand_uniform() { return 0.1; } + +template +__device__ void update_value(ValType* val, const GradType* grad) { // NOLINT + (*val).slot = (*grad).slot; + (*val).show += (*grad).show; + (*val).clk += (*grad).clk; + + __local__ float local_nonclk_coeff; + __local__ float local_clk_coeff; + + __local__ float local_mf_create_thresholds; + __local__ float local_mf_initial_range; + + GM2LM(optimizer_config::nonclk_coeff, &local_nonclk_coeff, sizeof(float)); + GM2LM(optimizer_config::clk_coeff, &local_clk_coeff, sizeof(float)); + GM2LM(optimizer_config::mf_create_thresholds, &local_mf_create_thresholds, + sizeof(float)); + + val.delta_score += local_nonclk_coeff * ((*grad).show - (*grad).clk) + + local_clk_coeff * (*grad).clk; + + update_lr(&(*val).lr, &(*val).lr_g2sum, (*grad).lr_g, (*grad).show); + + if (val.mf_size == 0) { + if (local_mf_create_thresholds <= + local_nonclk_coeff * ((*val).show - (*val).clk) + + local_clk_coeff * (*val).clk) { + val.mf_size = MF_DIM + 1; + val.mf[0] = 0; + + xpu_rand_uniform(&); + for (int i = 0; i < MF_DIM; ++i) { + (*val).mf[i + 1] = (xpu_rand_uniform()) * local_mf_initial_range; + } + } + } else { + update_mf(MF_DIM, &val.mf[1], &val.mf[0], (*grad).mf_g, (*grad).show); + } +} + +template +__global__ void insert_kernel(Table* table, const KeyType* const keys, + const ValType* const vals, size_t len) { + int cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + + const int buf_size = 150; + __local__ KeyType local_keys[buf_size]; + __local__ ValType local_vals[buf_size]; + int len_per_loop = min(buf_size, roundup_div(len, nthreads)); + + for (int i = thread_id * len_per_loop; i < len; + i += nthreads * len_per_loop) { + int read_len = min(len_per_loop, len - i); + GM2LM(keys, local_keys, read_len * sizeof(KeyType)); + GM2LM(vals, local_vals, read_len * sizeof(ValType)); + for (int k = 0; k < read_len; k++) { + // auto status = table->insert(local_keys[k], local_vals[k]); + // assert(status != false && "error: insert fails: table is full"); + } + } +} + +template +__global__ void search_kernel(Table* table, const KeyType* const keys, + ValType* const vals, size_t len) { + int cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + + const int buf_size = 150; + __local__ KeyType local_keys[buf_size]; + __local__ ValType local_vals[buf_size]; + + int len_per_loop = min(buf_size, roundup_div(len, nthreads)); + for (int i = thread_id * len_per_loop; i < len; + i += nthreads * len_per_loop) { + int read_len = min(len_per_loop, len - i); + GM2LM(keys, local_keys, read_len * sizeof(KeyType)); + for (int k = 0; k < read_len; k++) { + // ValType* val = table->find(local_keys[k]); + // if (val != NULL) { + // local_vals[k] = *val; + // } + } + LM2GM(local_vals, vals + i, read_len * sizeof(ValType)); + } +} + +template +__global__ void update_kernel(Table* table, const KeyType* const keys, + const GradType* const grads, size_t len) { + int cid = core_id(); + int ncores = core_num(); + if 
(cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + + const int buf_size = 250; + __local__ KeyType local_keys[buf_size]; + __local__ GradType local_grads[buf_size]; + + int len_per_loop = min(buf_size, roundup_div(len, nthreads)); + for (int i = thread_id * len_per_loop; i < len; + i += nthreads * len_per_loop) { + int read_len = min(len_per_loop, len - i); + + GM2LM(keys, local_keys, read_len * sizeof(KeyType)); + GM2LM(grads, local_grads, read_len * sizeof(GradType)); + + for (int k = 0; k < read_len; k++) { + // ValType* val = table->find(local_keys[k]); + // if (val != NULL) { + // update_value(*val, grads[i]); + //} + } + } +} + +template +HashTable::HashTable(size_t capacity) { + auto tmp_container = XPUCacheArray(capacity); + xpu_malloc(reinterpret_cast(&container_), + sizeof(XPUCacheArray)); + xpu_memcpy(container_, &tmp_container, + sizeof(XPUCacheArray), XPU_HOST_TO_DEVICE); + rwlock_.reset(new phi::RWLock); +} + +template +HashTable::~HashTable() { + xpu_free((void*)container_); +} + +template +void HashTable::show() { + container_->print(); +} + +template +template +void HashTable::get(const KeyType* d_keys, ValType* d_vals, + size_t len, StreamType stream) { + if (len == 0) { + return; + } + search_kernel<<<4, 64, stream>>>(container_, d_keys, d_vals, len); +} + +template +template +void HashTable::get(const KeyType* d_keys, char* d_vals, + size_t len, StreamType stream) { + if (len == 0) { + return; + } + // TODO(zhangminxu): to be implemented +} + +template +template +void HashTable::insert(const KeyType* d_keys, + const ValType* d_vals, size_t len, + StreamType stream) { + if (len == 0) { + return; + } + insert_kernel<<<4, 64, stream>>>(container_, d_keys, d_vals, len); +} + +template +template +void HashTable::dump_to_cpu(int devid, StreamType stream) { + // TODO(zhangminxu): to be implemented +} + +template +template +void HashTable::update(const KeyType* d_keys, + const GradType* d_grads, size_t len, + StreamType stream) { + if (len == 0) { + return; + } + update_kernel<<<4, 64, stream>>>(container_, d_keys, d_grads, len); +} + +template +template +void HashTable::update(const KeyType* d_keys, + const char* d_grads, size_t len, + StreamType stream) { + if (len == 0) { + return; + } + // TODO(zhangminxu): to be implemented +} + +template class HashTable; + +template void HashTable::get< + XPUStream>(const unsigned long* d_keys, + paddle::framework::FeatureValue* d_vals, size_t len, + XPUStream stream); + +// template void +// HashTable::get( +// const unsigned long* d_keys, char* d_vals, size_t len, XPUStream stream); + +template void HashTable::insert< + XPUStream>(const unsigned long* d_keys, + const paddle::framework::FeatureValue* d_vals, size_t len, + XPUStream stream); + +// template void HashTable::insert< +// XPUStream>(const unsigned long* d_keys, size_t len, char* pool, +// size_t start_index, XPUStream stream); + +template void HashTable:: + dump_to_cpu(int devid, XPUStream stream); + +template void HashTable::update< + paddle::framework::FeaturePushValue, XPUStream>( + const unsigned long* d_keys, + const paddle::framework::FeaturePushValue* d_grads, size_t len, + XPUStream stream); + +// template void HashTable::update< +// XPUStream>(const unsigned long* d_keys, const char* d_grads, +// size_t len, XPUStream stream); + +#endif +} // end namespace framework +} // end namespace paddle +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h 
b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 1fca8cdf8bb80..817fd8d38ee06 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -15,39 +15,28 @@ limitations under the License. */ #pragma once #include #include -#include "cub/cub.cuh" -#include "cub/util_allocator.cuh" -#include "hashtable.h" // NOLINT -#include "heter_resource.h" // NOLINT +#if defined(PADDLE_WITH_CUDA) #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" -#include "paddle/fluid/memory/allocation/allocator.h" -#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/dynload/nccl.h" -#include "paddle/fluid/platform/place.h" #include "thrust/pair.h" +#elif defined(PADDLE_WITH_XPU_KP) +#include +#include "paddle/fluid/platform/device/xpu/enforce_xpu.h" +#endif + +#include "paddle/fluid/framework/fleet/heter_ps/hashtable.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { -struct CustomGradMerger { - template - CUB_RUNTIME_FUNCTION __forceinline__ __device__ T - operator()(const T& a, const T& b) const { - T out; - out.slot = a.slot; - out.show = a.show + b.show; - out.clk = a.clk + b.clk; - out.lr_g = a.lr_g + b.lr_g; - for (int i = 0; i < MF_DIM; ++i) { - out.mf_g[i] = a.mf_g[i] + b.mf_g[i]; - } - return out; - } -}; - template class HeterComm { public: @@ -67,10 +56,21 @@ class HeterComm { void show_one_table(int gpu_num); int get_index_by_devid(int devid); +#if defined(PADDLE_WITH_CUDA) template void push_sparse(int num, KeyType* d_keys, GradType* d_grads, size_t len, Sgd& sgd); // NOLINT +#elif defined(PADDLE_WITH_XPU_KP) + void push_sparse(int num, KeyType* d_keys, GradType* d_grads, size_t len); +#endif + + int log2i(int x); + template + void memory_copy(DstPlace dst_place, void* dst, SrcPlace src_place, + const void* src, size_t count, StreamType stream = 0); + +#if defined(PADDLE_WITH_CUDA) template void push_sparse_multi_node(int num, KeyType* d_keys, GradType* d_grads, size_t len, Sgd& sgd); // NOLINT @@ -85,8 +85,6 @@ class HeterComm { int gather_multi_node_grad(int num, KeyType* d_keys, GradType* d_grads, int len); - int log2i(int x); - void set_nccl_comm_and_size(const std::vector& inner_comms, const std::vector& inter_comms, int comm_size) { @@ -101,19 +99,21 @@ class HeterComm { // void dump_to_cpu(int index); - void end_pass(); - int get_transfer_devid(int send_id) { return (send_id + 4) % 8; } +#endif + + void end_pass(); + struct Node { - cudaStream_t in_stream; - cudaStream_t out_stream; + ppStream in_stream; + ppStream out_stream; char* key_storage; char* val_storage; int sync; int key_bytes_len; int val_bytes_len; - int gpu_num; + int dev_num; }; struct Path { @@ -133,7 +133,7 @@ class HeterComm { alloc(size, true); } - void alloc(int size, bool force = false) { + void alloc(size_t size, bool force = false) { if (force || size > all_keys_mem->size()) { all_keys_mem.reset(); all_grads_mem.reset(); @@ -152,7 +152,11 @@ class HeterComm { } } +#if defined(PADDLE_WITH_CUDA) platform::CUDAPlace place_; +#elif defined(PADDLE_WITH_XPU_KP) + platform::XPUPlace place_; +#endif std::shared_ptr all_keys_mem; std::shared_ptr all_grads_mem; KeyType* all_keys; @@ -166,6 
+170,33 @@ class HeterComm { void init_path(); + template + void sync_stream(const StreamType& stream) { +#if defined(PADDLE_WITH_CUDA) + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); +#elif defined(PADDLE_WITH_XPU_KP) + PADDLE_ENFORCE_XPU_SUCCESS(xpu_wait(stream)); +#endif + } + + template + void create_stream(StreamType* stream) { +#if defined(PADDLE_WITH_CUDA) + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(stream)); +#elif defined(PADDLE_WITH_XPU_KP) + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(stream)); +#endif + } + + template + void destroy_stream(StreamType stream) { +#if defined(PADDLE_WITH_CUDA) + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); +#elif defined(PADDLE_WITH_XPU_KP) + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(stream)); +#endif + } + void create_storage(int start_index, int end_index, int keylen, int vallen); void destroy_storage(int start_index, int end_index); void walk_to_dest(int start_index, int gpu_num, int* h_left, int* h_right, @@ -182,15 +213,18 @@ class HeterComm { int block_size_{256}; private: + std::unique_ptr heter_comm_kernel_; std::vector storage_; - CustomGradMerger merger_; int topo_aware_{0}; int feanum_{1800 * 2048}; int multi_node_{0}; + int node_size_; + +#if defined(PADDLE_WITH_CUDA) std::vector nccl_inner_comms_; std::vector nccl_inter_comms_; - int node_size_; std::vector> allocators_; +#endif }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index f85ed330dc8ea..3ced33b490d59 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -13,115 +13,46 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #ifdef PADDLE_WITH_HETERPS -//#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" #include +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h" +#include "paddle/fluid/platform/device_context.h" +#ifdef PADDLE_WITH_XPU_KP +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#endif namespace paddle { namespace framework { -template -__global__ void fill_idx(T* idx, size_t len) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { - idx[i] = i; - } -} - -template -void show_tensor(T* input, size_t len, gpuStream_t stream, std::string name) { - T tmp[len]; // NOLINT - cudaMemcpyAsync(&tmp, input, sizeof(T) * len, cudaMemcpyDeviceToHost, stream); - cudaStreamSynchronize(stream); - std::cout << name; - for (int i = 0; i < len; ++i) { - std::cout << ":" << tmp[i]; - } - std::cout << std::endl; -} - -template -__global__ void calc_shard_offset(T* idx, T* left, T* right, size_t len) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len - 1) { - if (idx[i] != idx[i + 1]) { - right[idx[i]] = i; - left[idx[i + 1]] = i + 1; - } - } - if (i == 0) { - left[idx[i]] = i; - } - if (i == (len - 1)) { - right[idx[i]] = i; - } -} - -template -__global__ void calc_shard_index(KeyType* d_keys, size_t len, T* shard_index, - int total_gpu) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { - shard_index[i] = d_keys[i] % total_gpu; - } -} - -template -__global__ void fill_shard_key(KeyType* d_shard_keys, KeyType* d_keys, T* idx, - size_t len) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { - d_shard_keys[i] = d_keys[idx[i]]; - } -} - -template -__global__ void fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, - GradType* d_shard_grads, GradType* d_grads, - T* idx, size_t len) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { - d_shard_keys[i] = d_keys[idx[i]]; - d_shard_grads[i] = d_grads[idx[i]]; - } -} - -template -__global__ void fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx, - size_t len) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < len) { - d_vals[idx[i]] = d_shard_vals[i]; - } -} - template HeterComm::HeterComm( size_t capacity, std::shared_ptr resource) { resource_ = resource; - storage_.resize(resource_->total_gpu()); - for (int i = 0; i < resource_->total_gpu(); ++i) { + storage_.resize(resource_->total_device()); + for (int i = 0; i < resource_->total_device(); ++i) { +#if defined(PADDLE_WITH_CUDA) platform::CUDADeviceGuard guard(resource_->dev_id(i)); allocators_.push_back(std::make_shared( 8, 1, (unsigned int)-1, (size_t)-1, false, false)); // NOLINT +#endif auto table = new Table(capacity / load_factor_); tables_.push_back(table); if (multi_node_) { storage_[i].init(feanum_, resource_->dev_id(i)); } } + heter_comm_kernel_ = std::make_unique(block_size_); init_path(); } template void HeterComm::init_path() { - int total_gpu = resource_->total_gpu(); - path_.resize(total_gpu); - + int total_device = resource_->total_device(); + path_.resize(total_device); if (!topo_aware_) { VLOG(0) << "init path without topo aware"; - for (int i = 0; i < total_gpu; ++i) { - path_[i].resize(total_gpu); - for (int j = 0; j < total_gpu; ++j) { + for (int i = 0; i < total_device; ++i) { + path_[i].resize(total_device); + for (int j = 0; j < total_device; ++j) { auto& nodes = path_[i][j].nodes_; nodes.resize(1); nodes[0].in_stream = resource_->comm_stream(i, j); @@ -129,17 +60,18 @@ void HeterComm::init_path() { 
nodes[0].key_storage = NULL; nodes[0].val_storage = NULL; nodes[0].sync = 0; - nodes[0].gpu_num = j; + nodes[0].dev_num = j; } } } else { VLOG(0) << "init path with topo aware"; - for (int i = 0; i < total_gpu; ++i) { - path_[i].resize(total_gpu); - for (int j = 0; j < total_gpu; ++j) { + for (int i = 0; i < total_device; ++i) { + path_[i].resize(total_device); + for (int j = 0; j < total_device; ++j) { auto& nodes = path_[i][j].nodes_; int from = resource_->dev_id(i); int to = resource_->dev_id(j); + int transfer_id = i; if (need_transfer(from, to)) { transfer_id = resource_->get_index_by_devid(get_transfer_devid(from)); @@ -150,7 +82,7 @@ void HeterComm::init_path() { node.key_storage = NULL; node.val_storage = NULL; node.sync = 1; - node.gpu_num = transfer_id; + node.dev_num = transfer_id; } nodes.push_back(Node()); Node& node = nodes.back(); @@ -159,148 +91,222 @@ void HeterComm::init_path() { node.key_storage = NULL; node.val_storage = NULL; node.sync = 0; - node.gpu_num = j; + node.dev_num = j; } } } } +template +template +void HeterComm::memory_copy( + DstPlace dst_place, void* dst, SrcPlace src_place, const void* src, + size_t count, StreamType stream) { +#if defined(PADDLE_WITH_CUDA) + cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream); + if (stream == 0) { + cudaStreamSynchronize(0); + } +#elif defined(PADDLE_WITH_XPU_KP) + memory::Copy(dst_place, dst, src_place, src, count); +#endif +} + template void HeterComm::create_storage(int start_index, int end_index, int keylen, int vallen) { +#if defined(PADDLE_WITH_CUDA) auto& allocator = allocators_[start_index]; auto& nodes = path_[start_index][end_index].nodes_; for (size_t i = 0; i < nodes.size(); ++i) { - platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].gpu_num)); + platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].dev_num)); allocator->DeviceAllocate( - resource_->dev_id(nodes[i].gpu_num), + resource_->dev_id(nodes[i].dev_num), (void**)&(nodes[i].key_storage), // NOLINT - keylen, resource_->remote_stream(nodes[i].gpu_num, start_index)); + keylen, resource_->remote_stream(nodes[i].dev_num, start_index)); allocator->DeviceAllocate( - resource_->dev_id(nodes[i].gpu_num), + resource_->dev_id(nodes[i].dev_num), (void**)&(nodes[i].val_storage), // NOLINT - vallen, resource_->remote_stream(nodes[i].gpu_num, start_index)); - + vallen, resource_->remote_stream(nodes[i].dev_num, start_index)); + nodes[i].key_bytes_len = keylen; + nodes[i].val_bytes_len = vallen; + } +#elif defined(PADDLE_WITH_XPU_KP) + auto& nodes = path_[start_index][end_index].nodes_; + for (size_t i = 0; i < nodes.size(); ++i) { + platform::XPUDeviceGuard guard(resource_->dev_id(nodes[i].dev_num)); + auto place = DevPlace(resource_->dev_id(nodes[i].dev_num)); + auto node_keys_mem = memory::Alloc(place, keylen); + nodes[i].key_storage = reinterpret_cast(node_keys_mem->ptr()); + auto node_vals_mem = memory::Alloc(place, vallen); + nodes[i].val_storage = reinterpret_cast(node_vals_mem->ptr()); nodes[i].key_bytes_len = keylen; nodes[i].val_bytes_len = vallen; } +#endif } template void HeterComm::destroy_storage(int start_index, int end_index) { +#if defined(PADDLE_WITH_CUDA) auto& allocator = allocators_[start_index]; auto& nodes = path_[start_index][end_index].nodes_; for (size_t i = 0; i < nodes.size(); ++i) { - platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].gpu_num)); + platform::CUDADeviceGuard guard(resource_->dev_id(nodes[i].dev_num)); - allocator->DeviceFree(resource_->dev_id(nodes[i].gpu_num), + 
allocator->DeviceFree(resource_->dev_id(nodes[i].dev_num), nodes[i].key_storage); - allocator->DeviceFree(resource_->dev_id(nodes[i].gpu_num), + allocator->DeviceFree(resource_->dev_id(nodes[i].dev_num), nodes[i].val_storage); +#endif } } template -void HeterComm::walk_to_dest( - int start_index, int gpu_num, int* h_left, int* h_right, KeyType* src_key, - GradType* src_val) { +void HeterComm::walk_to_dest(int start_index, + int num, int* h_left, + int* h_right, + KeyType* src_key, + GradType* src_val) { int need_copy_val = 0; if (src_val) { need_copy_val = 1; } std::queue que; - for (int i = 0; i < gpu_num; i++) { + for (int i = 0; i < num; i++) { if (h_left[i] == -1 || h_right[i] == -1) { continue; } - int size = path_[start_index][i].nodes_.size(); + // int size = path_[start_index][i].nodes_.size(); auto& node = path_[start_index][i].nodes_[0]; + CopyTask t(&path_[start_index][i], 0); que.push(t); - cudaMemcpyAsync(node.key_storage, - reinterpret_cast(src_key + h_left[i]), - node.key_bytes_len, cudaMemcpyDefault, node.in_stream); + auto src_dev_id = resource_->dev_id(start_index); + auto dst_dev_id = resource_->dev_id(i); + auto src_place = DevPlace(src_dev_id); + auto dst_place = DevPlace(dst_dev_id); + + memory_copy(dst_place, node.key_storage, src_place, + reinterpret_cast(src_key + h_left[i]), + node.key_bytes_len, node.in_stream); if (need_copy_val) { - cudaMemcpyAsync(node.val_storage, - reinterpret_cast(src_val + h_left[i]), - node.val_bytes_len, cudaMemcpyDefault, node.in_stream); + memory_copy(dst_place, node.val_storage, src_place, + reinterpret_cast(src_val + h_left[i]), + node.val_bytes_len, node.in_stream); } } while (!que.empty()) { CopyTask& cur_task = que.front(); que.pop(); if (cur_task.path->nodes_[cur_task.step].sync) { - cudaStreamSynchronize(cur_task.path->nodes_[cur_task.step].in_stream); + sync_stream(cur_task.path->nodes_[cur_task.step].in_stream); } - if (cur_task.step != cur_task.path->nodes_.size() - 1) { + if (static_cast(cur_task.step) != + cur_task.path->nodes_.size() - 1) { int cur_step = cur_task.step; CopyTask c(cur_task.path, cur_step + 1); que.push(c); - cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].key_storage, - cur_task.path->nodes_[cur_step].key_storage, - cur_task.path->nodes_[cur_step + 1].key_bytes_len, - cudaMemcpyDefault, - cur_task.path->nodes_[cur_step + 1].in_stream); + + auto src_dev_id = + resource_->dev_id(cur_task.path->nodes_[cur_step].dev_num); + auto dst_dev_id = + resource_->dev_id(cur_task.path->nodes_[cur_step + 1].dev_num); + auto src_place = DevPlace(src_dev_id); + auto dst_place = DevPlace(dst_dev_id); + + memory_copy(dst_place, cur_task.path->nodes_[cur_step + 1].key_storage, + src_place, cur_task.path->nodes_[cur_step].key_storage, + cur_task.path->nodes_[cur_step + 1].key_bytes_len, + cur_task.path->nodes_[cur_step + 1].in_stream); if (need_copy_val) { - cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].val_storage, - cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step + 1].val_bytes_len, - cudaMemcpyDefault, - cur_task.path->nodes_[cur_step + 1].in_stream); + memory_copy(dst_place, cur_task.path->nodes_[cur_step + 1].val_storage, + src_place, cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step + 1].val_bytes_len, + cur_task.path->nodes_[cur_step + 1].in_stream); } } } } template -void HeterComm::walk_to_src( - int start_index, int gpu_num, int* h_left, int* h_right, ValType* src_val) { +void HeterComm::walk_to_src(int start_index, + int num, int* h_left, + int* 
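// Self-contained sketch of the CopyTask queue driving walk_to_dest(): hop 0 is
// filled from the source buffer, then each finished hop enqueues a copy into
// the next hop's staging buffer until the last node (the destination device)
// is reached. Buffers are plain std::vector<char> here; the real code copies
// between devices on per-hop streams and honors the per-node sync flag.
#include <queue>
#include <vector>

struct SketchHop { std::vector<char> key_storage; bool sync = false; };
struct SketchHopPath { std::vector<SketchHop> nodes_; };
struct SketchCopyTask { SketchHopPath* path; size_t step; };

void sketch_walk_to_dest(const std::vector<char>& src, SketchHopPath* path) {
  std::queue<SketchCopyTask> que;
  path->nodes_[0].key_storage = src;  // stage into hop 0
  que.push({path, 0});
  while (!que.empty()) {
    SketchCopyTask cur = que.front();
    que.pop();
    // In the real code a hop marked sync waits on its in_stream here.
    if (cur.step + 1 < cur.path->nodes_.size()) {
      cur.path->nodes_[cur.step + 1].key_storage =
          cur.path->nodes_[cur.step].key_storage;  // forward to next hop
      que.push({cur.path, cur.step + 1});
    }
  }
}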
h_right, + ValType* src_val) { std::queue que; - for (int i = 0; i < gpu_num; i++) { + + for (int i = 0; i < num; i++) { if (h_left[i] == -1 || h_right[i] == -1) { continue; } int cur_step = path_[start_index][i].nodes_.size() - 1; auto& node = path_[start_index][i].nodes_[cur_step]; + + auto src_dev_id = resource_->dev_id(i); + auto src_place = DevPlace(src_dev_id); + if (cur_step == 0) { - cudaMemcpyAsync(reinterpret_cast(src_val + h_left[i]), - node.val_storage, node.val_bytes_len, cudaMemcpyDefault, - node.out_stream); + auto dst_dev_id = resource_->dev_id(start_index); + auto dst_place = DevPlace(dst_dev_id); + memory_copy(dst_place, reinterpret_cast(src_val + h_left[i]), + src_place, node.val_storage, node.val_bytes_len, + node.out_stream); } else { CopyTask t(&path_[start_index][i], cur_step - 1); que.push(t); - cudaMemcpyAsync(path_[start_index][i].nodes_[cur_step - 1].val_storage, - node.val_storage, - path_[start_index][i].nodes_[cur_step - 1].val_bytes_len, - cudaMemcpyDefault, - path_[start_index][i].nodes_[cur_step - 1].out_stream); + + auto dst_dev_id = + resource_->dev_id(path_[start_index][i].nodes_[cur_step - 1].dev_num); + auto dst_place = DevPlace(dst_dev_id); + + memory_copy(dst_place, + path_[start_index][i].nodes_[cur_step - 1].val_storage, + src_place, node.val_storage, + path_[start_index][i].nodes_[cur_step - 1].val_bytes_len, + path_[start_index][i].nodes_[cur_step - 1].out_stream); } } + while (!que.empty()) { CopyTask& cur_task = que.front(); que.pop(); int cur_step = cur_task.step; if (cur_task.path->nodes_[cur_step].sync) { - cudaStreamSynchronize(cur_task.path->nodes_[cur_step].out_stream); + sync_stream(cur_task.path->nodes_[cur_step].out_stream); } + + auto src_dev_id = + resource_->dev_id(cur_task.path->nodes_[cur_step].dev_num); + auto src_place = DevPlace(src_dev_id); + if (cur_step > 0) { CopyTask c(cur_task.path, cur_step - 1); que.push(c); - cudaMemcpyAsync(cur_task.path->nodes_[cur_step - 1].val_storage, - cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step - 1].val_bytes_len, - cudaMemcpyDefault, - cur_task.path->nodes_[cur_step - 1].out_stream); + + auto dst_dev_id = + resource_->dev_id(cur_task.path->nodes_[cur_step - 1].dev_num); + auto dst_place = DevPlace(dst_dev_id); + + memory_copy(dst_place, cur_task.path->nodes_[cur_step - 1].val_storage, + src_place, cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step - 1].val_bytes_len, + cur_task.path->nodes_[cur_step - 1].out_stream); + } else if (cur_step == 0) { - int end_index = cur_task.path->nodes_.back().gpu_num; - cudaMemcpyAsync(reinterpret_cast(src_val + h_left[end_index]), - cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step].val_bytes_len, - cudaMemcpyDefault, - cur_task.path->nodes_[cur_step].out_stream); + int end_index = cur_task.path->nodes_.back().dev_num; + + auto dst_dev_id = resource_->dev_id(end_index); + auto dst_place = DevPlace(dst_dev_id); + + memory_copy(dst_place, + reinterpret_cast(src_val + h_left[end_index]), + src_place, cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step].val_bytes_len, + cur_task.path->nodes_[cur_step].out_stream); } } } @@ -314,8 +320,8 @@ HeterComm::~HeterComm() { } template -void HeterComm::show_one_table(int gpu_num) { - tables_[gpu_num]->show(); +void HeterComm::show_one_table(int num) { + tables_[num]->show(); } template @@ -333,24 +339,22 @@ int HeterComm::get_index_by_devid(int devid) { } template -void HeterComm::build_ps(int num, KeyType* 
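// The helpers sync_stream()/create_stream()/destroy_stream() used throughout
// this file are not defined in this hunk. A plausible shape for them, assuming
// the cudaStream_* API on the CUDA side and the xpu_stream_create /
// xpu_stream_destroy calls this patch already uses on the XPU side, is shown
// below; xpu_wait() as the sync call is an assumption, and error checking is
// omitted for brevity.
#if defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
using sketch_stream_t = cudaStream_t;
inline void sketch_create_stream(sketch_stream_t* s) { cudaStreamCreate(s); }
inline void sketch_sync_stream(sketch_stream_t s) { cudaStreamSynchronize(s); }
inline void sketch_destroy_stream(sketch_stream_t s) { cudaStreamDestroy(s); }
#elif defined(PADDLE_WITH_XPU_KP)
using sketch_stream_t = XPUStream;
inline void sketch_create_stream(sketch_stream_t* s) { xpu_stream_create(s); }
inline void sketch_sync_stream(sketch_stream_t s) { xpu_wait(s); }  // assumed API
inline void sketch_destroy_stream(sketch_stream_t s) { xpu_stream_destroy(s); }
#endif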
h_keys, - ValType* h_vals, - size_t len, - size_t chunk_size, - int stream_num) { +void HeterComm::build_ps( + int dev_num, KeyType* h_keys, ValType* h_vals, size_t len, + size_t chunk_size, int stream_num) { if (len <= 0) { return; } - int dev_id = resource_->dev_id(num); - platform::CUDAPlace place = platform::CUDAPlace(dev_id); - platform::CUDADeviceGuard guard(dev_id); + int dev_id = resource_->dev_id(dev_num); std::vector d_key_bufs; std::vector d_val_bufs; - gpuStream_t streams[stream_num]; // NOLINT + DevPlace place = DevPlace(dev_id); + AnyDeviceGuard guard(dev_id); + ppStream streams[stream_num]; // NOLINT for (int i = 0; i < stream_num; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&(streams[i]))); + create_stream(&(streams[i])); auto d_k_buf = memory::Alloc(place, chunk_size * sizeof(KeyType)); auto d_v_buf = memory::Alloc(place, chunk_size * sizeof(ValType)); d_key_bufs.push_back(std::move(d_k_buf)); @@ -360,39 +364,48 @@ void HeterComm::build_ps(int num, KeyType* h_keys, int cur_len = 0; int cur_stream = 0; - while (cur_len < len) { + while (static_cast(cur_len) < len) { cur_stream = cur_stream % stream_num; + auto cur_use_stream = streams[cur_stream]; +#if defined(PADDLE_WITH_XPU_KP) + cur_use_stream = 0; +#endif + int tmp_len = cur_len + chunk_size > len ? len - cur_len : chunk_size; - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpyAsync(d_key_bufs[cur_stream]->ptr(), h_keys + cur_len, - sizeof(KeyType) * tmp_len, cudaMemcpyHostToDevice, - streams[cur_stream])); - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpyAsync(d_val_bufs[cur_stream]->ptr(), h_vals + cur_len, - sizeof(ValType) * tmp_len, cudaMemcpyHostToDevice, - streams[cur_stream])); - tables_[num]->insert( + + auto dst_place = place; + auto src_place = platform::CPUPlace(); + + memory_copy( + dst_place, reinterpret_cast(d_key_bufs[cur_stream]->ptr()), + src_place, h_keys + cur_len, sizeof(KeyType) * tmp_len, cur_use_stream); + memory_copy( + dst_place, reinterpret_cast(d_val_bufs[cur_stream]->ptr()), + src_place, h_vals + cur_len, sizeof(ValType) * tmp_len, cur_use_stream); + + tables_[dev_num]->insert( reinterpret_cast(d_key_bufs[cur_stream]->ptr()), reinterpret_cast(d_val_bufs[cur_stream]->ptr()), tmp_len, - streams[cur_stream]); + cur_use_stream); + cur_stream += 1; cur_len += tmp_len; } - for (int i = 0; i < stream_num; ++i) { - cudaStreamSynchronize(streams[i]); - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams[i])); + sync_stream(streams[i]); + destroy_stream(streams[i]); } } template void HeterComm::merge_grad( - int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, + int dev_num, KeyType* d_keys, GradType* d_grads, size_t len, int& uniq_len) { // NOLINT - int dev_id = resource_->dev_id(gpu_num); - platform::CUDAPlace place = platform::CUDAPlace(dev_id); - platform::CUDADeviceGuard guard(dev_id); - auto stream = resource_->local_stream(gpu_num, 0); + + int dev_id = resource_->dev_id(dev_num); + DevPlace place = DevPlace(dev_id); + AnyDeviceGuard guard(dev_id); + auto stream = resource_->local_stream(dev_num, 0); size_t temp_storage_bytes; @@ -403,48 +416,50 @@ void HeterComm::merge_grad( GradType* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_grads, - d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false)); + heter_comm_kernel_->sort_pairs(NULL, temp_storage_bytes, d_keys, + d_merge_keys_ptr, d_grads, d_merge_grads_ptr, + len, 0, 8 * sizeof(KeyType), stream, false); - 
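// Host-side analogue of the chunked copy-and-insert loop in build_ps(): the
// input is split into chunk_size pieces that rotate round-robin over
// stream_num staging buffers, so the copy of chunk k+1 can overlap with the
// insert of chunk k on the device. Here the "insert" is a plain std::map
// update; sketch_ names are illustrative.
#include <algorithm>
#include <cstdint>
#include <map>
#include <vector>

void sketch_build_ps(const std::vector<uint64_t>& h_keys,
                     const std::vector<float>& h_vals, size_t chunk_size,
                     int stream_num, std::map<uint64_t, float>* table) {
  std::vector<std::vector<uint64_t>> key_bufs(stream_num);
  std::vector<std::vector<float>> val_bufs(stream_num);
  size_t cur_len = 0;
  int cur_stream = 0;
  while (cur_len < h_keys.size()) {
    cur_stream = cur_stream % stream_num;
    size_t tmp_len = std::min(chunk_size, h_keys.size() - cur_len);
    key_bufs[cur_stream].assign(h_keys.begin() + cur_len,
                                h_keys.begin() + cur_len + tmp_len);
    val_bufs[cur_stream].assign(h_vals.begin() + cur_len,
                                h_vals.begin() + cur_len + tmp_len);
    for (size_t k = 0; k < tmp_len; ++k) {  // stands in for tables_[...]->insert
      (*table)[key_bufs[cur_stream][k]] = val_bufs[cur_stream][k];
    }
    cur_stream += 1;
    cur_len += tmp_len;
  }
}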
void* d_buff = NULL; auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + heter_comm_kernel_->sort_pairs( d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, - d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false)); + d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false); temp_storage_bytes = 0; auto d_num_runs_out_mem = memory::Alloc(place, sizeof(int)); int* d_num_runs_out = reinterpret_cast(d_num_runs_out_mem->ptr()); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::ReduceByKey( - NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_merge_grads_ptr, - d_grads, d_num_runs_out, merger_, len, stream, false)); + heter_comm_kernel_->reduce_by_key(NULL, temp_storage_bytes, d_merge_keys_ptr, + d_keys, d_merge_grads_ptr, d_grads, + d_num_runs_out, len, stream, false); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::ReduceByKey( + heter_comm_kernel_->reduce_by_key( d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, - d_merge_grads_ptr, d_grads, d_num_runs_out, merger_, len, stream, false)); + d_merge_grads_ptr, d_grads, d_num_runs_out, len, stream, false); - cudaMemcpyAsync(&uniq_len, d_num_runs_out, sizeof(int), - cudaMemcpyDeviceToHost, stream); - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); + auto dst_place = platform::CPUPlace(); + auto src_place = place; + memory_copy(dst_place, &uniq_len, src_place, d_num_runs_out, sizeof(int), + stream); + + sync_stream(stream); } template void HeterComm::split_input_to_shard( KeyType* d_keys, int* d_idx_ptr, size_t len, int* left, int* right, - int gpu_num) { - int total_gpu = resource_->total_gpu(); - int dev_id = resource_->dev_id(gpu_num); - platform::CUDAPlace place = platform::CUDAPlace(dev_id); - platform::CUDADeviceGuard guard(dev_id); - auto stream = resource_->local_stream(gpu_num, 0); + int dev_num) { + int total_device = resource_->total_device(); + int dev_id = resource_->dev_id(dev_num); + DevPlace place = DevPlace(dev_id); + AnyDeviceGuard guard(dev_id); + auto stream = resource_->local_stream(dev_num, 0); auto d_idx_tmp = memory::Alloc(place, len * sizeof(int)); int* d_idx_tmp_ptr = reinterpret_cast(d_idx_tmp->ptr()); @@ -455,24 +470,28 @@ void HeterComm::split_input_to_shard( auto d_shard_index_tmp = memory::Alloc(place, len * sizeof(int)); int* d_shard_index_tmp_ptr = reinterpret_cast(d_shard_index_tmp->ptr()); - int grid_size = (len - 1) / block_size_ + 1; - fill_idx<<>>(d_idx_tmp_ptr, len); - calc_shard_index<<>>( - d_keys, len, d_shard_index_tmp_ptr, total_gpu); + // int grid_size = (len - 1) / block_size_ + 1; + + heter_comm_kernel_->fill_idx(d_idx_tmp_ptr, len, stream); + heter_comm_kernel_->calc_shard_index(d_keys, len, d_shard_index_tmp_ptr, + total_device, stream); size_t temp_storage_bytes; - const int num_bits = 1 + log2i(total_gpu); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + const int num_bits = 1 + log2i(total_device); + + heter_comm_kernel_->sort_pairs( NULL, temp_storage_bytes, d_shard_index_tmp_ptr, d_shard_index_ptr, - d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream)); + d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + + heter_comm_kernel_->sort_pairs( d_temp_storage->ptr(), 
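// CPU analogue of merge_grad(): sort (key, grad) pairs by key, then do a
// reduce-by-key pass that accumulates gradients of duplicate keys, mirroring
// the SortPairs + ReduceByKey sequence wrapped by heter_comm_kernel_. The real
// merger adds the show/clk/lr_g/mf_g fields of the gradient struct; a float
// gradient is used here only to keep the sketch short.
#include <algorithm>
#include <cstdint>
#include <vector>

int sketch_merge_grad(std::vector<uint64_t>* keys, std::vector<float>* grads) {
  std::vector<size_t> order(keys->size());
  for (size_t i = 0; i < order.size(); ++i) order[i] = i;
  std::stable_sort(order.begin(), order.end(),
                   [&](size_t a, size_t b) { return (*keys)[a] < (*keys)[b]; });
  std::vector<uint64_t> out_keys;
  std::vector<float> out_grads;
  for (size_t i : order) {
    if (!out_keys.empty() && out_keys.back() == (*keys)[i]) {
      out_grads.back() += (*grads)[i];  // same key: accumulate the gradient
    } else {
      out_keys.push_back((*keys)[i]);
      out_grads.push_back((*grads)[i]);
    }
  }
  keys->swap(out_keys);
  grads->swap(out_grads);
  return static_cast<int>(keys->size());  // uniq_len
}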
temp_storage_bytes, d_shard_index_tmp_ptr, - d_shard_index_ptr, d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream)); - calc_shard_offset<<>>(d_shard_index_ptr, - left, right, len); - cudaStreamSynchronize(stream); + d_shard_index_ptr, d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream); + + heter_comm_kernel_->calc_shard_offset(d_shard_index_ptr, left, right, len, + total_device, stream); + sync_stream(stream); } template @@ -484,25 +503,43 @@ void HeterComm::pull_sparse(int num, return; } - int total_gpu = resource_->total_gpu(); + int total_device = resource_->total_device(); int dev_id = resource_->dev_id(num); - platform::CUDAPlace place = platform::CUDAPlace(dev_id); - platform::CUDADeviceGuard guard(dev_id); + DevPlace place = DevPlace(dev_id); + AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(num, 0); - int grid_size = (len - 1) / block_size_ + 1; + // int grid_size = (len - 1) / block_size_ + 1; - int h_left[total_gpu]; // NOLINT - int h_right[total_gpu]; // NOLINT + int h_left[total_device]; // NOLINT + int h_right[total_device]; // NOLINT - auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); - auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + auto d_left = memory::Alloc(place, total_device * sizeof(int)); + auto d_right = memory::Alloc(place, total_device * sizeof(int)); int* d_left_ptr = reinterpret_cast(d_left->ptr()); int* d_right_ptr = reinterpret_cast(d_right->ptr()); - cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream); - cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream); - // +#if defined(PADDLE_WITH_CUDA) + cudaMemsetAsync(d_left_ptr, -1, total_device * sizeof(int), stream); + cudaMemsetAsync(d_right_ptr, -1, total_device * sizeof(int), stream); + +#elif defined(PADDLE_WITH_XPU_KP) + // get XPUDeviceContext according to xpu place + paddle::platform::XPUDeviceContext xpu_dev_ctx(place); + auto xpu_context = xpu_dev_ctx.x_context(); + + int r = xpu::constant(xpu_context, d_left_ptr, total_device, -1); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU constant kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); + int r2 = xpu::constant(xpu_context, d_right_ptr, total_device, -1); + PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, + platform::errors::External( + "XPU constant kernel return wrong value[%d %s]", r2, + XPUAPIErrorMsg[r2])); +#endif + auto d_idx = memory::Alloc(place, len * sizeof(int)); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); @@ -513,17 +550,20 @@ void HeterComm::pull_sparse(int num, split_input_to_shard(d_keys, d_idx_ptr, len, d_left_ptr, d_right_ptr, num); - fill_shard_key<<>>(d_shard_keys_ptr, - d_keys, d_idx_ptr, len); + heter_comm_kernel_->fill_shard_key(d_shard_keys_ptr, d_keys, d_idx_ptr, len, + stream); - cudaStreamSynchronize(stream); + sync_stream(stream); - cudaMemcpy(h_left, d_left_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost); - cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost); + auto dst_place = platform::CPUPlace(); + auto src_place = place; - for (int i = 0; i < total_gpu; ++i) { + memory_copy(dst_place, h_left, src_place, d_left_ptr, + total_device * sizeof(int), stream); + memory_copy(dst_place, h_right, src_place, d_right_ptr, + total_device * sizeof(int), stream); + + for (int i = 0; i < total_device; ++i) { int shard_len = h_right[i] - h_left[i] + 1; if (shard_len == 0) { continue; @@ -532,47 +572,53 @@ void HeterComm::pull_sparse(int num, shard_len * sizeof(ValType)); } - walk_to_dest(num, 
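// CPU analogue of split_input_to_shard(): the shard id is key % total_device,
// the index array is stably sorted by shard id, and left/right record the
// first and last position of each shard in the sorted order. A value of -1
// means the shard received no keys, which is exactly what the callers test
// through h_left/h_right.
#include <algorithm>
#include <cstdint>
#include <vector>

void sketch_split_to_shard(const std::vector<uint64_t>& keys, int total_device,
                           std::vector<int>* idx, std::vector<int>* left,
                           std::vector<int>* right) {
  idx->resize(keys.size());
  for (size_t i = 0; i < keys.size(); ++i) (*idx)[i] = static_cast<int>(i);
  std::stable_sort(idx->begin(), idx->end(), [&](int a, int b) {
    return keys[a] % total_device < keys[b] % total_device;
  });
  left->assign(total_device, -1);
  right->assign(total_device, -1);
  for (size_t i = 0; i < idx->size(); ++i) {
    int shard = static_cast<int>(keys[(*idx)[i]] % total_device);
    if ((*left)[shard] == -1) (*left)[shard] = static_cast<int>(i);
    (*right)[shard] = static_cast<int>(i);
  }
}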
total_gpu, h_left, h_right, d_shard_keys_ptr, NULL); + walk_to_dest(num, total_device, h_left, h_right, d_shard_keys_ptr, NULL); - for (int i = 0; i < total_gpu; ++i) { + for (int i = 0; i < total_device; ++i) { if (h_left[i] == -1) { continue; } auto& node = path_[num][i].nodes_.back(); - cudaStreamSynchronize(node.in_stream); - platform::CUDADeviceGuard guard(resource_->dev_id(i)); + sync_stream(node.in_stream); + + AnyDeviceGuard guard(resource_->dev_id(i)); + tables_[i]->rwlock_->RDLock(); tables_[i]->get(reinterpret_cast(node.key_storage), reinterpret_cast(node.val_storage), h_right[i] - h_left[i] + 1, resource_->remote_stream(i, num)); } - for (int i = 0; i < total_gpu; ++i) { - cudaStreamSynchronize(resource_->remote_stream(i, num)); + + for (int i = 0; i < total_device; ++i) { + sync_stream(resource_->remote_stream(i, num)); if (h_left[i] == -1) { continue; } tables_[i]->rwlock_->UNLock(); } - walk_to_src(num, total_gpu, h_left, h_right, d_shard_vals_ptr); + walk_to_src(num, total_device, h_left, h_right, d_shard_vals_ptr); - for (int i = 0; i < total_gpu; ++i) { + for (int i = 0; i < total_device; ++i) { auto& node = path_[num][i].nodes_.front(); - cudaStreamSynchronize(node.out_stream); + sync_stream(node.out_stream); } - fill_dvals<<>>(d_shard_vals_ptr, d_vals, - d_idx_ptr, len); - cudaStreamSynchronize(stream); - for (int i = 0; i < total_gpu; ++i) { + heter_comm_kernel_->fill_dvals(d_shard_vals_ptr, d_vals, d_idx_ptr, len, + stream); + + sync_stream(stream); + + for (int i = 0; i < total_device; ++i) { destroy_storage(num, i); } } +#if defined(PADDLE_WITH_CUDA) template template -void HeterComm::push_sparse(int gpu_num, +void HeterComm::push_sparse(int dev_num, KeyType* d_keys, GradType* d_grads, size_t len, @@ -581,23 +627,42 @@ void HeterComm::push_sparse(int gpu_num, return; } - int total_gpu = resource_->total_gpu(); - int dev_id = resource_->dev_id(gpu_num); - platform::CUDAPlace place = platform::CUDAPlace(dev_id); - platform::CUDADeviceGuard guard(dev_id); - auto stream = resource_->local_stream(gpu_num, 0); + int total_device = resource_->total_device(); + int dev_id = resource_->dev_id(dev_num); - int h_left[total_gpu]; // NOLINT - int h_right[total_gpu]; // NOLINT + DevPlace place = DevPlace(dev_id); + AnyDeviceGuard guard(dev_id); + auto stream = resource_->local_stream(dev_num, 0); - auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); - auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + int h_left[total_device]; // NOLINT + int h_right[total_device]; // NOLINT + + auto d_left = memory::Alloc(place, total_device * sizeof(int)); + auto d_right = memory::Alloc(place, total_device * sizeof(int)); int* d_left_ptr = reinterpret_cast(d_left->ptr()); int* d_right_ptr = reinterpret_cast(d_right->ptr()); - cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream); - cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream); - // +#if defined(PADDLE_WITH_CUDA) + cudaMemsetAsync(d_left_ptr, -1, total_device * sizeof(int), stream); + cudaMemsetAsync(d_right_ptr, -1, total_device * sizeof(int), stream); + +#elif defined(PADDLE_WITH_XPU_KP) + // get XPUDeviceContext according to xpu place + paddle::platform::XPUDeviceContext xpu_dev_ctx(place); + auto xpu_context = xpu_dev_ctx.x_context(); + + int r = xpu::constant(xpu_context, d_left_ptr, total_device, -1); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU constant kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); + int r2 = xpu::constant(xpu_context, 
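// The two permutation kernels used by pull_sparse() are inverses of each
// other: fill_shard_key gathers keys into shard order through idx, and
// fill_dvals scatters the values that come back in shard order to their
// original slots. CPU sketch of both (sketch_ names are illustrative):
#include <cstdint>
#include <vector>

void sketch_fill_shard_key(const std::vector<uint64_t>& keys,
                           const std::vector<int>& idx,
                           std::vector<uint64_t>* shard_keys) {
  shard_keys->resize(keys.size());
  for (size_t i = 0; i < keys.size(); ++i) {
    (*shard_keys)[i] = keys[idx[i]];  // gather: shard order <- user order
  }
}

void sketch_fill_dvals(const std::vector<float>& shard_vals,
                       const std::vector<int>& idx, std::vector<float>* vals) {
  vals->resize(shard_vals.size());
  for (size_t i = 0; i < shard_vals.size(); ++i) {
    (*vals)[idx[i]] = shard_vals[i];  // scatter: user order <- shard order
  }
}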
d_right_ptr, total_device, -1); + PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, + platform::errors::External( + "XPU constant kernel return wrong value[%d %s]", r2, + XPUAPIErrorMsg[r2])); +#endif + auto d_idx = memory::Alloc(place, len * sizeof(int)); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); @@ -608,61 +673,183 @@ void HeterComm::push_sparse(int gpu_num, reinterpret_cast(d_shard_grads->ptr()); int uniq_len = len; - merge_grad(gpu_num, d_keys, d_grads, len, uniq_len); + merge_grad(dev_num, d_keys, d_grads, len, uniq_len); - int grid_size = (uniq_len - 1) / block_size_ + 1; + // int grid_size = (uniq_len - 1) / block_size_ + 1; split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, - gpu_num); + dev_num); - fill_shard_grads<<>>( - d_shard_keys_ptr, d_keys, d_shard_grads_ptr, d_grads, d_idx_ptr, - uniq_len); + heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, d_keys, + d_shard_grads_ptr, d_grads, d_idx_ptr, + uniq_len, stream); - cudaStreamSynchronize(stream); + sync_stream(stream); - cudaMemcpy(h_left, d_left_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost); - cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost); + auto dst_place = platform::CPUPlace(); + auto src_place = place; + memory_copy(dst_place, h_left, src_place, d_left_ptr, + total_device * sizeof(int), stream); + memory_copy(dst_place, h_right, src_place, d_right_ptr, + total_device * sizeof(int), stream); - for (int i = 0; i < total_gpu; ++i) { + for (int i = 0; i < total_device; ++i) { int shard_len = h_right[i] - h_left[i] + 1; if (h_left[i] == -1 || h_right[i] == -1) { continue; } - create_storage(gpu_num, i, shard_len * sizeof(KeyType), + create_storage(dev_num, i, shard_len * sizeof(KeyType), shard_len * sizeof(GradType)); } - walk_to_dest(gpu_num, total_gpu, h_left, h_right, d_shard_keys_ptr, + walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr, d_shard_grads_ptr); - for (int i = 0; i < total_gpu; ++i) { + for (int i = 0; i < total_device; ++i) { if (h_left[i] == -1 || h_right[i] == -1) { continue; } - auto& node = path_[gpu_num][i].nodes_.back(); - cudaStreamSynchronize(node.in_stream); + auto& node = path_[dev_num][i].nodes_.back(); + sync_stream(node.in_stream); - platform::CUDADeviceGuard guard(resource_->dev_id(i)); + AnyDeviceGuard guard(resource_->dev_id(i)); tables_[i]->rwlock_->WRLock(); tables_[i]->update(reinterpret_cast(node.key_storage), reinterpret_cast(node.val_storage), h_right[i] - h_left[i] + 1, sgd, - resource_->remote_stream(i, gpu_num)); + resource_->remote_stream(i, dev_num)); } - for (int i = 0; i < total_gpu; ++i) { - cudaStreamSynchronize(resource_->remote_stream(i, gpu_num)); + + for (int i = 0; i < total_device; ++i) { + sync_stream(resource_->remote_stream(i, dev_num)); if (h_left[i] != -1) { tables_[i]->rwlock_->UNLock(); } } - for (int i = 0; i < total_gpu; ++i) { - destroy_storage(gpu_num, i); + + for (int i = 0; i < total_device; ++i) { + destroy_storage(dev_num, i); } } +#elif defined(PADDLE_WITH_XPU_KP) +template +void HeterComm::push_sparse(int dev_num, + KeyType* d_keys, + GradType* d_grads, + size_t len) { + if (len == 0) { + return; + } + + int total_device = resource_->total_device(); + int dev_id = resource_->dev_id(dev_num); + + DevPlace place = DevPlace(dev_id); + AnyDeviceGuard guard(dev_id); + auto stream = resource_->local_stream(dev_num, 0); + + int h_left[total_device]; // NOLINT + int h_right[total_device]; // NOLINT + + auto d_left = memory::Alloc(place, total_device * sizeof(int)); + auto d_right 
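// Why cudaMemsetAsync(ptr, -1, n * sizeof(int), stream) is a valid way to fill
// an int array with -1: memset writes the byte 0xFF into every byte, and an
// int whose bytes are all 0xFF reads back as -1 in two's complement. The XPU
// branch gets the same result with a typed fill (xpu::constant). Host-side
// demonstration:
#include <cassert>
#include <cstring>

int main() {
  int h_left[4];
  std::memset(h_left, -1, sizeof(h_left));  // every byte set to 0xFF
  for (int v : h_left) assert(v == -1);     // reads back as int -1
  return 0;
}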
= memory::Alloc(place, total_device * sizeof(int)); + int* d_left_ptr = reinterpret_cast(d_left->ptr()); + int* d_right_ptr = reinterpret_cast(d_right->ptr()); + +#if defined(PADDLE_WITH_CUDA) + cudaMemsetAsync(d_left_ptr, -1, total_device * sizeof(int), stream); + cudaMemsetAsync(d_right_ptr, -1, total_device * sizeof(int), stream); + +#elif defined(PADDLE_WITH_XPU_KP) + // get XPUDeviceContext according to xpu place + paddle::platform::XPUDeviceContext xpu_dev_ctx(place); + auto xpu_context = xpu_dev_ctx.x_context(); + + int r = xpu::constant(xpu_context, d_left_ptr, total_device, -1); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU constant kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); + int r2 = xpu::constant(xpu_context, d_right_ptr, total_device, -1); + PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, + platform::errors::External( + "XPU constant kernel return wrong value[%d %s]", r2, + XPUAPIErrorMsg[r2])); +#endif + + auto d_idx = memory::Alloc(place, len * sizeof(int)); + int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); + + auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); + KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); + auto d_shard_grads = memory::Alloc(place, len * sizeof(GradType)); + GradType* d_shard_grads_ptr = + reinterpret_cast(d_shard_grads->ptr()); + + int uniq_len = len; + merge_grad(dev_num, d_keys, d_grads, len, uniq_len); + + // int grid_size = (uniq_len - 1) / block_size_ + 1; + + split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, + dev_num); + + heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, d_keys, + d_shard_grads_ptr, d_grads, d_idx_ptr, + (long long)uniq_len, stream); + + sync_stream(stream); + + auto dst_place = platform::CPUPlace(); + auto src_place = place; + memory_copy(dst_place, h_left, src_place, d_left_ptr, + total_device * sizeof(int)); + memory_copy(dst_place, h_right, src_place, d_right_ptr, + total_device * sizeof(int)); + + for (int i = 0; i < total_device; ++i) { + int shard_len = h_right[i] - h_left[i] + 1; + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + create_storage(dev_num, i, shard_len * sizeof(KeyType), + shard_len * sizeof(GradType)); + } + + walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr, + d_shard_grads_ptr); + + for (int i = 0; i < total_device; ++i) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + auto& node = path_[dev_num][i].nodes_.back(); + sync_stream(node.in_stream); + + AnyDeviceGuard guard(resource_->dev_id(i)); + tables_[i]->rwlock_->WRLock(); + tables_[i]->update(reinterpret_cast(node.key_storage), + reinterpret_cast(node.val_storage), + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, dev_num)); + } + + for (int i = 0; i < total_device; ++i) { + sync_stream(resource_->remote_stream(i, dev_num)); + if (h_left[i] != -1) { + tables_[i]->rwlock_->UNLock(); + } + } + + for (int i = 0; i < total_device; ++i) { + destroy_storage(dev_num, i); + } +} + +#endif + +#if defined(PADDLE_WITH_CUDA) template template void HeterComm::update_one_table( @@ -705,7 +892,7 @@ void HeterComm::push_sparse_multi_node( template int HeterComm::gather_one_node_grad( int gpu_num, KeyType* d_keys, GradType* d_grads, int len) { - int total_gpu = resource_->total_gpu(); + int total_gpu = resource_->total_device(); int dev_id = resource_->dev_id(gpu_num); auto& storage = storage_[gpu_num]; platform::CUDAPlace place = platform::CUDAPlace(dev_id); @@ -725,10 +912,10 @@ int 
HeterComm::gather_one_node_grad( // allgather grad len PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclAllGather((const void*)(d_node_len + gpu_num), - (void*)d_node_len, 1, ncclInt, // NOLINT - nccl_inner_comm, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( + (const void*)(d_node_len + gpu_num), (void*)d_node_len, 1, // NOLINT + ncclInt, // NOLINT + nccl_inner_comm, stream)); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); cudaMemcpy(h_node_len, d_node_len, sizeof(int) * total_gpu, @@ -775,11 +962,12 @@ int HeterComm::gather_one_node_grad( cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost); - int grid_size = (h_node_len[i] - 1) / block_size_ + 1; - fill_shard_grads<<>>( + // int grid_size = (h_node_len[i] - 1) / block_size_ + 1; + heter_comm_kernel_->fill_shard_grads( storage.local_keys + merge_num, storage.all_keys + index, storage.local_grads + merge_num, storage.all_grads + index, - d_idx_ptr + h_left[gpu_num], h_right[gpu_num] - h_left[gpu_num] + 1); + d_idx_ptr + h_left[gpu_num], h_right[gpu_num] - h_left[gpu_num] + 1, + stream); merge_num = merge_num + h_right[gpu_num] - h_left[gpu_num] + 1; } @@ -848,19 +1036,21 @@ int HeterComm::gather_multi_node_grad( return ret; } +#endif + template void HeterComm::end_pass() { - int total_gpu = resource_->total_gpu(); + int total_device = resource_->total_device(); std::vector threads; auto dump_to_cpu_func = [this](int index) { auto stream = resource_->local_stream(index, 0); int dev_id = resource_->dev_id(index); - platform::CUDADeviceGuard guard(dev_id); + AnyDeviceGuard guard(dev_id); tables_[index]->dump_to_cpu(dev_id, stream); }; - for (int i = 0; i < total_gpu; ++i) { + for (int i = 0; i < total_device; ++i) { threads.push_back(std::thread(dump_to_cpu_func, i)); } for (auto& t : threads) { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu new file mode 100644 index 0000000000000..694bdb8d563f5 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -0,0 +1,269 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h" + +namespace paddle { +namespace framework { + +#ifdef PADDLE_WITH_CUDA + +struct GPUCustomGradMerger { + template + CUB_RUNTIME_FUNCTION __forceinline__ __device__ T + operator()(const T& a, const T& b) const { + T out; + out.slot = a.slot; + out.show = a.show + b.show; + out.clk = a.clk + b.clk; + out.lr_g = a.lr_g + b.lr_g; + for (int i = 0; i < MF_DIM; ++i) { + out.mf_g[i] = a.mf_g[i] + b.mf_g[i]; + } + return out; + } +} gpu_merger; + +template +__global__ void fill_idx_kernel(T* idx, size_t len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + idx[i] = i; + } +} + +// template +// void show_tensor(T* input, size_t len, gpuStream_t stream, std::string +// name) +// { +// T tmp[len]; // NOLINT +// cudaMemcpyAsync(&tmp, input, sizeof(T) * len, cudaMemcpyDeviceToHost, +// stream); +// cudaStreamSynchronize(stream); +// std::cout << name; +// for (int i = 0; i < len; ++i) { +// std::cout << ":" << tmp[i]; +// } +// std::cout << std::endl; +//} + +template +__global__ void calc_shard_offset_kernel(T* idx, T* left, T* right, + size_t len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len - 1) { + if (idx[i] != idx[i + 1]) { + right[idx[i]] = i; + left[idx[i + 1]] = i + 1; + } + } + if (i == 0) { + left[idx[i]] = i; + } + if (i == (len - 1)) { + right[idx[i]] = i; + } +} + +template +__global__ void calc_shard_index_kernel(KeyType* d_keys, size_t len, + T* shard_index, int total_gpu) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + shard_index[i] = d_keys[i] % total_gpu; + } +} + +template +__global__ void fill_shard_key_kernel(KeyType* d_shard_keys, KeyType* d_keys, + T* idx, size_t len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + d_shard_keys[i] = d_keys[idx[i]]; + } +} + +template +__global__ void fill_shard_grads_kernel(KeyType* d_shard_keys, KeyType* d_keys, + GradType* d_shard_grads, + GradType* d_grads, T* idx, size_t len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + d_shard_keys[i] = d_keys[idx[i]]; + d_shard_grads[i] = d_grads[idx[i]]; + } +} + +template +__global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, + T* idx, size_t len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + d_vals[idx[i]] = d_shard_vals[i]; + } +} + +// cuda implemention of heter_comm_kernel.h +template +void HeterCommKernel::fill_idx(T* idx, long long len, + const StreamType& stream) { + int grid_size = (len - 1) / block_size_ + 1; + size_t c_len = (size_t)len; + fill_idx_kernel<<>>(idx, c_len); +} + +template +void HeterCommKernel::calc_shard_offset(T* idx, T* left, T* right, + long long len, int total_devs, + const StreamType& stream) { + int grid_size = (len - 1) / block_size_ + 1; + size_t c_len = (size_t)len; + calc_shard_offset_kernel<<>>(idx, left, + right, c_len); +} + +template +void HeterCommKernel::calc_shard_index(KeyType* d_keys, long long len, + T* shard_index, int total_gpu, + const StreamType& stream) { + int grid_size = (len - 1) / block_size_ + 1; + size_t c_len = (size_t)len; + calc_shard_index_kernel<<>>( + d_keys, c_len, shard_index, total_gpu); +} + +template +void HeterCommKernel::fill_shard_key(KeyType* d_shard_keys, KeyType* d_keys, + T* idx, long long len, + const StreamType& stream) { + int grid_size = (len - 1) / block_size_ + 1; + size_t c_len = (size_t)len; + 
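// The launch pattern used by every wrapper below: the grid size is the ceiling
// of len / block_size so each element gets a thread, and every kernel guards
// with `if (i < len)` because the last block is usually only partially full
// (callers ensure len > 0). Minimal CUDA sketch of the same idiom, compiled
// with nvcc; names are illustrative only.
#include <cuda_runtime.h>

__global__ void fill_idx_sketch(int* idx, size_t len) {
  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len) {                     // threads past the end do nothing
    idx[i] = static_cast<int>(i);
  }
}

void launch_fill_idx_sketch(int* d_idx, size_t len, cudaStream_t stream) {
  const int block_size = 256;        // matches block_size_{256}
  const int grid_size =
      static_cast<int>((len - 1) / block_size + 1);  // ceil(len / block_size)
  fill_idx_sketch<<<grid_size, block_size, 0, stream>>>(d_idx, len);
}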
fill_shard_key_kernel<<>>( + d_shard_keys, d_keys, idx, c_len); +} + +template +void HeterCommKernel::fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, + GradType* d_shard_grads, + GradType* d_grads, T* idx, long long len, + const StreamType& stream) { + int grid_size = (len - 1) / block_size_ + 1; + size_t c_len = (size_t)len; + fill_shard_grads_kernel<<>>( + d_shard_keys, d_keys, d_shard_grads, d_grads, idx, c_len); +} + +template +void HeterCommKernel::fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx, + long long len, const StreamType& stream) { + int grid_size = (len - 1) / block_size_ + 1; + size_t c_len = (size_t)len; + fill_dvals_kernel<<>>(d_shard_vals, d_vals, + idx, c_len); +} + +template +void HeterCommKernel::sort_pairs(void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + const KeyT* d_keys_in, // NOLINT + KeyT* d_keys_out, const ValueT* d_values_in, + ValueT* d_values_out, int num_items, + int begin_bit, int end_bit, StreamType stream, + bool debug_synchronous) { + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( + d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, + d_values_out, num_items, begin_bit, end_bit, stream, debug_synchronous)); +} + +template +void HeterCommKernel::reduce_by_key(void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + KeysInputIteratorT d_keys_in, + UniqueOutputIteratorT d_unique_out, + ValuesInputIteratorT d_values_in, + AggregatesOutputIteratorT d_aggregates_out, + NumRunsOutputIteratorT d_num_runs_out, + int num_items, StreamType stream, + bool debug_synchronous) { + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::ReduceByKey( + d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, + d_aggregates_out, d_num_runs_out, gpu_merger, num_items, stream, + debug_synchronous)); +} + +template void HeterCommKernel::fill_idx( + int* idx, long long len, const cudaStream_t& stream); + +template void HeterCommKernel::calc_shard_offset( + int* idx, int* left, int* right, long long len, int total_devs, + const cudaStream_t& stream); +template void HeterCommKernel::calc_shard_index< + unsigned long, int, cudaStream_t>(unsigned long* d_keys, long long len, + int* shard_index, int total_devs, + const cudaStream_t& stream); + +template void HeterCommKernel::fill_shard_key( + unsigned long* d_shard_keys, unsigned long* d_keys, int* idx, long long len, + const cudaStream_t& stream); + +template void HeterCommKernel::fill_shard_grads< + unsigned long, paddle::framework::FeaturePushValue, int, cudaStream_t>( + unsigned long* d_shard_keys, unsigned long* d_keys, + paddle::framework::FeaturePushValue* d_shard_grads, + paddle::framework::FeaturePushValue* d_grads, int* idx, long long len, + const cudaStream_t& stream); + +template void +HeterCommKernel::fill_dvals( + paddle::framework::FeatureValue* d_shard_vals, + paddle::framework::FeatureValue* d_vals, int* idx, long long len, + const cudaStream_t& stream); + +template void HeterCommKernel::sort_pairs< + unsigned long, paddle::framework::FeaturePushValue, cudaStream_t>( + void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + const unsigned long* d_keys_in, // NOLINT + unsigned long* d_keys_out, + const paddle::framework::FeaturePushValue* d_values_in, + paddle::framework::FeaturePushValue* d_values_out, int num_items, + int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous); + +template void HeterCommKernel::sort_pairs( + void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + const int* d_keys_in, 
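// The `template void HeterCommKernel::...` lines in this file are explicit
// instantiations: the template bodies live in this .cu file instead of the
// header, so every (KeyType, ValType, StreamType, ...) combination that the
// rest of HeterPS links against must be instantiated here by hand. A minimal,
// standalone illustration of the same idiom with hypothetical names:
//
//   // sketch_kernel.h
//   struct SketchKernel {
//     template <typename T, typename StreamType>
//     void fill_idx(T* idx, long long len, const StreamType& stream);
//   };
//
//   // sketch_kernel.cu
//   #include "sketch_kernel.h"
//   template <typename T, typename StreamType>
//   void SketchKernel::fill_idx(T* idx, long long len, const StreamType&) {
//     for (long long i = 0; i < len; ++i) idx[i] = static_cast<T>(i);
//   }
//   // Without the next line, callers in other translation units would get
//   // undefined-symbol link errors for fill_idx<int, cudaStream_t>:
//   template void SketchKernel::fill_idx<int, cudaStream_t>(
//       int*, long long, const cudaStream_t&);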
// NOLINT + int* d_keys_out, const int* d_values_in, int* d_values_out, int num_items, + int begin_bit, int end_bit, cudaStream_t stream, bool debug_synchronous); + +template void HeterCommKernel::reduce_by_key< + unsigned long*, unsigned long*, paddle::framework::FeaturePushValue*, + paddle::framework::FeaturePushValue*, int*, cudaStream_t>( + void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + unsigned long* d_keys_in, unsigned long* d_unique_out, + paddle::framework::FeaturePushValue* d_values_in, + paddle::framework::FeaturePushValue* d_aggregates_out, int* d_num_runs_out, + int num_items, cudaStream_t stream, bool debug_synchronous); + +#endif + +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h new file mode 100644 index 0000000000000..1be3687a7dbee --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h @@ -0,0 +1,86 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" + +#if defined(PADDLE_WITH_CUDA) +#include "cub/cub.cuh" +#include "cub/util_allocator.cuh" +#include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/enforce.h" +#endif + +namespace paddle { +namespace framework { + +class HeterCommKernel { + public: + HeterCommKernel() {} + explicit HeterCommKernel(const int block_size) : block_size_(block_size) {} + + template + void fill_idx(T* idx, long long len, const StreamType& stream); + + template + void calc_shard_offset(T* idx, T* left, T* right, long long len, + int total_devs, const StreamType& stream); + + template + void calc_shard_index(KeyType* d_keys, long long len, T* shard_index, + int total_devs, const StreamType& stream); + + template + void fill_shard_key(KeyType* d_shard_keys, KeyType* d_keys, T* idx, + long long len, const StreamType& stream); + + template + void fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, + GradType* d_shard_grads, GradType* d_grads, T* idx, + long long len, const StreamType& stream); + + template + void fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx, long long len, + const StreamType& stream); + + template + void sort_pairs(void* d_temp_storage, size_t& temp_storage_bytes, // NOLINT + const KeyT* d_keys_in, KeyT* d_keys_out, + const ValueT* d_values_in, ValueT* d_values_out, + int num_items, int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8, StreamType stream = NULL, + bool debug_synchronous = false); + + template + void reduce_by_key(void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + KeysInputIteratorT d_keys_in, + UniqueOutputIteratorT d_unique_out, + ValuesInputIteratorT d_values_in, + AggregatesOutputIteratorT d_aggregates_out, + NumRunsOutputIteratorT d_num_runs_out, int num_items, + StreamType stream = NULL, bool debug_synchronous = false); + + 
private: + int block_size_{256}; +}; + +} // end namespace framework +} // end namespace paddle +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps new file mode 100644 index 0000000000000..c3e37d9eba34d --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.kps @@ -0,0 +1,351 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h" + +#if defined(PADDLE_WITH_XPU_KP) +#include +#include "xpu/kernel/cluster_header.h" +#include "xpu/kernel/math.h" +#include "xpu/kernel/simd.h" +#endif + +namespace paddle { +namespace framework { + +#if defined(PADDLE_WITH_XPU_KP) + +struct XPUCustomGradMerger { + template + __device__ T operator()(const T& a, const T& b) const { + T out; + out.slot = a.slot; + out.show = a.show + b.show; + out.clk = a.clk + b.clk; + out.lr_g = a.lr_g + b.lr_g; + for (int i = 0; i < MF_DIM; ++i) { + out.mf_g[i] = a.mf_g[i] + b.mf_g[i]; + } + return out; + } +} xpu_merger; + +template +__global__ void fill_idx_kernel(T* idx, long long len) { + int cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + const int buf_size = 1024; + __local__ T local_idx[buf_size]; + int len_per_loop = min(buf_size, roundup_div(len, nthreads)); + for (int i = thread_id * len_per_loop; i < len; + i += nthreads * len_per_loop) { + int read_len = min(len_per_loop, len - i); + for (int k = 0; k < read_len; k++) { + int real_idx = i + k; + local_idx[k] = real_idx; + } + LM2GM(local_idx, idx + i, read_len * sizeof(T)); + } +} + +template +__global__ void calc_shard_offset_kernel(T* idx, T* left, T* right, + long long len, const int total_xpu) { + int cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + + const int buf_size = 1024; + __local__ T local_idx[buf_size]; + __local__ T local_left[total_xpu]; + __local__ T local_right[total_xpu]; + + for (int i = 0; i < total_xpu; i++) { + local_left[i] = -1; + local_right[i] = -1; + } + int len_per_loop = min(buf_size, roundup_div(len, nthreads)); + for (int i = thread_id * len_per_loop; i < len; + i += nthreads * len_per_loop) { + // read batch from GM will boost performance + int read_len = min(len_per_loop, len - i); + GM2LM(idx + i, local_idx, read_len * sizeof(T)); + for (int k = 0; k < read_len; k++) { + if (local_idx[k] != local_idx[k + 1]) { + int real_idx = i + k; + local_right[local_idx[k]] = real_idx; + local_left[local_idx[k + 1]] = real_idx + 1; + } + } + if (i == 0) { + local_left[local_idx[i]] = i; + } + if (i + read_len == len) { + local_right[local_idx[len - 1]] = len - 1; + } + } + // to be optimized: call LM2GM too frequently + // all_reduce between threads to get 
global left & global right && LM2GM + for (int i = 0; i < total_xpu; i++) { + if (local_left[i] != -1) LM2GM(local_left + i, left + i, sizeof(T)); + if (local_right[i] != -1) LM2GM(local_right + i, right + i, sizeof(T)); + } +} + +template +__global__ void calc_shard_index_kernel(KeyType* d_keys, long long len, + T* shard_index, int total_xpu) { + int cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + const int buf_size = 512; + __local__ KeyType local_keys[buf_size]; + __local__ T local_shard_index[buf_size]; + int len_per_loop = min(buf_size, roundup_div(len, nthreads)); + for (int i = thread_id * len_per_loop; i < len; + i += nthreads * len_per_loop) { + // read batch from GM will boost performance + int read_len = min(len_per_loop, len - i); + GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType)); + for (int k = 0; k < read_len; k++) { + local_shard_index[k] = local_keys[k] % total_xpu; + } + LM2GM(local_shard_index, shard_index + i, read_len * sizeof(T)); + } +} + +template +__global__ void fill_shard_key_kernel(KeyType* d_shard_keys, KeyType* d_keys, + T* idx, long long len) { + int cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + const int buf_size = 400; + __local__ KeyType local_keys[buf_size]; + __local__ KeyType local_shard_keys[buf_size]; + __local__ T local_idx[buf_size]; + int len_per_loop = min(buf_size, roundup_div(len, nthreads)); + for (int i = thread_id * len_per_loop; i < len; + i += nthreads * len_per_loop) { + // read batch from GM will boost performance + int read_len = min(len_per_loop, len - i); + GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType)); + GM2LM(idx + i, local_idx, read_len * sizeof(T)); + for (int k = 0; k < read_len; k++) { + local_shard_keys[k] = local_keys[local_idx[k]]; + } + LM2GM(local_shard_keys, d_shard_keys + i, read_len * sizeof(KeyType)); + } +} + +// local mem too large, cause compile error +template +__global__ void fill_shard_grads_kernel(KeyType* d_shard_keys, KeyType* d_keys, + GradType* d_shard_grads, + GradType* d_grads, T* idx, + long long len) { + int cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + + const int buf_size = 100; + __local__ KeyType local_keys[buf_size]; + __local__ GradType local_grads[buf_size]; + __local__ KeyType local_shard_keys[buf_size]; + __local__ GradType local_shard_grads[buf_size]; + __local__ T local_idx[buf_size]; + + int len_per_loop = min(buf_size, roundup_div(len, nthreads)); + for (int i = thread_id * len_per_loop; i < len; + i += nthreads * len_per_loop) { + // read batch from GM will boost performance + int read_len = min(len_per_loop, len - i); + GM2LM(d_keys + i, local_keys, read_len * sizeof(KeyType)); + GM2LM(d_grads + i, local_grads, read_len * sizeof(GradType)); + GM2LM(idx + i, local_idx, read_len * sizeof(T)); + for (int k = 0; k < read_len; k++) { + local_shard_keys[k] = local_keys[local_idx[k]]; + local_shard_grads[k] = local_grads[local_idx[k]]; + } + LM2GM(local_shard_keys, d_shard_keys + i, read_len * sizeof(KeyType)); + LM2GM(local_shard_grads, d_shard_grads + i, read_len * sizeof(GradType)); + } +} + +template +__global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, + T* idx, long long len) { + int 
cid = core_id(); + int ncores = core_num(); + if (cid >= ncores) { + return; + } + int thread_id = ncores * cluster_id() + cid; + int nthreads = ncores * cluster_num(); + const int buf_size = 50; + __local__ ValType local_vals[buf_size]; + __local__ ValType local_shard_vals[buf_size]; + __local__ T local_idx[buf_size]; + int len_per_loop = min(buf_size, roundup_div(len, nthreads)); + for (int i = thread_id * len_per_loop; i < len; + i += nthreads * len_per_loop) { + // read batch from GM will boost performance + int read_len = min(len_per_loop, len - i); + GM2LM(idx + i, local_idx, read_len * sizeof(T)); + GM2LM(d_shard_vals + i, local_shard_vals, read_len * sizeof(ValType)); + for (int k = 0; k < read_len; k++) { + local_vals[local_idx[k]] = local_shard_vals[k]; + } + LM2GM(local_vals, d_vals + i, read_len * sizeof(ValType)); + } +} + +// xpu implementation of heter_comm_kernel.h + +template +void fill_idx(T* idx, long long len, const StreamType& stream) { + fill_idx_kernel<<<4, 64, stream>>>(idx, len); +} + +template +void calc_shard_offset(T* idx, T* left, T* right, long long len, int total_devs, + const StreamType& stream) { + calc_shard_offset_kernel<<<4, 64, stream>>>(idx, left, right, len, + total_devs); +} + +template +void calc_shard_index(KeyType* d_keys, long long len, T* shard_index, + int total_devs, const StreamType& stream) { + calc_shard_index_kernel<<<4, 64, stream>>>( + d_keys, len, shard_index, total_devs); +} + +template +void fill_shard_key(KeyType* d_shard_keys, KeyType* d_keys, T* idx, + long long len, const StreamType& stream) { + fill_shard_key_kernel<<<4, 64, stream>>>(d_shard_keys, d_keys, + idx, len); +} + +template +void fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, + GradType* d_shard_grads, GradType* d_grads, T* idx, + long long len, const StreamType& stream) { + fill_shard_grads_kernel<<<4, 64, stream>>>( + d_shard_keys, d_keys, d_shard_grads, d_grads, idx, len); +} + +template +void fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx, long long len, + const StreamType& stream) { + fill_dvals_kernel<<<4, 64, stream>>>(d_shard_vals, d_vals, idx, + len); +} + +template +void sort_pairs(void* d_temp_storage, size_t& temp_storage_bytes, // NOLINT + const KeyT* d_keys_in, // NOLINT + KeyT* d_keys_out, const ValueT* d_values_in, + ValueT* d_values_out, int num_items, int begin_bit, int end_bit, + StreamType stream, bool debug_synchronous) {} + +template (int* idx, long long len, + const XPUStream& stream); +template void calc_shard_offset(int* idx, int* left, int* right, + long long len, int total_devs, + const XPUStream& stream); +template void calc_shard_index( + unsigned long* d_keys, long long len, int* shard_index, int total_devs, + const XPUStream& stream); + +template void fill_shard_key( + unsigned long* d_shard_keys, unsigned long* d_keys, int* idx, long long len, + const XPUStream& stream); +template void +fill_shard_grads(unsigned long* d_shard_keys, unsigned long* d_keys, + paddle::framework::FeaturePushValue* d_shard_grads, + paddle::framework::FeaturePushValue* d_grads, + int* idx, long long len, const XPUStream& stream); +template void fill_dvals( + paddle::framework::FeatureValue* d_shard_vals, + paddle::framework::FeatureValue* d_vals, int* idx, long long len, + const XPUStream& stream); + +template void +sort_pairs( + void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + const unsigned long* d_keys_in, // NOLINT + unsigned long* d_keys_out, + const paddle::framework::FeaturePushValue* d_values_in, + 
paddle::framework::FeaturePushValue* d_values_out, int num_items, + int begin_bit, int end_bit, XPUStream stream, bool debug_synchronous); + +template void sort_pairs( + void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + const int* d_keys_in, // NOLINT + int* d_keys_out, const int* d_values_in, int* d_values_out, int num_items, + int begin_bit, int end_bit, XPUStream stream, bool debug_synchronous); + +template void reduce_by_key< + unsigned long*, unsigned long*, paddle::framework::FeaturePushValue*, + paddle::framework::FeaturePushValue*, int*, XPUStream>( + void* d_temp_storage, + size_t& temp_storage_bytes, // NOLINT + unsigned long* d_keys_in, unsigned long* d_unique_out, + paddle::framework::FeaturePushValue* d_values_in, + paddle::framework::FeaturePushValue* d_aggregates_out, + int* d_num_runs_out int num_items, XPUStream stream, + bool debug_synchronous); + +#endif + +} // end namespace framework +} // end namespace paddle +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index 581b0d511c23e..583eb926a26a5 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -29,7 +29,9 @@ HeterPs::HeterPs(size_t capacity, std::shared_ptr resource) { comm_ = std::make_shared>( capacity, resource); +#if defined(PADDLE_WITH_CUDA) opt_ = Optimizer(); +#endif } HeterPs::~HeterPs() {} @@ -54,15 +56,21 @@ void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } void HeterPs::push_sparse(int num, FeatureKey* d_keys, FeaturePushValue* d_grads, size_t len) { +#if defined(PADDLE_WITH_CUDA) comm_->push_sparse(num, d_keys, d_grads, len, opt_); +#elif defined(PADDLE_WITH_XPU_KP) + comm_->push_sparse(num, d_keys, d_grads, len); +#endif // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); } +#if defined(PADDLE_WITH_CUDA) void HeterPs::set_nccl_comm_and_size(const std::vector& inner_comms, const std::vector& inter_comms, int comm_size) { comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size); } +#endif } // end namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index d78b6b492074d..7fb50f4da1fce 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -16,7 +16,9 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h" +#if defined(PADDLE_WITH_CUDA) #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" +#endif #ifdef PADDLE_WITH_HETERPS @@ -35,9 +37,13 @@ class HeterPs : public HeterPsBase { size_t len) override; virtual void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, size_t len, size_t chunk_size, int stream_num) override; + +#if defined(PADDLE_WITH_CUDA) virtual void set_nccl_comm_and_size( const std::vector& inner_comms, const std::vector& inter_comms, int comm_size) override; +#endif + virtual void end_pass() override; virtual int get_index_by_devid(int devid) override; virtual void show_one_table(int gpu_num) override; @@ -46,7 +52,9 @@ class HeterPs : public HeterPsBase { private: std::shared_ptr> comm_; +#if defined(PADDLE_WITH_CUDA) Optimizer opt_; +#endif }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index 05b3ecf9c3c12..ddbf02df6c578 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -35,9 +35,11 @@ class HeterPsBase { virtual void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals, size_t len, size_t chunk_size, int stream_num) = 0; virtual int get_index_by_devid(int devid) = 0; +#if defined(PADDLE_WITH_CUDA) virtual void set_nccl_comm_and_size( const std::vector& inner_comms, const std::vector& inter_comms, int comm_size) = 0; +#endif virtual void end_pass() = 0; virtual void show_one_table(int gpu_num) = 0; virtual void push_sparse(int num, FeatureKey* d_keys, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc index cad7559af5742..7074cfb521bdf 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc @@ -13,12 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #ifdef PADDLE_WITH_HETERPS -#include "heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" + +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" +#endif + +#ifdef PADDLE_WITH_XPU_KP +#include "paddle/fluid/platform/device/xpu/enforce_xpu.h" +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#endif namespace paddle { namespace framework { +#if defined(PADDLE_WITH_CUDA) GPUResource::GPUResource(std::vector& dev_ids, int index) { index_ = index; dev_ids_ = dev_ids; @@ -52,7 +61,41 @@ GPUResource::~GPUResource() { } } +#elif defined(PADDLE_WITH_XPU_KP) +XPUResource::XPUResource(std::vector& dev_ids, int index) { + index_ = index; + dev_ids_ = dev_ids; + dev_id_ = dev_ids_[index]; + + platform::XPUDeviceGuard guard(dev_id_); + local_streams_.resize(dev_ids_.size()); + comm_streams_.resize(dev_ids_.size(), NULL); + remote_streams_.resize(dev_ids_.size()); + + for (size_t i = 0; i < dev_ids_.size(); ++i) { + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&local_streams_[i])); + // PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&comm_streams_[i])); + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&remote_streams_[i])); + } +} + +XPUResource::~XPUResource() { + platform::XPUDeviceGuard guard(dev_id_); + for (size_t i = 0; i < local_streams_.size(); ++i) { + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(local_streams_[i])); + } + // for (size_t i = 0; i < comm_streams_.size(); ++i) { + // PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(comm_streams_[i])); + // } + for (size_t i = 0; i < remote_streams_.size(); ++i) { + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_destroy(remote_streams_[i])); + } +} + +#endif + void HeterPsResource::enable_p2p() { +#if defined(PADDLE_WITH_CUDA) for (size_t i = 0; i < dev_ids_.size(); ++i) { platform::CUDADeviceGuard guard(dev_ids_[i]); for (size_t j = 0; j < dev_ids_.size(); ++j) { @@ -72,28 +115,28 @@ void HeterPsResource::enable_p2p() { } } } +#endif } HeterPsResource::HeterPsResource(const std::vector& dev_ids) { dev_ids_ = dev_ids; for (size_t i = 0; i < dev_ids_.size(); ++i) { - std::shared_ptr resource = - std::make_shared(dev_ids_, i); + std::shared_ptr resource = + std::make_shared(dev_ids_, i); resources_.push_back(resource); devid_2_index_[dev_ids_[i]] = i; } } -cudaStream_t HeterPsResource::comm_stream(int gpu_num, int stream_num) { - return resources_[gpu_num]->comm_stream(stream_num); +ppStream HeterPsResource::comm_stream(int dev_num, int stream_num) { + return resources_[dev_num]->comm_stream(stream_num); } - -cudaStream_t HeterPsResource::local_stream(int gpu_num, int stream_num) { - return resources_[gpu_num]->local_stream(stream_num); +ppStream HeterPsResource::local_stream(int dev_num, int stream_num) { + return resources_[dev_num]->local_stream(stream_num); } -cudaStream_t HeterPsResource::remote_stream(int gpu_num, int stream_num) { - return resources_[gpu_num]->remote_stream(stream_num); +ppStream HeterPsResource::remote_stream(int dev_num, int stream_num) { + return resources_[dev_num]->remote_stream(stream_num); } int HeterPsResource::dev_id(int num) { return dev_ids_[num]; } @@ -102,7 +145,7 @@ int HeterPsResource::get_index_by_devid(int devid) { return devid_2_index_[devid]; } -int HeterPsResource::total_gpu() { return dev_ids_.size(); } +int HeterPsResource::total_device() { return dev_ids_.size(); } void HeterPsResource::set_multi_mf(int multi_mf_dim, int max_mf_dim) { multi_mf_dim_ = multi_mf_dim; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h 
b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h index 19df8cc70f50e..164fca2276800 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h @@ -17,7 +17,16 @@ limitations under the License. */ #include #include #include + +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" +#endif + +#ifdef PADDLE_WITH_XPU_KP +#include // NOLINT +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#endif + #include "paddle/fluid/platform/enforce.h" #ifdef PADDLE_WITH_HETERPS @@ -25,9 +34,16 @@ limitations under the License. */ namespace paddle { namespace framework { +#if defined(PADDLE_WITH_CUDA) +using ppStream = cudaStream_t; +#elif defined(PADDLE_WITH_XPU_KP) +using ppStream = XPUStream; +#endif + +#if defined(PADDLE_WITH_CUDA) class GPUResource { public: - GPUResource(std::vector& device_id, int index); + GPUResource(std::vector& device_id, int index); // NOLINT virtual ~GPUResource(); GPUResource(const GPUResource&) = delete; GPUResource& operator=(const GPUResource&) = delete; @@ -45,23 +61,55 @@ class GPUResource { std::vector local_streams_; std::vector comm_streams_; }; +#elif defined(PADDLE_WITH_XPU_KP) +class XPUResource { + public: + XPUResource(std::vector& device_id, int index); // NOLINT + virtual ~XPUResource(); + XPUResource(const XPUResource&) = delete; + XPUResource& operator=(const XPUResource&) = delete; + + int dev_id() const { return dev_id_; } + int index() const { return index_; } + XPUStream local_stream(int num) { return local_streams_[num]; } + XPUStream remote_stream(int num) { return remote_streams_[num]; } + XPUStream comm_stream(int num) { return comm_streams_[num]; } + + int dev_id_; + int index_; + std::vector dev_ids_; + std::vector remote_streams_; + std::vector local_streams_; + std::vector comm_streams_; +}; +#endif + +#if defined(PADDLE_WITH_CUDA) +using DevResource = GPUResource; +using DevPlace = platform::CUDAPlace; +using AnyDeviceGuard = platform::CUDADeviceGuard; +#elif defined(PADDLE_WITH_XPU_KP) +using DevResource = XPUResource; +using DevPlace = platform::XPUPlace; +using AnyDeviceGuard = platform::XPUDeviceGuard; +#endif class HeterPsResource { public: - HeterPsResource(const std::vector& dev_ids); + explicit HeterPsResource(const std::vector& dev_ids); HeterPsResource(const HeterPsResource&) = delete; HeterPsResource& operator=(const HeterPsResource&) = delete; virtual ~HeterPsResource() {} void enable_p2p(); - int total_gpu(); + int total_device(); int get_index_by_devid(int devid); int dev_id(int num); void set_multi_mf(int multi_mf_dim, int max_mf_dim); - gpuStream_t local_stream(int gpu_num, int stream_num); - gpuStream_t remote_stream(int gpu_num, int stream_num); - gpuStream_t comm_stream(int gpu_num, int stream_num); + ppStream local_stream(int dev_num, int stream_num); + ppStream remote_stream(int dev_num, int stream_num); + ppStream comm_stream(int dev_num, int stream_num); - std::vector> resources_; + std::vector> resources_; std::vector dev_ids_; std::map devid_2_index_; int multi_mf_dim_{0}; diff --git a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h index 9189902c28ffb..a663d1bf76410 100644 --- a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h +++ b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ // #include // "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h" #include +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/framework/fleet/heter_ps/cudf/managed.cuh" namespace paddle { @@ -111,3 +112,4 @@ class HBMMemoryPool : public managed { } // end namespace framework } // end namespace paddle #endif +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index ff9976db5d875..ebf7dd277c7d6 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -13,16 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#ifdef PADDLE_WITH_HETERPS + +#if defined(PADDLE_WITH_CUDA) #include +#endif #include -#include "optimizer_conf.h" #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" - -#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" namespace paddle { namespace framework { +#if defined(PADDLE_WITH_CUDA) template class Optimizer { public: @@ -32,7 +35,8 @@ class Optimizer { void initialize() {} - __device__ void update_lr(float& w, float& g2sum, float g, float scale) { + __device__ void update_lr(float& w, float& g2sum, float g, // NOLINT + float scale) { double add_g2sum = 0; double ratio = optimizer_config::learning_rate * sqrt(optimizer_config::initial_g2sum / @@ -49,8 +53,8 @@ class Optimizer { g2sum += add_g2sum; } - __device__ void update_mf(int n, float* w, float& g2sum, const float* g, - float scale) { + __device__ void update_mf(int n, float* w, float& g2sum, // NOLINT + const float* g, float scale) { double add_g2sum = 0; double ratio = optimizer_config::mf_learning_rate * sqrt(optimizer_config::mf_initial_g2sum / @@ -69,7 +73,8 @@ class Optimizer { g2sum += add_g2sum / n; } - __device__ void update_value(ValType& val, const GradType& grad) { + + __device__ void update_value(ValType& val, const GradType& grad) { // NOLINT val.slot = grad.slot; val.show += grad.show; val.clk += grad.clk; @@ -132,6 +137,7 @@ class Optimizer { } }; +#endif } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h b/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h index 55d0fc561c574..6d924a395e19a 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h @@ -14,8 +14,16 @@ limitations under the License. 
*/ #pragma once +#if defined(PADDLE_WITH_XPU_KP) +#include "xpu/kernel/cluster_header.h" +#include "xpu/kernel/debug.h" +#include "xpu/kernel/math.h" +#endif + namespace optimizer_config { +#if defined(PADDLE_WITH_CUDA) + __constant__ float nonclk_coeff = 0.1; __constant__ float clk_coeff = 1; @@ -31,4 +39,24 @@ __constant__ float mf_initial_g2sum = 3.0; __constant__ float mf_initial_range = 1e-4; __constant__ float mf_min_bound = -10; __constant__ float mf_max_bound = 10; -} + +#elif defined(PADDLE_WITH_XPU_KP) + +_global_ptr_ float* nonclk_coeff; +_global_ptr_ float* clk_coeff; + +_global_ptr_ float* min_bound; +_global_ptr_ float* max_bound; +_global_ptr_ float* learning_rate; +_global_ptr_ float* initial_g2sum; +_global_ptr_ float* initial_range; + +_global_ptr_ float* mf_create_thresholds; +_global_ptr_ float* mf_learning_rate; +_global_ptr_ float* mf_initial_g2sum; +_global_ptr_ float* mf_initial_range; +_global_ptr_ float* mf_min_bound; +_global_ptr_ float* mf_max_bound; + +#endif +} // namespace optimizer_config diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 9551e49b6b77b..b7060764863f1 100755 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -146,7 +146,7 @@ class PSGPUWrapper { is_initialized_ = true; resource_ = std::make_shared(dev_ids); resource_->enable_p2p(); - keys_tensor.resize(resource_->total_gpu()); + keys_tensor.resize(resource_->total_device()); #ifdef PADDLE_WITH_GLOO auto gloo = paddle::framework::GlooWrapper::GetInstance(); if (gloo->Size() > 1) { @@ -312,8 +312,8 @@ class PSGPUWrapper { for (size_t i = 0; i < num_of_dim; i++) { dim_index_map[index_dim_vec_[i]] = i; } - hbm_pools_.resize(resource_->total_gpu() * num_of_dim); - mem_pools_.resize(resource_->total_gpu() * num_of_dim); + hbm_pools_.resize(resource_->total_device() * num_of_dim); + mem_pools_.resize(resource_->total_device() * num_of_dim); max_mf_dim_ = index_dim_vec_.back(); multi_mf_dim_ = (dim_index_map.size() >= 1) ? 
dim_index_map.size() : 0; resource_->set_multi_mf(multi_mf_dim_, max_mf_dim_); From b61fa16ad4fe65bd59131a18c4c353b162ee6f7a Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Wed, 13 Apr 2022 20:33:53 +0800 Subject: [PATCH 04/19] add split backward yaml (#41746) --- .../fluid/tests/unittests/test_split_op.py | 27 ++++++++++++++++--- python/paddle/utils/code_gen/api.yaml | 1 + python/paddle/utils/code_gen/backward.yaml | 2 +- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py index c826a0e1030f4..bf3be4080a9fc 100644 --- a/python/paddle/fluid/tests/unittests/test_split_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_op.py @@ -19,6 +19,7 @@ from op_test import OpTest, convert_float_to_uint16 import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard, core +from paddle.fluid.framework import _test_eager_guard class TestSplitOp(OpTest): @@ -402,12 +403,30 @@ def test_out1(self): with fluid.dygraph.guard(): input_1 = np.random.random([4, 6, 6]).astype("int32") # input is a variable which shape is [4, 6, 6] - input = fluid.dygraph.to_variable(input_1) + input = paddle.to_tensor(input_1) x0, x1, x2 = paddle.split(input, num_or_sections=3, axis=1) x0_out = x0.numpy() x1_out = x1.numpy() x2_out = x2.numpy() ex_x0, ex_x1, ex_x2 = np.split(input_1, 3, axis=1) + + with _test_eager_guard(): + # input is a variable which shape is [4, 6, 6] + input = paddle.to_tensor(input_1) + input.stop_gradient = False + x0, x1, x2 = paddle.split(input, num_or_sections=3, axis=1) + eager_x0_out = x0.numpy() + eager_x1_out = x1.numpy() + eager_x2_out = x2.numpy() + loss = x0.sum() + loss.backward() + manul_grad = np.zeros_like(input_1) + manul_grad[:, :2, :] = 1 + self.assertTrue(np.allclose(input.gradient(), manul_grad)) + self.assertTrue(np.allclose(ex_x0, eager_x0_out)) + self.assertTrue(np.allclose(ex_x1, eager_x1_out)) + self.assertTrue(np.allclose(ex_x2, eager_x2_out)) + self.assertTrue(np.allclose(ex_x0, x0_out)) self.assertTrue(np.allclose(ex_x1, x1_out)) self.assertTrue(np.allclose(ex_x2, x2_out)) @@ -416,7 +435,7 @@ def test_out2(self): with fluid.dygraph.guard(): input_1 = np.random.random([4, 6, 6]).astype("bool") # input is a variable which shape is [4, 6, 6] - input = fluid.dygraph.to_variable(input_1) + input = paddle.to_tensor(input_1) x0, x1, x2 = paddle.split(input, num_or_sections=3, axis=1) x0_out = x0.numpy() x1_out = x1.numpy() @@ -430,7 +449,7 @@ def test_out_tensor_input(self): with fluid.dygraph.guard(): input_1 = np.random.random([4, 6, 6]).astype("int32") # input is a variable which shape is [4, 6, 6] - input = fluid.dygraph.to_variable(input_1) + input = paddle.to_tensor(input_1) num1 = paddle.full(shape=[1], fill_value=2, dtype='int32') x0, x1, x2 = paddle.split( input, num_or_sections=[num1, 2, 2], axis=1) @@ -446,7 +465,7 @@ def test_axis_tensor_input(self): with fluid.dygraph.guard(): input_1 = np.random.random([4, 6, 6]).astype("int32") # input is a variable which shape is [4, 6, 6] - input = fluid.dygraph.to_variable(input_1) + input = paddle.to_tensor(input_1) num1 = paddle.full(shape=[1], fill_value=1, dtype='int32') x0, x1, x2 = paddle.split( input, num_or_sections=[2, 2, 2], axis=num1) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index b4abe5b303b8e..f5245d59babd2 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ 
b/python/paddle/utils/code_gen/api.yaml @@ -1917,6 +1917,7 @@ args : (Tensor x, IntArray num_or_sections, Scalar(int) axis) output : Tensor[] invoke : split_impl(x, num_or_sections, axis) + backward : split_grad - api : sqrt args : (Tensor x) diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index d0f337cb054f4..97c9c7ddf1584 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -1523,7 +1523,7 @@ - backward_api : split_grad forward : split (Tensor x, IntArray num_or_sections, Scalar axis) -> Tensor[](out) - args : (Tensor[] out_grad, Scalar axis) + args : (Tensor[] out_grad, Scalar axis = -1) output : Tensor(x_grad) invoke : concat( out_grad, axis) # TODO(zhangyunfei) The config of double grad and triple grad will be supported in the future. From 27a91b1a6ea18f88355f0153f737056d4e4a3fb2 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Wed, 13 Apr 2022 22:03:30 +0800 Subject: [PATCH 05/19] Adjust the slice end in getitem (#41681) * adjust the slice end in getitem * fix bug * fix bug * fix bug * recover start change --- python/paddle/fluid/variable_index.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/variable_index.py b/python/paddle/fluid/variable_index.py index 257ddc96d9c87..83a569aacc911 100644 --- a/python/paddle/fluid/variable_index.py +++ b/python/paddle/fluid/variable_index.py @@ -375,7 +375,13 @@ def _getitem_impl_(var, item): if start is None: start = 0 if step > 0 else MAX_INTEGER if end is None: - end = MAX_INTEGER if step > 0 else -1 + if var.shape[dim] != -1 and ( + paddle.fluid.framework._non_static_mode() or + var.desc.type() != core.VarDesc.VarType.LOD_TENSOR_ARRAY + ): + end = var.shape[dim] if step > 0 else -1 + else: + end = MAX_INTEGER if step > 0 else -1 elif isinstance(slice_item, list): all_bool = True From b12af9e1d9980935f90ac3264797110f9671589e Mon Sep 17 00:00:00 2001 From: wangguanqun Date: Wed, 13 Apr 2022 23:32:14 +0800 Subject: [PATCH 06/19] the one ps proto (#41659) * the one ps proto * the one ps proto * fix * fix * fix * fix windows ci * fix windows ci * add dependency * add dependency --- paddle/fluid/distributed/CMakeLists.txt | 20 +- paddle/fluid/distributed/ps.proto | 236 ------------------ .../the_one_ps.proto | 23 ++ paddle/fluid/framework/CMakeLists.txt | 5 +- .../framework/distributed_strategy.proto | 2 +- .../fleet/base/distributed_strategy.py | 4 +- python/paddle/distributed/ps/the_one_ps.py | 13 +- 7 files changed, 51 insertions(+), 252 deletions(-) delete mode 100644 paddle/fluid/distributed/ps.proto rename paddle/fluid/{framework => distributed}/the_one_ps.proto (89%) mode change 100755 => 100644 diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 06b0583eddf24..0091c14bfd177 100644 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -1,11 +1,29 @@ add_subdirectory(collective) add_subdirectory(store) +if(WITH_PYTHON) + py_proto_compile(ps_py_proto SRCS the_one_ps.proto) + add_custom_target(ps_py_proto_init ALL + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto) + if (NOT WIN32) + add_custom_command(TARGET ps_py_proto POST_BUILD + COMMAND mv the_one_ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/) + else(NOT WIN32) + string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/") + 
add_custom_command(TARGET ps_py_proto POST_BUILD + COMMAND copy /Y the_one_ps_pb2.py ${fleet_proto_dstpath} + COMMENT "Copy generated python the_one_ps_pb2 into directory ${fleet_proto_dstpath}.") + endif(NOT WIN32) +endif() + if(NOT WITH_PSCORE) add_subdirectory(fleet_executor) return() endif() -proto_library(ps_framework_proto SRCS ps.proto) +proto_library(ps_framework_proto SRCS the_one_ps.proto) +add_custom_command(TARGET ps_framework_proto POST_BUILD + COMMAND mv the_one_ps.pb.h ps.pb.h + COMMAND mv the_one_ps.pb.cc ps.pb.cc) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-error=unused-value -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=type-limits -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result") diff --git a/paddle/fluid/distributed/ps.proto b/paddle/fluid/distributed/ps.proto deleted file mode 100644 index 9bfa2c05efa67..0000000000000 --- a/paddle/fluid/distributed/ps.proto +++ /dev/null @@ -1,236 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -syntax = "proto2"; -package paddle.distributed; -option cc_generic_services = true; -option cc_enable_arenas = true; - -message FsClientParameter { - enum FsApiType { - HDFS = 0; - AFS = 1; - } - optional FsApiType fs_type = 1 [ default = HDFS ]; - optional string uri = 2; // such as afs://xxx.afs.com:9902 - optional string user = 3; // user_name to access fs - optional string passwd = 4; // password - optional int32 buffer_size = 5; // buffer for read/write - optional string hadoop_bin = 51; - optional string afs_conf = 101; -} - -message PSParameter { - optional string worker_class = 1; - optional string server_class = 2; - optional string instance_class = 3; - optional string init_gflags = 4 [ default = "" ]; - optional WorkerParameter worker_param = 101; - optional ServerParameter server_param = 102; - repeated DownpourTrainerParameter trainer_param = 301; - optional FsClientParameter fs_client_param = 501; -} - -message WorkerParameter { - optional DownpourWorkerParameter downpour_worker_param = 1; -} - -message DownpourWorkerParameter { - repeated TableParameter downpour_table_param = 1; -} - -message DownpourServerParameter { - repeated TableParameter downpour_table_param = 1; - optional ServerServiceParameter service_param = 2; -} - -message ServerParameter { - optional DownpourServerParameter downpour_server_param = 1; -} - -message DownpourTrainerParameter { - repeated DenseTableParameter dense_table = 1; - repeated SparseTableParameter sparse_table = 2; - optional int32 push_sparse_per_batch = 3; - optional int32 push_dense_per_batch = 4; - repeated string skip_op = 5; - repeated ProgramConfig program_config = 6; -} - -message DenseTableParameter { - optional int32 table_id = 1; - repeated string dense_variable_name = 2; - repeated string dense_gradient_variable_name = 3; - 
optional int32 fea_dim = 4; -} - -message SparseTableParameter { - optional int32 table_id = 1; - optional int32 feature_dim = 2; - repeated string slot_key = 3; - repeated string slot_value = 4; - repeated string slot_gradient = 5; -} - -message ServerServiceParameter { - optional string server_class = 1 [ default = "BrpcPsServer" ]; - optional string client_class = 2 [ default = "BrpcPsClient" ]; - optional string service_class = 3 [ default = "BrpcPsService" ]; - optional uint32 start_server_port = 4 - [ default = 0 ]; // will find a avaliable port from it - optional uint32 server_thread_num = 5 [ default = 12 ]; -} - -message ProgramConfig { - required string program_id = 1; - repeated int32 push_sparse_table_id = 2; - repeated int32 push_dense_table_id = 3; - repeated int32 pull_sparse_table_id = 4; - repeated int32 pull_dense_table_id = 5; -} - -enum TableType { - PS_SPARSE_TABLE = 0; - PS_DENSE_TABLE = 1; - PS_OTHER_TABLE = 2; -} - -message TableParameter { - optional uint64 table_id = 1; - optional string table_class = 2; - optional uint64 shard_num = 3 [ default = 1000 ]; - optional TableAccessorParameter accessor = 4; - optional TensorAccessorParameter tensor = 5; - optional CommonAccessorParameter common = 6; - optional TableType type = 7; - optional bool compress_in_save = 8 [ default = false ]; - optional GraphParameter graph_parameter = 9; -} - -message TableAccessorParameter { - optional string accessor_class = 1; - optional uint32 fea_dim = 4 [ default = 11 ]; - optional uint32 embedx_dim = 5 [ default = 8 ]; - optional uint32 embedx_threshold = 6 [ default = 10 ]; - optional CtrAccessorParameter ctr_accessor_param = 7; - repeated TableAccessorSaveParameter table_accessor_save_param = 8; - optional SparseCommonSGDRuleParameter embed_sgd_param = 10; - optional SparseCommonSGDRuleParameter embedx_sgd_param = 11; -} - -message CtrAccessorParameter { - optional float nonclk_coeff = 1 - [ default = 0.1 ]; // to calculate show_click_score - optional float click_coeff = 2 - [ default = 1 ]; // to calculate show_click_score - optional float base_threshold = 3 [ - default = 1.5 - ]; // show_click_score > base_threshold, this feature can be saved - optional float delta_threshold = 4 - [ default = - 0.25 ]; // delta_score > delta_threshold, this feature can be saved - optional float delta_keep_days = 5 - [ default = - 16 ]; // unseen_day < delta_keep_days, this feature can be saved - optional float show_click_decay_rate = 6 [ - default = 0.98 - ]; // show/click will update to show/click * show_click_decay_rate after a day - optional float delete_threshold = 7 - [ default = 0.8 ]; // threshold to shrink a feasign - optional float delete_after_unseen_days = 8 - [ default = 30 ]; // unseen_day > delete_after_unseen_days, this feature - // will be delete in shrink_model - optional int32 ssd_unseenday_threshold = 9 - [ default = 1 ]; // threshold to save ssd -} - -message TensorAccessorParameter { - optional string feed_var_name = 1; - optional string fetch_var_name = 2; - optional int64 startup_program_id = 3; - optional int64 main_program_id = 4; - optional string tensor_table_class = 6; -} - -message CommonAccessorParameter { - optional string name = 1; - optional string table_name = 2; - repeated string attributes = 3; - repeated string params = 4; - repeated uint32 dims = 5; - repeated string initializers = 6; - optional string entry = 7; - optional int32 trainer_num = 8; - optional bool sync = 9; - optional uint32 table_num = 10; - optional uint32 table_dim = 11; -} - -message 
TableAccessorSaveParameter { - optional uint32 param = 1; - optional string converter = 2; - optional string deconverter = 3; -} - -message SparseCommonSGDRuleParameter { - optional string name = 1; - optional SparseNaiveSGDRuleParameter naive = 2; - optional SparseAdagradSGDRuleParameter adagrad = 3; - optional SparseAdamSGDParameter adam = 4; -} - -message SparseNaiveSGDRuleParameter { // SparseNaiveSGDRule - optional double learning_rate = 1 [ default = 0.05 ]; - optional double initial_range = 2 [ default = 0.0001 ]; - repeated float weight_bounds = 3; -} - -message - SparseAdagradSGDRuleParameter { // SparseAdaGradSGDRule|StdAdaGradSGDRule - optional double learning_rate = 1 [ default = 0.05 ]; - optional double initial_g2sum = 2 [ default = 3.0 ]; - optional double initial_range = 3 [ default = 0.0001 ]; - repeated float weight_bounds = 4; -} - -message SparseAdamSGDParameter { // SparseAdamSGDRule - optional double learning_rate = 1 [ default = 0.001 ]; - optional double initial_range = 2 [ default = 0.0001 ]; - optional double beta1_decay_rate = 3 [ default = 0.9 ]; - optional double beta2_decay_rate = 4 [ default = 0.999 ]; - optional double ada_epsilon = 5 [ default = 1e-08 ]; - repeated float weight_bounds = 6; -} - -message GraphParameter { - optional int32 task_pool_size = 1 [ default = 24 ]; - optional bool gpups_mode = 2 [ default = false ]; - optional string gpups_graph_sample_class = 3 - [ default = "CompleteGraphSampler" ]; - optional string gpups_graph_sample_args = 4 [ default = "" ]; - optional bool use_cache = 5 [ default = false ]; - optional int32 cache_size_limit = 6 [ default = 100000 ]; - optional int32 cache_ttl = 7 [ default = 5 ]; - optional GraphFeature graph_feature = 8; - optional string table_name = 9 [ default = "" ]; - optional string table_type = 10 [ default = "" ]; - optional int32 shard_num = 11 [ default = 127 ]; - optional int32 gpu_num = 12 [ default = 1 ]; -} - -message GraphFeature { - repeated string name = 1; - repeated string dtype = 2; - repeated int32 shape = 3; -} \ No newline at end of file diff --git a/paddle/fluid/framework/the_one_ps.proto b/paddle/fluid/distributed/the_one_ps.proto old mode 100755 new mode 100644 similarity index 89% rename from paddle/fluid/framework/the_one_ps.proto rename to paddle/fluid/distributed/the_one_ps.proto index 0ae87812bce43..34b11dfd1c5b7 --- a/paddle/fluid/framework/the_one_ps.proto +++ b/paddle/fluid/distributed/the_one_ps.proto @@ -115,6 +115,7 @@ message TableParameter { optional CommonAccessorParameter common = 6; optional TableType type = 7; optional bool compress_in_save = 8 [ default = false ]; + optional GraphParameter graph_parameter = 9; } message TableAccessorParameter { @@ -211,3 +212,25 @@ message SparseAdamSGDParameter { // SparseAdamSGDRule optional double ada_epsilon = 5 [ default = 1e-08 ]; repeated float weight_bounds = 6; } + +message GraphParameter { + optional int32 task_pool_size = 1 [ default = 24 ]; + optional bool gpups_mode = 2 [ default = false ]; + optional string gpups_graph_sample_class = 3 + [ default = "CompleteGraphSampler" ]; + optional string gpups_graph_sample_args = 4 [ default = "" ]; + optional bool use_cache = 5 [ default = false ]; + optional int32 cache_size_limit = 6 [ default = 100000 ]; + optional int32 cache_ttl = 7 [ default = 5 ]; + optional GraphFeature graph_feature = 8; + optional string table_name = 9 [ default = "" ]; + optional string table_type = 10 [ default = "" ]; + optional int32 shard_num = 11 [ default = 127 ]; + optional int32 gpu_num = 12 [ 
default = 1 ]; +} + +message GraphFeature { + repeated string name = 1; + repeated string dtype = 2; + repeated int32 shape = 3; +} diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 1b9943df1b087..ad9f37b98bd3d 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -237,7 +237,6 @@ if(WITH_PYTHON) py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto) py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto) py_proto_compile(pass_desc_py_proto SRCS pass_desc.proto) - py_proto_compile(ps_py_proto SRCS the_one_ps.proto) #Generate an empty \ #__init__.py to make framework_py_proto as a valid python module. add_custom_target(fleet_proto_init ALL @@ -245,13 +244,12 @@ if(WITH_PYTHON) COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py ) add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) - add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init pass_desc_py_proto ps_py_proto) + add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init pass_desc_py_proto ps_py_proto ps_py_proto_init) if (NOT WIN32) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto - COMMAND cp the_one_ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto COMMENT "Copy generated python proto into directory paddle/fluid/proto." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) add_custom_target(fleet_executor_proto_init ALL DEPENDS fleet_proto_init fleet_executor_desc_py_proto @@ -263,7 +261,6 @@ if(WITH_PYTHON) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto COMMAND copy /Y *.py ${proto_dstpath} - COMMAND copy /Y the_one_ps_pb2.py ${fleet_proto_dstpath} COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath} COMMENT "Copy generated python proto into directory paddle/fluid/proto." COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto." 
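The hunks above add the GraphParameter and GraphFeature messages to the_one_ps.proto and move generation of the_one_ps_pb2.py into python/paddle/distributed/fleet/proto. Below is a minimal sketch, not part of the patch, of how the new fields could be filled in from Python; it assumes the generated module is importable from the location the CMake copy rule above targets, and the table and feature names it uses are hypothetical placeholders.

# Hedged sketch: exercises the GraphParameter / GraphFeature messages added in
# this patch. Import path assumes the_one_ps_pb2.py was copied by the CMake
# rule above; "graph_edges" and "feat" are made-up placeholder names.
from google.protobuf import text_format
from paddle.distributed.fleet.proto import the_one_ps_pb2

table = the_one_ps_pb2.TableParameter()
graph = table.graph_parameter              # new field 9 on TableParameter
graph.table_name = "graph_edges"           # hypothetical table name
graph.shard_num = 127                      # same value as the proto default
graph.use_cache = True
graph.cache_size_limit = 100000
graph.graph_feature.name.append("feat")    # hypothetical feature column
graph.graph_feature.dtype.append("float32")
graph.graph_feature.shape.append(64)

# A text_format round-trip is a quick way to confirm the new fields are wired
# up in the generated bindings.
print(text_format.MessageToString(table))

Parsing the printed text back with text_format.Parse over a fresh TableParameter would reproduce the same message, which is a convenient check when hand-writing table configs that use the new graph fields.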
diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index c94a344f74d8d..9b0a033856d73 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -310,7 +310,7 @@ message DistributedStrategy { optional bool asp = 33 [ default = false ]; optional bool fuse_grad_merge = 34 [ default = false ]; optional bool semi_auto = 35 [ default = false ]; - optional bool adam_d2sum = 36 [ default = true ]; + optional bool adam_d2sum = 36 [ default = false ]; optional bool auto_search = 37 [ default = false ]; optional bool heter_ccl_mode = 38 [ default = false ]; diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 199418ab77955..c46b6eeb048a0 100644 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -404,7 +404,7 @@ def trainer_desc_configs(self): def adam_d2sum(self): """ set adam_d2sum - Default value: True + Default value: False Examples: @@ -415,7 +415,7 @@ def adam_d2sum(self): fleet.init(role_maker) strategy = fleet.DistributedStrategy() - strategy.adam_d2sum = True # by default this is True + strategy.adam_d2sum = True # by default this is False # code block for defining loss and local optimizer # sgd = fleet.distributed_optimizer(optimizer, strategy) diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index 1d23567b72abe..5be739785ff44 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -609,7 +609,6 @@ def _set(self, table_proto): check_embedding_dim(table_proto.accessor, self.common.table_name, ctx.program_id(), self.context) - adam_d2sum = self.context["user_defined_strategy"].adam_d2sum self.common.parse_by_optimizer(ctx, self.context) self.common.parse_entry(self.common.table_name, ctx.program_id(), self.context) @@ -641,7 +640,6 @@ def _set(self, table_proto): self.common.table_name = self.context['grad_name_to_param_name'][ ctx.origin_varnames()[0]] - adam_d2sum = self.context["user_defined_strategy"].adam_d2sum self.common.parse_by_optimizer(ctx, self.context) self.common.parse_entry(self.common.table_name, ctx.program_id(), self.context) @@ -673,7 +671,6 @@ def _set(self, table_proto): table_proto.accessor.embedx_dim = 1 self.common.table_name = "MergedDense" - adam_d2sum = self.context["user_defined_strategy"].adam_d2sum self.common.parse_by_optimizer(ctx, self.context) self.common.parse_entry(self.common.table_name, ctx.program_id(), self.context) @@ -922,11 +919,6 @@ def sync_strategy_envs(): kwargs["trainer_id"] = self.role_maker._worker_index() return kwargs - proto_txt = worker_desc - debug = bool(int(os.getenv("PSERVER_DEBUG", "0"))) - if debug: - print("worker: \n{}".format(proto_txt)) - dense_map = get_the_one_recv_context( self.context, split_dense_table=self.is_heter_ps_mode) send_ctx = get_the_one_send_context( @@ -937,6 +929,7 @@ def sync_strategy_envs(): self._send_ctx = send_ctx trainer_config = self.context['trainer'] + proto_txt = worker_desc debug = bool(int(os.getenv("PSERVER_DEBUG", "0"))) if debug: print("worker: \n{}".format(proto_txt)) @@ -1060,6 +1053,10 @@ def _init_server(self, dirname=None, var_names=None, **kwargs): if self.is_heter_ps_mode: trainers += len(self.role_maker._get_heter_worker_endpoints()) + # debug = bool(int(os.getenv("PSERVER_DEBUG", "0"))) + 
# if debug: + # print("server: \n{}".format(server_desc)) + self._server = fluid.core.DistFleetWrapper() self._server.init_server(server_desc, self.string_hosts, role_id, trainers, self._server_sub_program) From 97dec7ca51bd29f913a31f52fd618e2d364dbeed Mon Sep 17 00:00:00 2001 From: levi131 <83750468+levi131@users.noreply.github.com> Date: Wed, 13 Apr 2022 23:54:14 +0800 Subject: [PATCH 07/19] Lml/add prim ops (#41201) * native commit for triple grad of sigmod * Updated unittests files * init functional jacobian api * Updated trible_test func * Updated gradient_checker & test_script * finish test with dtype float32 * add float64 test case * polish code * use atol=1e-5 with dtype float64 * fix for ci * set timeout for test_jacobian * fix dygraph grad to support high differential * polish API docstring * Updated gradient checker and some related files * fix double grad strip error for high differential * fix double grad strip error for high differential * Add Sigmoid triple grad tests * fix dygraph double grad dtype error when calling for high differential senario * Updated triple grad teses func * Use np.random to initialize ddx * Updated triple_grad_check func * add todo for gradient checker and refine some comments * remove additional code * add test for warnging in backward.py * format python code * support multi input in triple gradient checker * Add matmul triple grad kernel * Updated comments of TODO * Supported some special tests * Change code-format to follow CI std * Updated gradient_checker.py * Fix conflicts * Removed unnecessary printing log * Change code style to follow CI std * merge upstream * add_p * rm useless files * add sub_p mul_p div_p * add sqrt_p and tanh_p * add reshape_p * add broadcast_p * add broadcast_p fill_constant_p matmul_p reduce_p reshape_p transpose_p * add split_p and concat_p * add gather_p and scatter_add_p * add slice_select_p and slice_assign_p * add multi input check for add_p, sub_p, mul_p, div_p * update concat_p * refine gather_p and scatter_add_p * refine slice_assign_p and slice_select_p * add 9 test for prim ops * add more test and fix some bug * add more test * register proto * add shape valid check for broadcast_p op, and add keepdim attr into reduce_p op proto * support multi input and multi output for split_p and concat_p * fix slice bug for slice_select_p and slice_assign_p * dtype for axis attr should be long int * update dtype for axis attr int64_t * update for iscan CI * add more shape and dtype check * change IndexTensor into int32 dtype --- paddle/fluid/operators/CMakeLists.txt | 1 + .../fluid/operators/prim_ops/CMakeLists.txt | 28 + paddle/fluid/operators/prim_ops/add_p_op.cc | 116 ++++ .../operators/prim_ops/broadcast_p_op.cc | 110 ++++ .../fluid/operators/prim_ops/concat_p_op.cc | 134 +++++ paddle/fluid/operators/prim_ops/div_p_op.cc | 116 ++++ .../operators/prim_ops/fill_constant_p_op.cc | 81 +++ .../fluid/operators/prim_ops/gather_p_op.cc | 117 ++++ .../fluid/operators/prim_ops/matmul_p_op.cc | 138 +++++ paddle/fluid/operators/prim_ops/mul_p_op.cc | 116 ++++ .../fluid/operators/prim_ops/prim_op_test.cc | 553 ++++++++++++++++++ .../fluid/operators/prim_ops/reduce_p_op.cc | 107 ++++ .../fluid/operators/prim_ops/reshape_p_op.cc | 97 +++ .../operators/prim_ops/scatter_add_p_op.cc | 160 +++++ .../operators/prim_ops/slice_assign_p_op.cc | 152 +++++ .../operators/prim_ops/slice_select_p_op.cc | 115 ++++ paddle/fluid/operators/prim_ops/split_p_op.cc | 119 ++++ paddle/fluid/operators/prim_ops/sqrt_p_op.cc | 80 +++ 
paddle/fluid/operators/prim_ops/sub_p_op.cc | 116 ++++ paddle/fluid/operators/prim_ops/tanh_p_op.cc | 80 +++ .../operators/prim_ops/transpose_p_op.cc | 116 ++++ .../operators/prim_ops/unity_build_rule.cmake | 20 + 22 files changed, 2672 insertions(+) create mode 100644 paddle/fluid/operators/prim_ops/CMakeLists.txt create mode 100644 paddle/fluid/operators/prim_ops/add_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/broadcast_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/concat_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/div_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/fill_constant_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/gather_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/matmul_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/mul_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/prim_op_test.cc create mode 100644 paddle/fluid/operators/prim_ops/reduce_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/reshape_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/scatter_add_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/slice_assign_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/slice_select_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/split_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/sqrt_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/sub_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/tanh_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/transpose_p_op.cc create mode 100644 paddle/fluid/operators/prim_ops/unity_build_rule.cmake diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 68eaf1a0ed469..63bf3ab6a0382 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -22,6 +22,7 @@ add_subdirectory(reduce_ops) add_subdirectory(sequence_ops) add_subdirectory(string) add_subdirectory(jit) +add_subdirectory(prim_ops) if(WITH_MKLDNN) add_subdirectory(mkldnn) endif() diff --git a/paddle/fluid/operators/prim_ops/CMakeLists.txt b/paddle/fluid/operators/prim_ops/CMakeLists.txt new file mode 100644 index 0000000000000..a58ee6dc1f7ba --- /dev/null +++ b/paddle/fluid/operators/prim_ops/CMakeLists.txt @@ -0,0 +1,28 @@ +include(operators) +if(WITH_UNITY_BUILD) + # Load Unity Build rules for operators in paddle/fluid/operators/prim_ops. + include(unity_build_rule.cmake) +endif() +register_operators() + +SET(PRIM_OP_SRCS + reshape_p_op.cc + broadcast_p_op.cc + reduce_p_op.cc + transpose_p_op.cc + split_p_op.cc + concat_p_op.cc + slice_select_p_op.cc + slice_assign_p_op.cc + gather_p_op.cc + scatter_add_p_op.cc + add_p_op.cc + sub_p_op.cc + mul_p_op.cc + div_p_op.cc + sqrt_p_op.cc + tanh_p_op.cc + matmul_p_op.cc + fill_constant_p_op.cc) + +cc_test(prim_op_test SRCS prim_op_test.cc ${PRIM_OP_SRCS} DEPS op_registry) diff --git a/paddle/fluid/operators/prim_ops/add_p_op.cc b/paddle/fluid/operators/prim_ops/add_p_op.cc new file mode 100644 index 0000000000000..4789ed8958f91 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/add_p_op.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class AddPrimOp : public framework::OperatorBase { + public: + AddPrimOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator add_p should not be excuted directly")); + } +}; + +class AddPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of add_p op."); + AddInput("Y", "(Tensor), The input tensor of add_p op."); + AddOutput("Z", "(Tensor), The output tensor of add_p op."); + AddComment(R"DOC( +Autograd primitive add_p operator. +)DOC"); + } +}; + +class AddPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; + framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; + + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + framework::VarDesc *y_var = BOOST_GET(framework::VarDesc *, y_var_ptr); + auto x_shape = x_var->GetShape(); + auto y_shape = y_var->GetShape(); + size_t x_rank = x_shape.size(); + size_t y_rank = y_shape.size(); + PADDLE_ENFORCE_EQ(x_rank, y_rank, + platform::errors::InvalidArgument( + "The dimensions of two input tensor should be same, " + "but get %d and %d", + x_rank, y_rank)); + for (size_t i = 0; i < x_rank; ++i) { + PADDLE_ENFORCE_EQ( + x_shape[i], y_shape[i], + platform::errors::InvalidArgument( + "The shape of two input tensor at dimension %d should be same, " + "but get %d and %d", + i, x_shape[i], y_shape[i])); + } + + BOOST_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); + } +}; + +class AddPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Input(ctx, "Y")[0]; + auto z_name = Output(ctx, "Z")[0]; + auto x_type = GetType(ctx, x_name); + auto y_type = GetType(ctx, y_name); + auto x_dtype = GetDataType(ctx, x_name); + auto y_dtype = GetDataType(ctx, y_name); + PADDLE_ENFORCE_EQ(x_type, y_type, + platform::errors::InvalidArgument( + "The type of two input tensor should be same, " + "but get %d and %d", + x_type, y_type)); + PADDLE_ENFORCE_EQ(x_dtype, y_dtype, + platform::errors::InvalidArgument( + "The datatype of two input tensor should be same, " + "but get %d and %d", + x_dtype, y_dtype)); + + SetType(ctx, z_name, x_type); + SetDataType(ctx, 
z_name, x_dtype); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(add_p, paddle::operators::AddPrimOp, + paddle::operators::AddPrimOpMaker, + paddle::operators::AddPrimOpShapeInference, + paddle::operators::AddPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/broadcast_p_op.cc b/paddle/fluid/operators/prim_ops/broadcast_p_op.cc new file mode 100644 index 0000000000000..5459b73911473 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/broadcast_p_op.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class BroadcastPrimOp : public framework::OperatorBase { + public: + BroadcastPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator broadcast_p should not be excuted directly")); + } +}; + +class BroadcastPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of broadcast_p op."); + AddOutput("Y", "(Tensor), The output tensor of broadcast_p op."); + AddAttr>( + "shape", + "(std::vector) Target shape of broadcast_p operator."); + AddComment(R"DOC( +Autograd primitive broadcast_p operator. 
+)DOC"); + } +}; + +static void CheckShapeValid(const std::vector &x_shape, + const std::vector &target_shape) { + size_t x_rank = x_shape.size(); + size_t target_rank = target_shape.size(); + PADDLE_ENFORCE_GE(target_rank, x_rank, + platform::errors::InvalidArgument( + "The rank of target shape should be greater than or " + "equal to input tensor's dimensions, " + "but received %d and %d", + target_rank, x_rank)); + std::vector::const_iterator it = target_shape.begin(); + for (size_t i = 0; i < x_rank; i++, it++) { + if (x_shape[i] != 1) { + it = std::find(it, target_shape.end(), x_shape[i]); + } + PADDLE_ENFORCE_EQ( + it != target_shape.end(), true, + platform::errors::InvalidArgument( + "Invalid shape, can not broadcast input tensor into target shape," + "the first dismatching shape %d is shape of input tensor at " + "dimension %d", + x_shape[i], i)); + } +} + +class BroadcastPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + auto x_shape = x_var->GetShape(); + auto target_shape = ctx->Attrs().Get>("shape"); + CheckShapeValid(x_shape, target_shape); + BOOST_GET(framework::VarDesc *, y_var_ptr)->SetShape(target_shape); + } +}; + +class BroadcastPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Output(ctx, "Y")[0]; + SetType(ctx, y_name, GetType(ctx, x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(broadcast_p, paddle::operators::BroadcastPrimOp, + paddle::operators::BroadcastPrimOpMaker, + paddle::operators::BroadcastPrimOpShapeInference, + paddle::operators::BroadcastPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/concat_p_op.cc b/paddle/fluid/operators/prim_ops/concat_p_op.cc new file mode 100644 index 0000000000000..24516356a2836 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/concat_p_op.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class ConcatPrimOp : public framework::OperatorBase { + public: + ConcatPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator concat_p should not be excuted directly")); + } +}; + +class ConcatPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("XS", "(Tensor), The input tensors of concat_p op.") + .AsDuplicable(); + AddOutput("Y", "(Tensor), The output tensor of concat_p op."); + AddAttr("axis", "(int64_t), The axis along which to concat."); + AddComment(R"DOC( +Autograd primitive concat_p operator. +)DOC"); + } +}; + +class ConcatPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + auto x_var_ptrs = ctx->GetInputVarPtrs("XS"); + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + auto axis = ctx->Attrs().Get("axis"); + int64_t cnt_along_axis = 0; + framework::VarDesc *first_x_var = + BOOST_GET(framework::VarDesc *, x_var_ptrs[0]); + auto first_x_shape = first_x_var->GetShape(); + cnt_along_axis += first_x_shape[axis]; + size_t first_x_rank = first_x_shape.size(); + for (size_t i = 1; i < x_var_ptrs.size(); ++i) { + framework::VarDesc *x_var = + BOOST_GET(framework::VarDesc *, x_var_ptrs[i]); + auto x_shape = x_var->GetShape(); + cnt_along_axis += x_shape[axis]; + size_t x_rank = x_shape.size(); + PADDLE_ENFORCE_EQ( + x_rank, first_x_rank, + platform::errors::InvalidArgument("The dimensions of %d input tensor " + "should be same as the dimensions " + "of 1st input tensor's, " + "but get %d and %d", + i + 1, x_rank, first_x_rank)); + for (size_t j = 0; j < x_rank; ++j) { + if (j != size_t(axis)) { + PADDLE_ENFORCE_EQ(x_shape[j], first_x_shape[j], + platform::errors::InvalidArgument( + "The shape of %d input tensor at dimension %d " + "should be same as the 1st input tensor's, " + "but get %d and %d", + i + 1, j, x_shape[j], first_x_shape[j])); + } + } + } + + std::vector y_shape(first_x_shape); + y_shape[axis] = cnt_along_axis; + BOOST_GET(framework::VarDesc *, y_var_ptr)->SetShape(y_shape); + } +}; + +class ConcatPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_names = Input(ctx, "XS"); + auto y_name = Output(ctx, "Y")[0]; + auto first_x_name = x_names[0]; + auto first_x_type = GetType(ctx, first_x_name); + auto first_x_dtype = GetDataType(ctx, first_x_name); + for (size_t i = 1; i < x_names.size(); ++i) { + auto x_name = x_names[i]; + auto x_type = GetType(ctx, x_name); + auto x_dtype = GetDataType(ctx, x_name); + PADDLE_ENFORCE_EQ(x_type, first_x_type, + platform::errors::InvalidArgument( + "The type of %d input tensor should be same as the " + "first input tensor's, " + "but get %d and %d", + i + 1, x_type, first_x_type)); + PADDLE_ENFORCE_EQ(x_dtype, first_x_dtype, + platform::errors::InvalidArgument( + "The 
datatype of %d input tensor should be same as " + "the first input tensor's, " + "but get %d and %d", + i + 1, x_dtype, first_x_dtype)); + } + SetType(ctx, y_name, GetType(ctx, first_x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, first_x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(concat_p, paddle::operators::ConcatPrimOp, + paddle::operators::ConcatPrimOpMaker, + paddle::operators::ConcatPrimOpShapeInference, + paddle::operators::ConcatPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/div_p_op.cc b/paddle/fluid/operators/prim_ops/div_p_op.cc new file mode 100644 index 0000000000000..35ae1f69cd2c8 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/div_p_op.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class DivPrimOp : public framework::OperatorBase { + public: + DivPrimOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator div_p should not be excuted directly")); + } +}; + +class DivPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of div_p op."); + AddInput("Y", "(Tensor), The input tensor of div_p op."); + AddOutput("Z", "(Tensor), The output tensor of div_p op."); + AddComment(R"DOC( +Autograd primitive div_p operator. 
+)DOC"); + } +}; + +class DivPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; + framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; + + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + framework::VarDesc *y_var = BOOST_GET(framework::VarDesc *, y_var_ptr); + auto x_shape = x_var->GetShape(); + auto y_shape = y_var->GetShape(); + size_t x_rank = x_shape.size(); + size_t y_rank = y_shape.size(); + PADDLE_ENFORCE_EQ(x_rank, y_rank, + platform::errors::InvalidArgument( + "The dimensions of two input tensor should be same, " + "but get %d and %d", + x_rank, y_rank)); + for (size_t i = 0; i < x_rank; ++i) { + PADDLE_ENFORCE_EQ( + x_shape[i], y_shape[i], + platform::errors::InvalidArgument( + "The shape of two input tensor at dimension %d should be same, " + "but get %d and %d", + i, x_shape[i], y_shape[i])); + } + + BOOST_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); + } +}; + +class DivPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Input(ctx, "Y")[0]; + auto z_name = Output(ctx, "Z")[0]; + auto x_type = GetType(ctx, x_name); + auto y_type = GetType(ctx, y_name); + auto x_dtype = GetDataType(ctx, x_name); + auto y_dtype = GetDataType(ctx, y_name); + PADDLE_ENFORCE_EQ(x_type, y_type, + platform::errors::InvalidArgument( + "The type of two input tensor should be same, " + "but get %d and %d", + x_type, y_type)); + PADDLE_ENFORCE_EQ(x_dtype, y_dtype, + platform::errors::InvalidArgument( + "The datatype of two input tensor should be same, " + "but get %d and %d", + x_dtype, y_dtype)); + + SetType(ctx, z_name, x_type); + SetDataType(ctx, z_name, x_dtype); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(div_p, paddle::operators::DivPrimOp, + paddle::operators::DivPrimOpMaker, + paddle::operators::DivPrimOpShapeInference, + paddle::operators::DivPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/fill_constant_p_op.cc b/paddle/fluid/operators/prim_ops/fill_constant_p_op.cc new file mode 100644 index 0000000000000..9831599e46ccc --- /dev/null +++ b/paddle/fluid/operators/prim_ops/fill_constant_p_op.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class FillConstantPrimOp : public framework::OperatorBase { + public: + FillConstantPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator fill_constant_p should not be excuted directly")); + } +}; + +class FillConstantPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("Y", "(Tensor), The output tensor of fill_constant_p op."); + AddAttr("value", "(float) The value of output tensor."); + AddAttr>( + "shape", "(std::vector) The shape of output tensor."); + AddAttr("dtype", "(int) The dtype of output tensor."); + AddComment(R"DOC( +Autograd primitive fill_constant_p operator. +)DOC"); + } +}; + +class FillConstantPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + auto shape = ctx->Attrs().Get>("shape"); + BOOST_GET(framework::VarDesc *, y_var_ptr)->SetShape(shape); + } +}; + +class FillConstantPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto y_name = Output(ctx, "Y")[0]; + auto data_type = static_cast( + BOOST_GET_CONST(int, ctx->GetAttr("dtype"))); + SetDataType(ctx, y_name, data_type); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(fill_constant_p, paddle::operators::FillConstantPrimOp, + paddle::operators::FillConstantPrimOpMaker, + paddle::operators::FillConstantPrimOpShapeInference, + paddle::operators::FillConstantPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/gather_p_op.cc b/paddle/fluid/operators/prim_ops/gather_p_op.cc new file mode 100644 index 0000000000000..be777de055803 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/gather_p_op.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
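+//
+// gather_p selects entries of X along `axis`. The indices come either from
+// the dispensable 1-D int32 `IndexTensor` input or from the `index`
+// attribute; shape inference replaces dimension `axis` of X with the number
+// of indices.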
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class GatherPrimOp : public framework::OperatorBase { + public: + GatherPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator gather_p should not be excuted directly")); + } +}; + +class GatherPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of gather_p op."); + AddInput("IndexTensor", + "(Tensor), The index tensor of gather_p op, which is a 1D tensor.") + .AsDispensable(); + AddOutput("Y", "(Tensor), The output tensor of gather_p op."); + AddAttr("axis", "(int64_t), The axis along which to gather."); + AddAttr>( + "index", "(std::vector) The index of gather_p op") + .SetDefault({0}); + AddComment(R"DOC( +Autograd primitive gather_p operator. +)DOC"); + } +}; + +class GatherPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + int64_t num_index = 0; + if (ctx->HasInput("IndexTensor")) { + framework::InferShapeVarPtr index_var_ptr = + ctx->GetInputVarPtrs("IndexTensor")[0]; + framework::VarDesc *index_var = + BOOST_GET(framework::VarDesc *, index_var_ptr); + auto index_shape = index_var->GetShape(); + PADDLE_ENFORCE_EQ(index_shape.size(), 1, + platform::errors::InvalidArgument( + "The index tensor should be a 1D tensor," + "but get rank %d", + index_shape.size())); + num_index = index_shape[0]; + } else { + num_index = ctx->Attrs().Get>("index").size(); + } + auto axis = ctx->Attrs().Get("axis"); + + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + auto x_shape = x_var->GetShape(); + x_shape[axis] = num_index; + + BOOST_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_shape); + } +}; + +class GatherPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Output(ctx, "Y")[0]; + if (ctx->HasInput("IndexTensor")) { + auto index_name = Input(ctx, "IndexTensor")[0]; + auto index_dtype = GetDataType(ctx, index_name); + PADDLE_ENFORCE_EQ( + index_dtype, framework::proto::VarType_Type_INT32, + platform::errors::InvalidArgument( + "The datatype of input tensor should be VarType_Type_INT32(%d), " + "but get %d", + framework::proto::VarType_Type_INT32, index_dtype)); + } + SetType(ctx, y_name, GetType(ctx, x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(gather_p, paddle::operators::GatherPrimOp, + paddle::operators::GatherPrimOpMaker, + paddle::operators::GatherPrimOpShapeInference, + paddle::operators::GatherPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/matmul_p_op.cc 
b/paddle/fluid/operators/prim_ops/matmul_p_op.cc new file mode 100644 index 0000000000000..1a28e1ca5c427 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/matmul_p_op.cc @@ -0,0 +1,138 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class MatmulPrimOp : public framework::OperatorBase { + public: + MatmulPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator matmul_p should not be excuted directly")); + } +}; + +class MatmulPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of matmul_p op."); + AddInput("Y", "(Tensor), The input tensor of matmul_p op."); + AddOutput("Z", "(Tensor), The output tensor of matmul_p op."); + AddComment(R"DOC( +Autograd primitive matmul_p operator. +)DOC"); + } +}; + +class MatmulPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; + framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; + + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + framework::VarDesc *y_var = BOOST_GET(framework::VarDesc *, y_var_ptr); + auto x_shape = x_var->GetShape(); + auto y_shape = y_var->GetShape(); + size_t x_rank = x_shape.size(); + size_t y_rank = y_shape.size(); + PADDLE_ENFORCE_EQ(x_rank, y_rank, + platform::errors::InvalidArgument( + "The two input tensor's dimension should be equal" + "But received first input tensor's dimension is %d, " + "and another input tensor's dimension is %d", + x_rank, y_rank)); + + PADDLE_ENFORCE_EQ(x_rank == 2 || x_rank == 3, true, + platform::errors::InvalidArgument( + "The input tensor's dimension should be 2 or 3" + "But received input tensor's dimension is %d", + x_rank)); + + PADDLE_ENFORCE_EQ( + x_shape[x_rank - 1], y_shape[y_rank - 2], + platform::errors::InvalidArgument( + "Invalid shape for matmul, the last dimension of first input and " + "the penultimate dimension for the second input should be same." 
+ "But received %d and %d.", + x_shape[x_rank - 1], y_shape[y_rank - 2])); + if (x_rank == 2) { + std::vector z_shape{x_shape[x_rank - 2], y_shape[y_rank - 1]}; + BOOST_GET(framework::VarDesc *, z_var_ptr)->SetShape(z_shape); + } else { + PADDLE_ENFORCE_EQ(x_shape[0], y_shape[0], + platform::errors::InvalidArgument( + "Invalid shape for matmul when input tensor's " + "dimension is 3, the first dimension of first " + "input and the second input should be same." + "But received %d and %d.", + x_shape[0], y_shape[0])); + + std::vector z_shape{x_shape[0], x_shape[x_rank - 2], + y_shape[y_rank - 1]}; + BOOST_GET(framework::VarDesc *, z_var_ptr)->SetShape(z_shape); + } + } +}; + +class MatmulPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Input(ctx, "Y")[0]; + auto z_name = Output(ctx, "Z")[0]; + auto x_type = GetType(ctx, x_name); + auto y_type = GetType(ctx, y_name); + auto x_dtype = GetDataType(ctx, x_name); + auto y_dtype = GetDataType(ctx, y_name); + PADDLE_ENFORCE_EQ(x_type, y_type, + platform::errors::InvalidArgument( + "The type of two input tensor should be same, " + "but get %d and %d", + x_type, y_type)); + PADDLE_ENFORCE_EQ(x_dtype, y_dtype, + platform::errors::InvalidArgument( + "The datatype of two input tensor should be same, " + "but get %d and %d", + x_dtype, y_dtype)); + + SetType(ctx, z_name, x_type); + SetDataType(ctx, z_name, x_dtype); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(matmul_p, paddle::operators::MatmulPrimOp, + paddle::operators::MatmulPrimOpMaker, + paddle::operators::MatmulPrimOpShapeInference, + paddle::operators::MatmulPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/mul_p_op.cc b/paddle/fluid/operators/prim_ops/mul_p_op.cc new file mode 100644 index 0000000000000..a60e2601a339b --- /dev/null +++ b/paddle/fluid/operators/prim_ops/mul_p_op.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class MulPrimOp : public framework::OperatorBase { + public: + MulPrimOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator mul_p should not be excuted directly")); + } +}; + +class MulPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of mul_p op."); + AddInput("Y", "(Tensor), The input tensor of mul_p op."); + AddOutput("Z", "(Tensor), The output tensor of mul_p op."); + AddComment(R"DOC( +Autograd primitive mul_p operator. +)DOC"); + } +}; + +class MulPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; + framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; + + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + framework::VarDesc *y_var = BOOST_GET(framework::VarDesc *, y_var_ptr); + auto x_shape = x_var->GetShape(); + auto y_shape = y_var->GetShape(); + size_t x_rank = x_shape.size(); + size_t y_rank = y_shape.size(); + PADDLE_ENFORCE_EQ(x_rank, y_rank, + platform::errors::InvalidArgument( + "The dimensions of two input tensor should be same, " + "but get %d and %d", + x_rank, y_rank)); + for (size_t i = 0; i < x_rank; ++i) { + PADDLE_ENFORCE_EQ( + x_shape[i], y_shape[i], + platform::errors::InvalidArgument( + "The shape of two input tensor at dimension %d should be same, " + "but get %d and %d", + i, x_shape[i], y_shape[i])); + } + + BOOST_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); + } +}; + +class MulPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Input(ctx, "Y")[0]; + auto z_name = Output(ctx, "Z")[0]; + auto x_type = GetType(ctx, x_name); + auto y_type = GetType(ctx, y_name); + auto x_dtype = GetDataType(ctx, x_name); + auto y_dtype = GetDataType(ctx, y_name); + PADDLE_ENFORCE_EQ(x_type, y_type, + platform::errors::InvalidArgument( + "The type of two input tensor should be same, " + "but get %d and %d", + x_type, y_type)); + PADDLE_ENFORCE_EQ(x_dtype, y_dtype, + platform::errors::InvalidArgument( + "The datatype of two input tensor should be same, " + "but get %d and %d", + x_dtype, y_dtype)); + + SetType(ctx, z_name, x_type); + SetDataType(ctx, z_name, x_dtype); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(mul_p, paddle::operators::MulPrimOp, + paddle::operators::MulPrimOpMaker, + paddle::operators::MulPrimOpShapeInference, + paddle::operators::MulPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/prim_op_test.cc b/paddle/fluid/operators/prim_ops/prim_op_test.cc new file mode 100644 index 
0000000000000..2d65149d130bb --- /dev/null +++ b/paddle/fluid/operators/prim_ops/prim_op_test.cc @@ -0,0 +1,553 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" + +USE_OP_ITSELF(reshape_p); +USE_OP_ITSELF(broadcast_p); +USE_OP_ITSELF(reduce_p); +USE_OP_ITSELF(transpose_p); +USE_OP_ITSELF(split_p); +USE_OP_ITSELF(concat_p); +USE_OP_ITSELF(slice_select_p); +USE_OP_ITSELF(slice_assign_p); +USE_OP_ITSELF(gather_p); +USE_OP_ITSELF(scatter_add_p); +USE_OP_ITSELF(add_p); +USE_OP_ITSELF(sub_p); +USE_OP_ITSELF(mul_p); +USE_OP_ITSELF(div_p); +USE_OP_ITSELF(sqrt_p); +USE_OP_ITSELF(tanh_p); +USE_OP_ITSELF(matmul_p); +USE_OP_ITSELF(fill_constant_p); + +namespace paddle { +namespace framework { + +static void NewVar(BlockDesc *block, const std::string &name, + const std::vector &shape) { + auto *var_desc = block->Var(name); + if (shape.size() > 0) { + var_desc->SetShape(shape); + var_desc->SetType(proto::VarType::LOD_TENSOR); + var_desc->SetDataType(proto::VarType_Type_FP32); + } +} + +static void AppendOp(BlockDesc *block, const std::string &type, + VariableNameMap inputs, VariableNameMap outputs, + AttributeMap attrs) { + auto &op_info = OpInfoMap::Instance().Get(type); + if (op_info.Checker()) { + op_info.Checker()->Check(&attrs); + } + + auto *op = block->AppendOp(); + op->SetType(type); + for (auto &pair : inputs) { + op->SetInput(pair.first, pair.second); + } + + for (auto &pair : outputs) { + op->SetOutput(pair.first, pair.second); + for (auto &var_name : pair.second) { + if (!block->FindVarRecursive(var_name)) { + NewVar(block, var_name, {}); + } + } + } + + op->SetAttrMap(attrs); + op->InferVarType(block); + op->InferShape(*block); +} + +TEST(PrimOp, reshape_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + + NewVar(block, x0, shape); + AppendOp(block, "reshape_p", {{"X", {x0}}}, {{"Y", {x1}}}, + {{"shape", std::vector{12, 5}}}); + ASSERT_EQ(block->Var("x1")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x1")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x1")->GetShape(); + ASSERT_EQ(shapes.size(), 2UL); + ASSERT_EQ(shapes[0], 12L); + ASSERT_EQ(shapes[1], 5L); +} + +TEST(PrimOp, broadcast_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 1}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + + NewVar(block, x0, shape); + AppendOp(block, "broadcast_p", {{"X", {x0}}}, {{"Y", {x1}}}, + {{"shape", std::vector{3, 4, 5}}}); + ASSERT_EQ(block->Var("x1")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x1")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x1")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + 
ASSERT_EQ(shapes[2], 5L); +} + +TEST(PrimOp, reduce_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + + NewVar(block, x0, shape); + AppendOp(block, "reduce_p", {{"X", {x0}}}, {{"Y", {x1}}}, + {{"axis", std::vector{0, 2}}, {"keepdim", false}}); + ASSERT_EQ(block->Var("x1")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x1")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x1")->GetShape(); + ASSERT_EQ(shapes.size(), 1UL); + ASSERT_EQ(shapes[0], 4L); + AppendOp(block, "reduce_p", {{"X", {x0}}}, {{"Y", {x2}}}, + {{"axis", std::vector{0, 2}}, {"keepdim", true}}); + ASSERT_EQ(block->Var("x2")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x2")->GetDataType(), proto::VarType_Type_FP32); + shapes = block->Var("x2")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 1L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 1L); +} + +TEST(PrimOp, transpose_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + + NewVar(block, x0, shape); + AppendOp(block, "transpose_p", {{"X", {x0}}}, {{"Y", {x1}}}, + {{"axis", std::vector{2, 1, 0}}}); + ASSERT_EQ(block->Var("x1")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x1")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x1")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 5L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 3L); +} + +TEST(PrimOp, split_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{6, 8, 10}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + std::string x3 = "x3"; + + NewVar(block, x0, shape); + AppendOp(block, "split_p", {{"X", {x0}}}, {{"YS", {x1, x2, x3}}}, + {{"axis", int64_t{1}}, + {"num_or_sections", std::vector{2, 4, 2}}}); + ASSERT_EQ(block->Var("x1")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x1")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x1")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 6L); + ASSERT_EQ(shapes[1], 2L); + ASSERT_EQ(shapes[2], 10L); + ASSERT_EQ(block->Var("x2")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x2")->GetDataType(), proto::VarType_Type_FP32); + shapes = block->Var("x2")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 6L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 10L); + ASSERT_EQ(block->Var("x3")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x3")->GetDataType(), proto::VarType_Type_FP32); + shapes = block->Var("x3")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 6L); + ASSERT_EQ(shapes[1], 2L); + ASSERT_EQ(shapes[2], 10L); + std::string x4 = "x4"; + std::string x5 = "x5"; + AppendOp( + block, "split_p", {{"X", {x0}}}, {{"YS", {x4, x5}}}, + {{"axis", int64_t{2}}, {"num_or_sections", std::vector{2}}}); + ASSERT_EQ(block->Var("x4")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x4")->GetDataType(), proto::VarType_Type_FP32); + shapes = block->Var("x4")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 6L); + ASSERT_EQ(shapes[1], 8L); + ASSERT_EQ(shapes[2], 5L); + ASSERT_EQ(block->Var("x5")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x5")->GetDataType(), 
proto::VarType_Type_FP32); + shapes = block->Var("x5")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 6L); + ASSERT_EQ(shapes[1], 8L); + ASSERT_EQ(shapes[2], 5L); +} + +TEST(PrimOp, concat_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape_0{3, 1, 5}; + std::vector shape_1{3, 4, 5}; + std::vector shape_2{3, 6, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + std::string x3 = "x3"; + + NewVar(block, x0, shape_0); + NewVar(block, x1, shape_1); + NewVar(block, x2, shape_2); + AppendOp(block, "concat_p", {{"XS", {x0, x1, x2}}}, {{"Y", {x3}}}, + {{"axis", int64_t{1}}}); + ASSERT_EQ(block->Var("x3")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x3")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x3")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 11L); + ASSERT_EQ(shapes[2], 5L); +} + +TEST(PrimOp, slice_select_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{6, 8, 10}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + + NewVar(block, x0, shape); + AppendOp(block, "slice_select_p", {{"X", {x0}}}, {{"Y", {x1}}}, + {{"axis", std::vector{0, 1, 2}}, + {"starts", std::vector{0, 0, 0}}, + {"ends", std::vector{5, 7, 9}}, + {"strides", std::vector{2, 2, 2}}}); + ASSERT_EQ(block->Var("x1")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x1")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x1")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 5L); +} + +TEST(PrimOp, slice_assign_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape_0{6, 8, 10}; + std::vector shape_1{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + + NewVar(block, x0, shape_0); + NewVar(block, x1, shape_1); + AppendOp(block, "slice_assign_p", {{"X", {x0}}, {"Y", {x1}}}, {{"Z", {x2}}}, + {{"axis", std::vector{0, 1, 2}}, + {"starts", std::vector{0, 0, 0}}, + {"ends", std::vector{5, 7, 9}}, + {"strides", std::vector{2, 2, 2}}}); + ASSERT_EQ(block->Var("x2")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x2")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x2")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 6L); + ASSERT_EQ(shapes[1], 8L); + ASSERT_EQ(shapes[2], 10L); +} + +TEST(PrimOp, gather_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{6, 8, 10}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + + NewVar(block, x0, shape); + AppendOp(block, "gather_p", {{"X", {x0}}}, {{"Y", {x1}}}, + {{"axis", int64_t{1}}, {"index", std::vector{0, 2, 5}}}); + ASSERT_EQ(block->Var("x1")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x1")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x1")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 6L); + ASSERT_EQ(shapes[1], 3L); + ASSERT_EQ(shapes[2], 10L); + std::string index_t = "index_t"; + std::string x2 = "x2"; + + auto *var_desc = block->Var(index_t); + var_desc->SetShape(std::vector{3}); + var_desc->SetType(proto::VarType::LOD_TENSOR); + var_desc->SetDataType(proto::VarType_Type_INT32); + AppendOp(block, "gather_p", {{"X", {x0}}, {"IndexTensor", {index_t}}}, + {{"Y", {x2}}}, {{"axis", int64_t{1}}}); + 
ASSERT_EQ(block->Var("x2")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x2")->GetDataType(), proto::VarType_Type_FP32); + shapes = block->Var("x2")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 6L); + ASSERT_EQ(shapes[1], 3L); + ASSERT_EQ(shapes[2], 10L); +} + +TEST(PrimOp, scatter_add_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape_0{6, 8, 10}; + std::vector shape_1{6, 3, 10}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + + NewVar(block, x0, shape_0); + NewVar(block, x1, shape_1); + AppendOp(block, "scatter_add_p", {{"X", {x0}}, {"Y", {x1}}}, {{"Z", {x2}}}, + {{"axis", int64_t{1}}, {"index", std::vector{0, 2, 5}}}); + ASSERT_EQ(block->Var("x2")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x2")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x2")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 6L); + ASSERT_EQ(shapes[1], 8L); + ASSERT_EQ(shapes[2], 10L); + std::string index_t = "index_t"; + std::string x3 = "x3"; + + auto *var_desc = block->Var(index_t); + var_desc->SetShape(std::vector{3}); + var_desc->SetType(proto::VarType::LOD_TENSOR); + var_desc->SetDataType(proto::VarType_Type_INT32); + AppendOp(block, "scatter_add_p", + {{"X", {x0}}, {"Y", {x1}}, {"IndexTensor", {index_t}}}, + {{"Z", {x3}}}, {{"axis", int64_t{1}}}); + ASSERT_EQ(block->Var("x3")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x3")->GetDataType(), proto::VarType_Type_FP32); + shapes = block->Var("x3")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 6L); + ASSERT_EQ(shapes[1], 8L); + ASSERT_EQ(shapes[2], 10L); +} + +TEST(PrimOp, add_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + + NewVar(block, x0, shape); + NewVar(block, x1, shape); + AppendOp(block, "add_p", {{"X", {x0}}, {"Y", {x1}}}, {{"Z", {x2}}}, {}); + ASSERT_EQ(block->Var("x2")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x2")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x2")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 5L); +} + +TEST(PrimOp, sub_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + + NewVar(block, x0, shape); + NewVar(block, x1, shape); + AppendOp(block, "sub_p", {{"X", {x0}}, {"Y", {x1}}}, {{"Z", {x2}}}, {}); + ASSERT_EQ(block->Var("x2")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x2")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x2")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 5L); +} + +TEST(PrimOp, mul_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + + NewVar(block, x0, shape); + NewVar(block, x1, shape); + AppendOp(block, "mul_p", {{"X", {x0}}, {"Y", {x1}}}, {{"Z", {x2}}}, {}); + ASSERT_EQ(block->Var("x2")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x2")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x2")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + 
ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 5L); +} + +TEST(PrimOp, div_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + + NewVar(block, x0, shape); + NewVar(block, x1, shape); + AppendOp(block, "div_p", {{"X", {x0}}, {"Y", {x1}}}, {{"Z", {x2}}}, {}); + ASSERT_EQ(block->Var("x2")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x2")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x2")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 5L); +} + +TEST(PrimOp, sqrt_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + + NewVar(block, x0, shape); + AppendOp(block, "sqrt_p", {{"X", {x0}}}, {{"Y", {x1}}}, {}); + ASSERT_EQ(block->Var("x1")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x1")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x1")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 5L); +} + +TEST(PrimOp, tanh_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape{3, 4, 5}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + + NewVar(block, x0, shape); + AppendOp(block, "tanh_p", {{"X", {x0}}}, {{"Y", {x1}}}, {}); + ASSERT_EQ(block->Var("x1")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x1")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x1")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 5L); +} + +TEST(PrimOp, matmul_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::vector shape_0{3, 4, 5}; + std::vector shape_1{3, 5, 8}; + + std::string x0 = "x0"; + std::string x1 = "x1"; + std::string x2 = "x2"; + + NewVar(block, x0, shape_0); + NewVar(block, x1, shape_1); + AppendOp(block, "matmul_p", {{"X", {x0}}, {"Y", {x1}}}, {{"Z", {x2}}}, {}); + ASSERT_EQ(block->Var("x2")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x2")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = block->Var("x2")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 8L); + std::vector shape_2{4, 5}; + std::vector shape_3{5, 8}; + + std::string x3 = "x3"; + std::string x4 = "x4"; + std::string x5 = "x5"; + + NewVar(block, x3, shape_2); + NewVar(block, x4, shape_3); + AppendOp(block, "matmul_p", {{"X", {x3}}, {"Y", {x4}}}, {{"Z", {x5}}}, {}); + ASSERT_EQ(block->Var("x5")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x5")->GetDataType(), proto::VarType_Type_FP32); + shapes = block->Var("x5")->GetShape(); + ASSERT_EQ(shapes.size(), 2UL); + ASSERT_EQ(shapes[0], 4L); + ASSERT_EQ(shapes[1], 8L); +} + +TEST(PrimOp, fill_constant_p) { + ProgramDesc program; + auto *block = program.MutableBlock(0); + std::string x0 = "x0"; + + AppendOp(block, "fill_constant_p", {{}}, {{"Y", {x0}}}, + {{"value", 0.0f}, + {"dtype", proto::VarType_Type_FP32}, + {"shape", std::vector{3, 4, 5}}}); + ASSERT_EQ(block->Var("x0")->GetType(), proto::VarType::LOD_TENSOR); + ASSERT_EQ(block->Var("x0")->GetDataType(), proto::VarType_Type_FP32); + auto shapes = 
block->Var("x0")->GetShape(); + ASSERT_EQ(shapes.size(), 3UL); + ASSERT_EQ(shapes[0], 3L); + ASSERT_EQ(shapes[1], 4L); + ASSERT_EQ(shapes[2], 5L); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/operators/prim_ops/reduce_p_op.cc b/paddle/fluid/operators/prim_ops/reduce_p_op.cc new file mode 100644 index 0000000000000..9f2b5f3ed2c43 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/reduce_p_op.cc @@ -0,0 +1,107 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class ReducePrimOp : public framework::OperatorBase { + public: + ReducePrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator reduce_p should not be excuted directly")); + } +}; + +class ReducePrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of reduce_p op."); + AddOutput("Y", "(Tensor), The output tensor of reduce_p op."); + AddAttr>( + "axis", + "(std::vector) The axis along which to reduce on. Must be in " + "range [-rank(input), rank(input)]. If `axis[i] < 0`, the axis[i] to " + "reduce is `rank + axis[i]`."); + AddAttr("keepdim", + "(bool, default false) " + "If true, retain the reduced axis with length 1.") + .SetDefault(false); + AddComment(R"DOC( +Autograd primitive reduce_p operator. 
+)DOC"); + } +}; + +class ReducePrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + auto x_shape = x_var->GetShape(); + auto axis = ctx->Attrs().Get>("axis"); + auto keepdim = ctx->Attrs().Get("keepdim"); + if (keepdim) { + for (size_t i = 0; i < axis.size(); ++i) { + x_shape[axis[i]] = 1; + } + } else { + const int kDelFlag = -2; + for (size_t i = 0; i < axis.size(); ++i) { + x_shape[axis[i]] = kDelFlag; + } + x_shape.erase(remove(x_shape.begin(), x_shape.end(), kDelFlag), + x_shape.end()); + } + if (!keepdim && x_shape.size() == 0) { + x_shape.push_back(1); + } + + BOOST_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_shape); + } +}; + +class ReducePrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Output(ctx, "Y")[0]; + SetType(ctx, y_name, GetType(ctx, x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(reduce_p, paddle::operators::ReducePrimOp, + paddle::operators::ReducePrimOpMaker, + paddle::operators::ReducePrimOpShapeInference, + paddle::operators::ReducePrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/reshape_p_op.cc b/paddle/fluid/operators/prim_ops/reshape_p_op.cc new file mode 100644 index 0000000000000..497bc8fbaffb3 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/reshape_p_op.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class ReshapePrimOp : public framework::OperatorBase { + public: + ReshapePrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator reshape_p should not be excuted directly")); + } +}; + +class ReshapePrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of reshape_p op."); + AddOutput("Y", "(Tensor), The output tensor of reshape_p op."); + AddAttr>( + "shape", "(std::vector) Target shape of reshape_p operator."); + AddComment(R"DOC( +Autograd primitive reshape_p operator. +)DOC"); + } +}; + +static int64_t product(const std::vector &shape) { + int64_t rslt = 1; + for (size_t i = 0; i < shape.size(); ++i) { + rslt *= shape[i]; + } + return rslt; +} + +class ReshapePrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + auto x_shape = x_var->GetShape(); + auto shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE_EQ(product(x_shape), product(shape), + platform::errors::InvalidArgument( + "The input tensor can't be reshaped to target shape, " + "the input tensor has %d elements but target shape " + "contains %d elements", + product(x_shape), product(shape))); + BOOST_GET(framework::VarDesc *, y_var_ptr)->SetShape(shape); + } +}; + +class ReshapePrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Output(ctx, "Y")[0]; + SetType(ctx, y_name, GetType(ctx, x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(reshape_p, paddle::operators::ReshapePrimOp, + paddle::operators::ReshapePrimOpMaker, + paddle::operators::ReshapePrimOpShapeInference, + paddle::operators::ReshapePrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/scatter_add_p_op.cc b/paddle/fluid/operators/prim_ops/scatter_add_p_op.cc new file mode 100644 index 0000000000000..420e6907e193d --- /dev/null +++ b/paddle/fluid/operators/prim_ops/scatter_add_p_op.cc @@ -0,0 +1,160 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class ScatterAddPrimOp : public framework::OperatorBase { + public: + ScatterAddPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator scatter_add_p should not be excuted directly")); + } +}; + +class ScatterAddPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The tensor to apply scatter rule and add on."); + AddInput("Y", "(Tensor), The source tensor of scatter_add_p op."); + AddInput( + "IndexTensor", + "(Tensor), The index tensor of scatter_add_p op, which is a 1D tensor.") + .AsDispensable(); + AddOutput("Z", "(Tensor), The output tensor of scatter_add_p op."); + AddAttr("axis", + "(int64_t), The axis along which to scatter and add."); + AddAttr>( + "index", "(std::vector) The index of scatter_add_p op") + .SetDefault({0}); + AddComment(R"DOC( +Autograd primitive scatter_add_p operator. +)DOC"); + } +}; + +class ScatterAddPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; + framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; + int64_t num_index = 0; + if (ctx->HasInput("IndexTensor")) { + framework::InferShapeVarPtr index_var_ptr = + ctx->GetInputVarPtrs("IndexTensor")[0]; + framework::VarDesc *index_var = + BOOST_GET(framework::VarDesc *, index_var_ptr); + auto index_shape = index_var->GetShape(); + PADDLE_ENFORCE_EQ(index_shape.size(), 1, + platform::errors::InvalidArgument( + "The index tensor should be a 1D tensor," + "but get rank %d", + index_shape.size())); + num_index = index_shape[0]; + } else { + num_index = ctx->Attrs().Get>("index").size(); + } + auto axis = ctx->Attrs().Get("axis"); + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + framework::VarDesc *y_var = BOOST_GET(framework::VarDesc *, y_var_ptr); + auto x_shape = x_var->GetShape(); + auto y_shape = y_var->GetShape(); + size_t x_rank = x_shape.size(); + size_t y_rank = y_shape.size(); + PADDLE_ENFORCE_EQ(x_rank, y_rank, + platform::errors::InvalidArgument( + "The dimensions of two input tensor should be same, " + "but get %d and %d", + x_rank, y_rank)); + PADDLE_ENFORCE_EQ(y_shape[axis], num_index, + platform::errors::InvalidArgument( + "The shape of source input tensor at scatter axis " + "should be equal to num_index, " + "but get %d and %d", + y_shape[axis], num_index)); + for (size_t i = 0; i < x_rank; ++i) { + if (i != size_t(axis)) { + PADDLE_ENFORCE_EQ( + x_shape[i], y_shape[i], + platform::errors::InvalidArgument( + "The shape of two input tensor at dimension %d should be same, " + "but get %d and %d", + i, x_rank, y_rank)); + } + } + + BOOST_GET(framework::VarDesc *, 
z_var_ptr)->SetShape(x_shape); + } +}; + +class ScatterAddPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Input(ctx, "Y")[0]; + auto z_name = Output(ctx, "Z")[0]; + auto x_type = GetType(ctx, x_name); + auto y_type = GetType(ctx, y_name); + auto x_dtype = GetDataType(ctx, x_name); + auto y_dtype = GetDataType(ctx, y_name); + PADDLE_ENFORCE_EQ(x_type, y_type, + platform::errors::InvalidArgument( + "The type of two input tensor should be same, " + "but get %d and %d", + x_type, y_type)); + PADDLE_ENFORCE_EQ(x_dtype, y_dtype, + platform::errors::InvalidArgument( + "The datatype of two input tensor should be same, " + "but get %d and %d", + x_dtype, y_dtype)); + + if (ctx->HasInput("IndexTensor")) { + auto index_name = Input(ctx, "IndexTensor")[0]; + auto index_dtype = GetDataType(ctx, index_name); + PADDLE_ENFORCE_EQ( + index_dtype, framework::proto::VarType_Type_INT32, + platform::errors::InvalidArgument( + "The datatype of input tensor should be VarType_Type_INT32(%d), " + "but get %d", + framework::proto::VarType_Type_INT32, index_dtype)); + } + SetType(ctx, z_name, GetType(ctx, x_name)); + SetDataType(ctx, z_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(scatter_add_p, paddle::operators::ScatterAddPrimOp, + paddle::operators::ScatterAddPrimOpMaker, + paddle::operators::ScatterAddPrimOpShapeInference, + paddle::operators::ScatterAddPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/slice_assign_p_op.cc b/paddle/fluid/operators/prim_ops/slice_assign_p_op.cc new file mode 100644 index 0000000000000..6fff54cced550 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/slice_assign_p_op.cc @@ -0,0 +1,152 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
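+//
+// slice_assign_p writes Y into the strided slice of X described by `axis`,
+// `starts`, `ends` and `strides`; the output Z keeps X's full shape, and Y
+// must match the slice extent (ends - starts + strides - 1) / strides on
+// each sliced axis.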
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class SliceAssignPrimOp : public framework::OperatorBase { + public: + SliceAssignPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator slice_assign_p should not be excuted directly")); + } +}; + +class SliceAssignPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The tensor to slice from and assign on."); + AddInput("Y", "(Tensor), The source tensor of slice_assign_p op."); + AddOutput("Z", "(Tensor), The output tensor of slice_assign_p op."); + AddAttr>( + "axis", "(std::vector), The axis along which to gather."); + AddAttr>( + "starts", + "(std::vector) The slice starts of slice_assign_p op"); + AddAttr>( + "ends", "(std::vector) The slice ends of slice_assign_p op"); + AddAttr>( + "strides", + "(std::vector) The slice strides of slice_assign_p op"); + AddComment(R"DOC( +Autograd primitive slice_assign_p operator. +)DOC"); + } +}; + +class SliceAssignPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; + framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + framework::VarDesc *y_var = BOOST_GET(framework::VarDesc *, y_var_ptr); + auto x_shape = x_var->GetShape(); + auto y_shape = y_var->GetShape(); + size_t x_rank = x_shape.size(); + size_t y_rank = y_shape.size(); + auto axis = ctx->Attrs().Get>("axis"); + auto starts = ctx->Attrs().Get>("starts"); + auto ends = ctx->Attrs().Get>("ends"); + auto strides = ctx->Attrs().Get>("strides"); + PADDLE_ENFORCE_EQ( + starts.size(), axis.size(), + platform::errors::InvalidArgument( + "Number of starts attribute and axis attribute should be same, " + "but get %d and %d", + starts.size(), axis.size())); + PADDLE_ENFORCE_EQ( + ends.size(), axis.size(), + platform::errors::InvalidArgument( + "Number of ends attribute and axis attribute should be same, " + "but get %d and %d", + ends.size(), axis.size())); + PADDLE_ENFORCE_EQ( + strides.size(), axis.size(), + platform::errors::InvalidArgument( + "Number of strides attribute and axis attribute should be same, " + "but get %d and %d", + strides.size(), axis.size())); + PADDLE_ENFORCE_EQ(x_rank, y_rank, + platform::errors::InvalidArgument( + "The dimensions of two input tensor should be same, " + "but get %d and %d", + x_rank, y_rank)); + std::vector y_target_shape(x_shape); + for (size_t i = 0; i < axis.size(); ++i) { + y_target_shape[axis[i]] = + (ends[i] - starts[i] + strides[i] - 1) / strides[i]; + } + for (size_t i = 0; i < x_rank; ++i) { + PADDLE_ENFORCE_EQ(y_target_shape[i], y_shape[i], + platform::errors::InvalidArgument( + "The shape of source tensor of slice_assign_p op " + "at dimension %d should be %d, " + "but 
get %d", + i, y_target_shape[i], y_shape[i])); + } + BOOST_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); + } +}; + +class SliceAssignPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Input(ctx, "Y")[0]; + auto z_name = Output(ctx, "Z")[0]; + auto x_type = GetType(ctx, x_name); + auto y_type = GetType(ctx, y_name); + auto x_dtype = GetDataType(ctx, x_name); + auto y_dtype = GetDataType(ctx, y_name); + PADDLE_ENFORCE_EQ(x_type, y_type, + platform::errors::InvalidArgument( + "The type of two input tensor should be same, " + "but get %d and %d", + x_type, y_type)); + PADDLE_ENFORCE_EQ(x_dtype, y_dtype, + platform::errors::InvalidArgument( + "The datatype of two input tensor should be same, " + "but get %d and %d", + x_dtype, y_dtype)); + + SetType(ctx, z_name, GetType(ctx, x_name)); + SetDataType(ctx, z_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(slice_assign_p, paddle::operators::SliceAssignPrimOp, + paddle::operators::SliceAssignPrimOpMaker, + paddle::operators::SliceAssignPrimOpShapeInference, + paddle::operators::SliceAssignPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/slice_select_p_op.cc b/paddle/fluid/operators/prim_ops/slice_select_p_op.cc new file mode 100644 index 0000000000000..9456ab403737d --- /dev/null +++ b/paddle/fluid/operators/prim_ops/slice_select_p_op.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class SliceSelectPrimOp : public framework::OperatorBase { + public: + SliceSelectPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator slice_select_p should not be excuted directly")); + } +}; + +class SliceSelectPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of slice_select_p op."); + AddOutput("Y", "(Tensor), The output tensor of slice_select_p op."); + AddAttr>( + "axis", "(std::vector), The axis along which to gather."); + AddAttr>( + "starts", + "(std::vector) The slice starts of slice_select_p op"); + AddAttr>( + "ends", "(std::vector) The slice ends of slice_select_p op"); + AddAttr>( + "strides", + "(std::vector) The slice strides of slice_select_p op"); + AddComment(R"DOC( +Autograd primitive slice_select_p operator. +)DOC"); + } +}; + +class SliceSelectPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + auto x_shape = x_var->GetShape(); + auto axis = ctx->Attrs().Get>("axis"); + auto starts = ctx->Attrs().Get>("starts"); + auto ends = ctx->Attrs().Get>("ends"); + auto strides = ctx->Attrs().Get>("strides"); + PADDLE_ENFORCE_EQ( + starts.size(), axis.size(), + platform::errors::InvalidArgument( + "Number of starts attribute and axis attribute should be same, " + "but get %d and %d", + starts.size(), axis.size())); + PADDLE_ENFORCE_EQ( + ends.size(), axis.size(), + platform::errors::InvalidArgument( + "Number of ends attribute and axis attribute should be same, " + "but get %d and %d", + ends.size(), axis.size())); + PADDLE_ENFORCE_EQ( + strides.size(), axis.size(), + platform::errors::InvalidArgument( + "Number of strides attribute and axis attribute should be same, " + "but get %d and %d", + strides.size(), axis.size())); + for (size_t i = 0; i < axis.size(); ++i) { + x_shape[axis[i]] = (ends[i] - starts[i] + strides[i] - 1) / strides[i]; + } + BOOST_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_shape); + } +}; + +class SliceSelectPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Output(ctx, "Y")[0]; + SetType(ctx, y_name, GetType(ctx, x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(slice_select_p, paddle::operators::SliceSelectPrimOp, + paddle::operators::SliceSelectPrimOpMaker, + paddle::operators::SliceSelectPrimOpShapeInference, + paddle::operators::SliceSelectPrimOpVarTypeInference); diff --git 
a/paddle/fluid/operators/prim_ops/split_p_op.cc b/paddle/fluid/operators/prim_ops/split_p_op.cc new file mode 100644 index 0000000000000..212692bf0355b --- /dev/null +++ b/paddle/fluid/operators/prim_ops/split_p_op.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class SplitPrimOp : public framework::OperatorBase { + public: + SplitPrimOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator split_p should not be excuted directly")); + } +}; + +class SplitPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of split_p op."); + AddOutput("YS", "(Tensor), The output tensors of split_p op.") + .AsDuplicable(); + AddAttr("axis", "(int64_t), The axis along which to split."); + AddAttr>( + "num_or_sections", + "(std::vector) If num_or_sections has only one element, then " + "num_or_sections indicates the number of equal sized sub-Tensors that " + "the input will be divided into. If num_or_sections has more then one " + "element, the length of it indicates the number of sub-Tensors and the " + "elements in it indicate the sizes of sub-Tensors’ dimension orderly. " + "The length of the vector must not be larger than the input's size of " + "specified axis."); + AddComment(R"DOC( +Autograd primitive split_p operator. 
+)DOC"); + } +}; + +class SplitPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + auto y_var_ptrs = ctx->GetOutputVarPtrs("YS"); + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + auto x_shape = x_var->GetShape(); + auto axis = ctx->Attrs().Get("axis"); + auto num_or_sections = + ctx->Attrs().Get>("num_or_sections"); + std::vector y_shape(x_shape); + if (num_or_sections.size() == 1) { + PADDLE_ENFORCE_EQ(x_shape[axis] % num_or_sections[0], 0, + platform::errors::InvalidArgument( + "The input tensor can't be devided equally into %d " + "parts equally along axis %d", + num_or_sections[0], axis)); + y_shape[axis] = x_shape[axis] / num_or_sections[0]; + for (size_t i = 0; i < size_t(num_or_sections[0]); ++i) { + BOOST_GET(framework::VarDesc *, y_var_ptrs[i])->SetShape(y_shape); + } + } else { + int64_t cnt_along_axis = 0; + for (size_t i = 0; i < num_or_sections.size(); ++i) { + y_shape[axis] = num_or_sections[i]; + cnt_along_axis += num_or_sections[i]; + BOOST_GET(framework::VarDesc *, y_var_ptrs[i])->SetShape(y_shape); + } + PADDLE_ENFORCE_EQ( + x_shape[axis], cnt_along_axis, + platform::errors::InvalidArgument( + "The input tensor has %d elements along axis %d, thus can't be " + "devided into %d tensor with %d elements totally.", + x_shape[axis], axis, num_or_sections.size(), cnt_along_axis)); + } + } +}; + +class SplitPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_names = Output(ctx, "YS"); + for (auto y_name : y_names) { + SetType(ctx, y_name, GetType(ctx, x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, x_name)); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(split_p, paddle::operators::SplitPrimOp, + paddle::operators::SplitPrimOpMaker, + paddle::operators::SplitPrimOpShapeInference, + paddle::operators::SplitPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/sqrt_p_op.cc b/paddle/fluid/operators/prim_ops/sqrt_p_op.cc new file mode 100644 index 0000000000000..de4958d29f933 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/sqrt_p_op.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class SqrtPrimOp : public framework::OperatorBase { + public: + SqrtPrimOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator sqrt_p should not be excuted directly")); + } +}; + +class SqrtPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of sqrt_p op."); + AddOutput("Y", "(Tensor), The output tensor of sqrt_p op."); + AddComment(R"DOC( +Autograd primitive sqrt_p operator. +)DOC"); + } +}; + +class SqrtPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + + BOOST_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_var->GetShape()); + } +}; + +class SqrtPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Output(ctx, "Y")[0]; + SetType(ctx, y_name, GetType(ctx, x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(sqrt_p, paddle::operators::SqrtPrimOp, + paddle::operators::SqrtPrimOpMaker, + paddle::operators::SqrtPrimOpShapeInference, + paddle::operators::SqrtPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/sub_p_op.cc b/paddle/fluid/operators/prim_ops/sub_p_op.cc new file mode 100644 index 0000000000000..f689f2d2d918b --- /dev/null +++ b/paddle/fluid/operators/prim_ops/sub_p_op.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class SubPrimOp : public framework::OperatorBase { + public: + SubPrimOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator sub_p should not be excuted directly")); + } +}; + +class SubPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of sub_p op."); + AddInput("Y", "(Tensor), The input tensor of sub_p op."); + AddOutput("Z", "(Tensor), The output tensor of sub_p op."); + AddComment(R"DOC( +Autograd primitive sub_p operator. +)DOC"); + } +}; + +class SubPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; + framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; + + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + framework::VarDesc *y_var = BOOST_GET(framework::VarDesc *, y_var_ptr); + auto x_shape = x_var->GetShape(); + auto y_shape = y_var->GetShape(); + size_t x_rank = x_shape.size(); + size_t y_rank = y_shape.size(); + PADDLE_ENFORCE_EQ(x_rank, y_rank, + platform::errors::InvalidArgument( + "The dimensions of two input tensor should be same, " + "but get %d and %d", + x_rank, y_rank)); + for (size_t i = 0; i < x_rank; ++i) { + PADDLE_ENFORCE_EQ( + x_shape[i], y_shape[i], + platform::errors::InvalidArgument( + "The shape of two input tensor at dimension %d should be same, " + "but get %d and %d", + i, x_shape[i], y_shape[i])); + } + + BOOST_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); + } +}; + +class SubPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Input(ctx, "Y")[0]; + auto z_name = Output(ctx, "Z")[0]; + auto x_type = GetType(ctx, x_name); + auto y_type = GetType(ctx, y_name); + auto x_dtype = GetDataType(ctx, x_name); + auto y_dtype = GetDataType(ctx, y_name); + PADDLE_ENFORCE_EQ(x_type, y_type, + platform::errors::InvalidArgument( + "The type of two input tensor should be same, " + "but get %d and %d", + x_type, y_type)); + PADDLE_ENFORCE_EQ(x_dtype, y_dtype, + platform::errors::InvalidArgument( + "The datatype of two input tensor should be same, " + "but get %d and %d", + x_dtype, y_dtype)); + + SetType(ctx, z_name, x_type); + SetDataType(ctx, z_name, x_dtype); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(sub_p, paddle::operators::SubPrimOp, + paddle::operators::SubPrimOpMaker, + paddle::operators::SubPrimOpShapeInference, + paddle::operators::SubPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/tanh_p_op.cc b/paddle/fluid/operators/prim_ops/tanh_p_op.cc new file mode 100644 index 0000000000000..c2afdcbe4b207 --- 
/dev/null +++ b/paddle/fluid/operators/prim_ops/tanh_p_op.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class TanhPrimOp : public framework::OperatorBase { + public: + TanhPrimOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator tanh_p should not be excuted directly")); + } +}; + +class TanhPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of tanh_p op."); + AddOutput("Y", "(Tensor), The output tensor of tanh_p op."); + AddComment(R"DOC( +Autograd primitive tanh_p operator. +)DOC"); + } +}; + +class TanhPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + + BOOST_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_var->GetShape()); + } +}; + +class TanhPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Output(ctx, "Y")[0]; + SetType(ctx, y_name, GetType(ctx, x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(tanh_p, paddle::operators::TanhPrimOp, + paddle::operators::TanhPrimOpMaker, + paddle::operators::TanhPrimOpShapeInference, + paddle::operators::TanhPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/transpose_p_op.cc b/paddle/fluid/operators/prim_ops/transpose_p_op.cc new file mode 100644 index 0000000000000..b3b72318cd51d --- /dev/null +++ b/paddle/fluid/operators/prim_ops/transpose_p_op.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class TransposePrimOp : public framework::OperatorBase { + public: + TransposePrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator transpose_p should not be excuted directly")); + } +}; + +class TransposePrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of transpose_p op."); + AddOutput("Y", "(Tensor), The output tensor of transpose_p op."); + AddAttr>("axis", + "(std::vector) Tanspose axis."); + AddComment(R"DOC( +Autograd primitive transpose_p operator. +)DOC"); + } +}; + +class TransposePrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + framework::VarDesc *x_var = BOOST_GET(framework::VarDesc *, x_var_ptr); + auto x_shape = x_var->GetShape(); + auto axis = ctx->Attrs().Get>("axis"); + size_t x_rank = x_shape.size(); + size_t axis_size = axis.size(); + PADDLE_ENFORCE_EQ(x_rank, axis_size, + platform::errors::InvalidArgument( + "The input tensor's dimension " + "should be equal to the axis's size. " + "But received input tensor's dimension is %d, " + "axis's size is %d", + x_rank, axis_size)); + + std::vector count(axis_size, 0); + for (size_t i = 0; i < axis_size; i++) { + PADDLE_ENFORCE_GE(axis[i], 0, + platform::errors::InvalidArgument( + "The axis should be greater than or equal to 0." + "But received %d of axis[%d]", + axis[i], i)); + + PADDLE_ENFORCE_EQ( + axis[i] < static_cast(axis_size) && ++count[axis[i]] == 1, true, + platform::errors::InvalidArgument( + "Each element of Attribute axis should " + "be a unique value range from 0 to (dims - 1), " + "where the dims is the axis's size, " + "unique value means this axis value can appear only once. 
" + "But received axis[%d] is %d, axis_size is %d, " + "count[axis[%d]] is %d", + i, axis[i], axis_size, i, count[axis[i]])); + } + std::vector y_shape(axis_size); + for (size_t i = 0; i < axis_size; i++) { + y_shape[i] = x_shape[axis[i]]; + } + BOOST_GET(framework::VarDesc *, y_var_ptr)->SetShape(y_shape); + } +}; + +class TransposePrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Output(ctx, "Y")[0]; + SetType(ctx, y_name, GetType(ctx, x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(transpose_p, paddle::operators::TransposePrimOp, + paddle::operators::TransposePrimOpMaker, + paddle::operators::TransposePrimOpShapeInference, + paddle::operators::TransposePrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/unity_build_rule.cmake b/paddle/fluid/operators/prim_ops/unity_build_rule.cmake new file mode 100644 index 0000000000000..5d6a732272b9b --- /dev/null +++ b/paddle/fluid/operators/prim_ops/unity_build_rule.cmake @@ -0,0 +1,20 @@ +register_unity_group(cc + reshape_p_op.cc + broadcast_p_op.cc + reduce_p_op.cc + transpose_p_op.cc + split_p_op.cc + concat_p_op.cc + slice_select_p_op.cc + slice_assign_p_op.cc + gather_p_op.cc + scatter_add_p_op.cc + add_p_op.cc + sub_p_op.cc + mul_p_op.cc + div_p_op.cc + sqrt_p_op.cc + tanh_p_op.cc + matmul_p_op.cc + fill_constant_p_op.cc + ) From c7623d72de13bf167559c7f4e68520244911ff25 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Thu, 14 Apr 2022 03:20:28 +0200 Subject: [PATCH 08/19] Added shuffle_channel BF16/FP32 FWD oneDNN kernel (#39756) * added shuffle_channel bf16/fp32 fwd kernel * added missing files * CI fix * changed from pten to phi * tmp save * added reviewers suggestions * fix for test --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../shuffle_channel_mkldnn_detect_pass.cc | 237 ++++++++++++++++++ .../shuffle_channel_mkldnn_detect_pass.h | 38 +++ .../inference/api/paddle_pass_builder.cc | 1 + .../mkldnn/shuffle_channel_mkldnn_op.cc | 77 ++++++ paddle/fluid/operators/shuffle_channel_op.cc | 18 +- .../test_mkldnn_shuffle_channel_op.py | 61 +++++ .../mkldnn/test_shuffle_channel_mkldnn_op.py | 62 +++++ 8 files changed, 492 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h create mode 100644 paddle/fluid/operators/mkldnn/shuffle_channel_mkldnn_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_shuffle_channel_mkldnn_op.py diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 834a2c953eab8..48ccadd037363 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -128,6 +128,7 @@ if(WITH_MKLDNN) pass_library(fc_mkldnn_pass inference DIR mkldnn) pass_library(interpolate_mkldnn_pass inference DIR mkldnn) pass_library(softplus_activation_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(shuffle_channel_mkldnn_detect_pass inference DIR mkldnn) pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn) pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn) pass_library(cpu_quantize_placement_pass base DIR 
mkldnn) diff --git a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc new file mode 100644 index 0000000000000..bf603dc4bbcb9 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.cc @@ -0,0 +1,237 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(reshape1_op); \ + GET_IR_NODE(reshape1_out); \ + GET_IR_NODE(transpose_op); \ + GET_IR_NODE(transpose_out); \ + GET_IR_NODE(reshape2_op); \ + GET_IR_NODE(reshape2_out); + +ShuffleChannelMKLDNNDetectPass::ShuffleChannelMKLDNNDetectPass() { + AddOpCompat(OpCompat("reshape2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Shape") + .IsOptional() + .IsTensor() + .End() + .AddInput("ShapeTensor") + .IsOptional() + .IsTensor() + .End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("shape") + .IsType>() + .End(); + + AddOpCompat(OpCompat("transpose2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsType>() + .End(); +} + +void ShuffleChannelMKLDNNDetectPass::ApplyImpl(ir::Graph* graph) const { + const std::string pattern_name = "shufflechannel_pattern"; + FusePassBase::Init(pattern_name, graph); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern() + ->NewNode("x") + ->assert_is_op_input("reshape2", "X") + ->AsInput(); + + patterns::ShuffleChannelPattern pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "The Pass in op compat failed."; + return; + } + PADDLE_ENFORCE_GT( + subgraph.count(x), 0, + platform::errors::NotFound("Detector did not find input X.")); + auto* input_node = subgraph.at(x); + auto reshape1_desc = reshape1_op->Op(); + auto reshape2_desc = reshape2_op->Op(); + auto trans_desc = transpose_op->Op(); + std::string input_name = input_node->Name(); + std::string output_name = reshape2_out->Name(); + + auto reshape1_shape = + BOOST_GET_CONST(std::vector, reshape1_desc->GetAttr("shape")); + auto reshape2_shape = + BOOST_GET_CONST(std::vector, reshape2_desc->GetAttr("shape")); + auto trans_axis = + BOOST_GET_CONST(std::vector, trans_desc->GetAttr("axis")); + auto* block1 = reshape1_desc->Block(); + auto* block2 = reshape2_desc->Block(); + if (block1 && block2) { + auto x_var_name = reshape1_desc->Input("X")[0]; + auto* x_var_desc = block1->FindVar(x_var_name); + auto x_shape1 = 
x_var_desc->GetShape(); + x_var_name = reshape2_desc->Input("X")[0]; + x_var_desc = block2->FindVar(x_var_name); + auto x_shape2 = x_var_desc->GetShape(); + // now shuffle_channel is 4D(NCHW) only. + if (x_shape1.size() != 4 || reshape1_shape.size() != 5 || + reshape2_shape.size() != 4 || trans_axis.size() != 5) { + return; + } + + // process 0 and -1 in reshape. + constexpr int64_t copy_dim_val = 0; + for (size_t i = 0; i < reshape1_shape.size(); i++) { + if (reshape1_shape[i] == copy_dim_val) { + reshape1_shape[i] = x_shape1[i]; + } + } + for (size_t i = 0; i < reshape2_shape.size(); i++) { + if (reshape2_shape[i] == copy_dim_val) { + reshape2_shape[i] = x_shape2[i]; + } + } + constexpr int64_t unk_dim_idx = -1; + bool all_positive = std::all_of(x_shape1.cbegin(), x_shape1.cend(), + [](int64_t i) { return i > 0; }); + for (size_t i = 0; i < reshape1_shape.size(); ++i) { + // if -1 is not in batch dim, try to calculate number + if ((reshape1_shape[i] == unk_dim_idx) && (i != 0)) { + // there is no sufficient info + if (!all_positive) return; + reshape1_shape[i] = + std::accumulate(x_shape1.begin(), x_shape1.end(), + static_cast(1), + std::multiplies()) / + std::accumulate(reshape1_shape.begin(), reshape1_shape.end(), + static_cast(-1), + std::multiplies()); + break; + } + } + + all_positive = std::all_of(x_shape2.cbegin(), x_shape2.cend(), + [](int64_t i) { return i > 0; }); + for (size_t i = 0; i < reshape2_shape.size(); ++i) { + // if -1 is not in batch dim, try to calculate number + if ((reshape2_shape[i] == unk_dim_idx) && (i != 0)) { + // there is no sufficient info + if (!all_positive) return; + reshape2_shape[i] = + std::accumulate(x_shape2.begin(), x_shape2.end(), + static_cast(1), + std::multiplies()) / + std::accumulate(reshape2_shape.begin(), reshape2_shape.end(), + static_cast(-1), + std::multiplies()); + break; + } + } + + // shuffle_channel dosen't change shape + if ((reshape2_shape[0] != -1) && (x_shape1[0] != reshape2_shape[0])) { + return; + } + for (size_t i = 1; i < x_shape1.size(); i++) { + if (x_shape1[i] != reshape2_shape[i]) { + return; + } + } + if ((reshape2_shape[3] != reshape1_shape[4]) || + (reshape2_shape[2] != reshape1_shape[3])) { + return; + } + } else { + return; // conservative judgement + } + + int i_c = reshape1_shape[2]; + int o_c = reshape2_shape[1]; + int group = o_c / i_c; + // should split on channel dim + if (reshape2_shape[1] != reshape1_shape[2] * reshape1_shape[1]) return; + // trans on channel dim + if (trans_axis[0] != 0 || trans_axis[3] != 3 || trans_axis[4] != 4) return; + if (group != 1 && i_c != 1) { + if (trans_axis[1] != 2 && trans_axis[2] != 1) { + return; + } + } + + framework::OpDesc new_op_desc; + new_op_desc.SetType("shuffle_channel"); + new_op_desc.SetInput("X", {input_name}); + new_op_desc.SetOutput("Out", {output_name}); + + new_op_desc.SetAttr("group", group); + new_op_desc.SetAttr("use_mkldnn", true); + new_op_desc.Flush(); + + // Create a new node for the fused op. + auto* new_op = graph->CreateOpNode(&new_op_desc); + + IR_NODE_LINK_TO(input_node, new_op); + IR_NODE_LINK_TO(new_op, reshape2_out); + + // Delete the unneeded nodes. 
+ GraphSafeRemoveNodes(graph, {reshape1_op, reshape1_out, transpose_op, + transpose_out, reshape2_op}); + LOG_FIRST_N(WARNING, 1) + << "There is fluid.layers.shuffle_channel API already, maybe you can " + "use it instead of (reshape + transpose + reshape)"; + }; + + gpd(graph, handler); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(shuffle_channel_mkldnn_detect_pass, + paddle::framework::ir::ShuffleChannelMKLDNNDetectPass); +REGISTER_PASS_CAPABILITY(shuffle_channel_mkldnn_detect_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("reshape2", 0) + .EQ("transpose2", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h new file mode 100644 index 0000000000000..231b63c3b6a00 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/shuffle_channel_mkldnn_detect_pass.h @@ -0,0 +1,38 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Graph; + +class ShuffleChannelMKLDNNDetectPass : public FusePassBase { + public: + ShuffleChannelMKLDNNDetectPass(); + virtual ~ShuffleChannelMKLDNNDetectPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 20418e37a7b94..d0fe3953d00d6 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -298,6 +298,7 @@ void CpuPassStrategy::EnableMKLDNN() { // "fc_act_mkldnn_fuse_pass", "batch_norm_act_fuse_pass", // "softplus_activation_mkldnn_fuse_pass", // + "shuffle_channel_mkldnn_detect_pass", // "elt_act_mkldnn_fuse_pass", // // TODO(intel): Please fix the bug on windows. // https://github.com/PaddlePaddle/Paddle/issues/29710 diff --git a/paddle/fluid/operators/mkldnn/shuffle_channel_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/shuffle_channel_mkldnn_op.cc new file mode 100644 index 0000000000000..408de57bf946d --- /dev/null +++ b/paddle/fluid/operators/mkldnn/shuffle_channel_mkldnn_op.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using platform::MKLDNNGetDataType; +template +class ShuffleChannelMKLDNNHandler + : public platform::MKLDNNHandlerNoCachingT { + public: + ShuffleChannelMKLDNNHandler(const Tensor* x, const int group, + const dnnl::engine engine, + platform::Place cpu_place) + : platform::MKLDNNHandlerNoCachingT(engine, + cpu_place) { + static constexpr int channel_axis = 1; + const auto md = dnnl::memory::desc(phi::vectorize(x->dims()), + MKLDNNGetDataType(), x->format()); + + this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, + md, channel_axis, group); + } +}; + +template +class ShuffleChannelMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + const auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + // oneDNN handles group using C/g instead of g + const int group = x->dims()[1] / ctx.Attr("group"); + + ShuffleChannelMKLDNNHandler handler(x, group, mkldnn_engine, + ctx.GetPlace()); + + auto src_memory_p = handler.AcquireSrcMemory(x); + auto dst_memory_p = handler.AcquireDstMemory(out); + + auto shuffle_p = handler.AcquireForwardPrimitive(); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + shuffle_p->execute(astream, {{DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(x->format()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(shuffle_channel, MKLDNN, paddle::platform::CPUPlace, + ops::ShuffleChannelMKLDNNKernel, + ops::ShuffleChannelMKLDNNKernel); diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index 119d2e7236946..70fddc9b04712 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -35,9 +35,17 @@ class ShuffleChannelOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context()); + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -56,6 +64,10 @@ class ShuffleChannelOpMaker : public framework::OpProtoAndCheckerMaker { PADDLE_ENFORCE_GE(group, 1, platform::errors::InvalidArgument( "group should be larger than 0.")); }); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false) + .AsExtra(); AddComment(R"DOC( Shuffle Channel operator diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_op.py new file mode 100644 index 0000000000000..26655970290cd --- /dev/null 
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_shuffle_channel_op.py @@ -0,0 +1,61 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import MkldnnAutoScanTest +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +from functools import partial +import unittest +from hypothesis import given +import hypothesis.strategies as st + + +class TestMKLDNNShuffleChannelOp(MkldnnAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self, *args, **kwargs): + def generate_input(*args, **kwargs): + return np.random.random(kwargs['in_shape']).astype(np.float32) + + shuffle_channel_op = OpConfig( + type="shuffle_channel", + inputs={"X": ["input_data"]}, + outputs={"Out": ["output_data"]}, + attrs={"group": kwargs['group']}) + + program_config = ProgramConfig( + ops=[shuffle_channel_op], + weights={}, + inputs={ + "input_data": TensorConfig(data_gen=partial(generate_input, + *args, **kwargs)), + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, (1e-5, 1e-5) + + @given( + group=st.sampled_from([1, 2, 8, 32, 128]), + in_shape=st.sampled_from([[5, 512, 2, 3], [2, 256, 5, 4]])) + def test(self, *args, **kwargs): + self.run_test(quant=False, *args, **kwargs) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_shuffle_channel_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_shuffle_channel_mkldnn_op.py new file mode 100644 index 0000000000000..1d657817503de --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_shuffle_channel_mkldnn_op.py @@ -0,0 +1,62 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core + + +@OpTestTool.skip_if_not_cpu_bf16() +class TestShuffleChannelOneDNNOp(OpTest): + def setUp(self): + self.op_type = "shuffle_channel" + self.set_dtype() + self.set_group() + self.inputs = {'X': np.random.random((5, 64, 2, 3)).astype(self.dtype)} + self.attrs = {'use_mkldnn': True, 'group': self.group} + + _, c, h, w = self.inputs['X'].shape + input_reshaped = np.reshape(self.inputs['X'], + (-1, self.group, c // self.group, h, w)) + input_transposed = np.transpose(input_reshaped, (0, 2, 1, 3, 4)) + self.outputs = {'Out': np.reshape(input_transposed, (-1, c, h, w))} + + def set_dtype(self): + self.dtype = np.float32 + + def set_group(self): + self.group = 4 + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + +class TestShuffleChannelSingleGroupOneDNNOp(TestShuffleChannelOneDNNOp): + def set_group(self): + self.group = 1 + + +class TestShuffleChannelBF16OneDNNOp(TestShuffleChannelOneDNNOp): + def set_dtype(self): + self.dtype = np.uint16 + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() From 8e2d4d3077b879833447ebb388552721930d4afb Mon Sep 17 00:00:00 2001 From: baoachun <962571062@qq.com> Date: Thu, 14 Apr 2022 10:46:03 +0800 Subject: [PATCH 09/19] add mkldnn int8 pass [step3] (#41599) * add mkldnn int8 pass [step3] * Add test for compute_propagate_scales_mkldnn_pass * update pass * update api comment and python api Co-authored-by: wozna --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + ...ute_propagate_scales_mkldnn_pass_tester.cc | 276 ++++++++++++++++++ .../framework/ir/mkldnn/cpu_quantize_pass.cc | 67 ++++- .../framework/ir/mkldnn/cpu_quantize_pass.h | 6 + .../ir/mkldnn/quant_dequant_mkldnn_pass.cc | 4 - paddle/fluid/inference/analysis/argument.h | 2 + .../inference/analysis/ir_pass_manager.cc | 4 + paddle/fluid/inference/api/analysis_config.cc | 49 ++++ .../fluid/inference/api/analysis_predictor.cc | 7 + .../inference/api/paddle_analysis_config.h | 34 +++ .../inference/api/paddle_pass_builder.cc | 73 +++++ .../fluid/inference/api/paddle_pass_builder.h | 11 + .../fluid/inference/tests/api/CMakeLists.txt | 20 +- ...lyzer_quant_image_classification_tester.cc | 5 +- .../fluid/inference/tests/api/tester_helper.h | 1 + paddle/fluid/pybind/inference_api.cc | 4 + 16 files changed, 542 insertions(+), 22 deletions(-) create mode 100644 paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 48ccadd037363..e8696a3c2276b 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -218,6 +218,7 @@ endif() cc_test(test_scale_matmul_fuse_pass SRCS mkldnn/scale_matmul_fuse_pass_tester.cc DEPS scale_matmul_fuse_pass) cc_test(test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass) cc_test(test_mkldnn_inplace_pass SRCS mkldnn/mkldnn_inplace_pass_tester.cc DEPS mkldnn_inplace_pass) + cc_test(test_compute_propagate_scales_mkldnn_pass SRCS mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc DEPS compute_propagate_scales_mkldnn_pass naive_executor) cc_test(test_cpu_quantize_placement_pass SRCS mkldnn/cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass) cc_test(test_cpu_quantize_pass SRCS 
mkldnn/cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor) cc_test(test_cpu_quantize_squash_pass SRCS mkldnn/cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor) diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc new file mode 100644 index 0000000000000..8a7fa1f51c7c7 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc @@ -0,0 +1,276 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h" +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { +namespace ir { + +const std::array positive_and_negative_values = { + -0.0482659, -0.0102493, -0.00794221, -0.00387115, -0.00674586, + -0.0495346, 0.0629528, -0.00531285, -0.0230353, 0.0269089}; + +const std::vector> wx = { + {0.04347931, -0.5643393, 0.7551297, 0.26713502, 0.8055306, 0.91144973}, + {0.01707571, 0.12741385, 0.15419468, 0.66127586, 0.46821925, 0.9665961}, + {0.40393898, 0.884427, -0.5853097, 0.5840954, 0.9170512, 0.98245513}}; +const std::vector> wh = { + {0.42484227, -0.9025513, 0.17087583, 0.8403284, 0.03325734, 0.92331886}, + {0.32630175, 0.41691914, 0.99848574, 0.3504407, 0.06707559, 0.62239844}}; + +const std::vector gru_scales = {2.35381475, 1.08304947, 1.32427582, + 1.19001095, 1.00151656, 1.01785819}; + +const std::vector lstm_scales = {2.35381475, 1.10797026, 1.00151656, + 1.19001095, 1.09045166, 1.01785819}; + +static const std::initializer_list conv_variable_names{ + "conv_in", "filter", "bias", "conv_out"}; + +static const std::initializer_list rnn_variable_names{ + "x", "wx", "wh", "b", "h", "c"}; + +class ComputePropagateScalesMkldnnPassTest : public testing::Test { + public: + ComputePropagateScalesMkldnnPassTest() { + pass.reset(new ComputePropagateScalesMkldnnPass()); + } + + std::vector GetScales(Tensor* tensor, int axis) const { + return pass->GetScales(tensor, axis); + } + + void ComputeVarScales(ir::Graph* graph, Scope* scope, + const std::unordered_set ops, + const std::string& weight_name, const int axis, + StringPairMap* var_quant_scales) const { + pass->ComputeVarScales(graph, scope, ops, weight_name, axis, + var_quant_scales); + } + + void ComputeGruWeightScales(ir::Graph* graph, Scope* scope, + const std::string& wx_name, + const std::string& wh_name, + StringPairMap* var_quant_scales) const { + pass->ComputeGruWeightScales(graph, scope, wx_name, wh_name, + var_quant_scales); + } + + void ComputeLstmWeightScales(ir::Graph* graph, Scope* scope, + std::string wx_name, std::string wh_name, + StringPairMap* var_quant_scales) const { + pass->ComputeLstmWeightScales(graph, scope, wx_name, wh_name, + var_quant_scales); + } + + void InitTensorHolder(Scope* 
scope, const paddle::platform::Place& place, + const std::string& var_name) { + auto x = scope->Var(var_name); + auto tensor = x->GetMutable(); + auto tensor_size = 1; + if (var_name == "filter") { + tensor_size = positive_and_negative_values.size(); + } else if (var_name == "wx") { + tensor_size = wx.size(); + } else if (var_name == "wh") { + tensor_size = wh.size(); + } + tensor->mutable_data(place, + framework::TransToPhiDataType(proto::VarType::FP32), + tensor_size); + } + + void PrepareGraph(ir::Graph* graph, const ProgramDesc& prog, Scope* scope, + const std::initializer_list& variable_names) { + auto place = paddle::platform::CPUPlace(); + NaiveExecutor exe{place}; + exe.CreateVariables(prog, 0, true, scope); + + for (auto& v : variable_names) { + InitTensorHolder(scope, place, v.c_str()); + } + graph->SetNotOwned(kParamScopeAttr, scope); + } + + void ComputeRnnWeightScalesTest(const std::string& type, + const std::initializer_list& ops, + const framework::ProgramDesc& prog, + std::vector scales) { + ir::Graph* graph(new ir::Graph(prog)); + Scope scope; + + PrepareGraph(graph, prog, &scope, rnn_variable_names); + + std::string wx_name = "WeightX"; + std::string wh_name = "WeightH"; + std::string wx_var_names = "wx"; + std::string wh_var_names = "wh"; + + StringPairMap var_quant_scales; + + auto* wx_var = scope.FindVar(wx_var_names); + auto* wx_tensor = wx_var->GetMutable(); + wx_tensor->Resize(phi::make_dim(wx.size(), wx[0].size())); + for (size_t i = 0; i < wx.size(); i++) + std::copy(begin(wx[i]), end(wx[i]), + wx_tensor->mutable_data(platform::CPUPlace()) + + i * wx[0].size()); + + auto* wh_var = scope.FindVar(wh_var_names); + auto* wh_tensor = wh_var->GetMutable(); + wh_tensor->Resize(phi::make_dim(wh.size(), wh[0].size())); + for (size_t i = 0; i < wh.size(); i++) + std::copy(begin(wh[i]), end(wh[i]), + wh_tensor->mutable_data(platform::CPUPlace()) + + i * wh[0].size()); + if (type == "gru") { + ComputeGruWeightScales(graph, &scope, wx_name, wh_name, + &var_quant_scales); + } else { + ComputeLstmWeightScales(graph, &scope, wx_name, wh_name, + &var_quant_scales); + } + bool is_unsigned; + framework::Tensor wx_result_tensor; + + std::tie(is_unsigned, wx_result_tensor) = var_quant_scales[wx_var_names]; + ASSERT_EQ(is_unsigned, false); + ASSERT_EQ(wx_result_tensor.numel(), static_cast(scales.size())); + for (int64_t i = 0; i < wx_result_tensor.numel(); i++) { + ASSERT_FLOAT_EQ(wx_result_tensor.data()[i], scales[i]); + } + } + + private: + std::unique_ptr pass; +}; + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetAttr("use_mkldnn", true); + op->SetAttr("name", name); + if (type == "conv2d") { + op->SetInput("Input", {inputs[0]}); + if (inputs.size() > 1) op->SetInput("Filter", {inputs[1]}); + if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]}); + op->SetOutput("Output", {outputs[0]}); + } else if (type == "fusion_gru" || type == "fusion_lstm") { + op->SetInput("X", {inputs[0]}); + op->SetInput("WeightX", {inputs[1]}); + op->SetInput("WeightH", {inputs[2]}); + op->SetOutput("Hidden", {outputs[0]}); + if (type == "fusion_lstm") op->SetOutput("Cell", {outputs[1]}); + } +} + +ProgramDesc BuildConv2dProgramDesc() { + ProgramDesc prog; + for (auto& v : conv_variable_names) { + prog.MutableBlock(0)->Var(v); + } + SetOp(&prog, "conv2d", "Conv2d", {"conv_in", "filter", "bias"}, {"conv_out"}); + + return prog; +} + 
+ProgramDesc BuildFusionGruProgramDesc() { + ProgramDesc prog; + for (auto& v : rnn_variable_names) { + prog.MutableBlock(0)->Var(v); + } + SetOp(&prog, "fusion_gru", "Fusion_gru", {"x", "wx", "wh"}, {"h"}); + + return prog; +} + +ProgramDesc BuildFusionLstmProgramDesc() { + ProgramDesc prog; + for (auto& v : rnn_variable_names) { + prog.MutableBlock(0)->Var(v); + } + SetOp(&prog, "fusion_lstm", "Fusion_lstm", {"x", "wx", "wh"}, {"h", "c"}); + + return prog; +} + +TEST_F(ComputePropagateScalesMkldnnPassTest, get_scales_function) { + const auto& values = positive_and_negative_values; + float max_val = *std::max_element(values.begin(), values.end()); + + framework::Tensor var_tensor; + var_tensor.Resize(phi::make_dim(values.size(), 1)); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + std::vector results = GetScales(&var_tensor, 0); + + ASSERT_EQ(results.size(), std::size_t(1)); + ASSERT_EQ(results[0], (1.f / max_val)); +} + +TEST_F(ComputePropagateScalesMkldnnPassTest, compute_var_scales) { + auto prog = BuildConv2dProgramDesc(); + const auto& values = positive_and_negative_values; + ir::Graph* graph(new ir::Graph(prog)); + Scope scope; + + PrepareGraph(graph, prog, &scope, conv_variable_names); + + std::initializer_list ops = {"conv2d", "depthwise_conv2d"}; + std::string weight_name = "Filter"; + std::string weight_var_name = "filter"; + + auto axis = 1; + StringPairMap var_quant_scales; + + auto* var = scope.FindVar(weight_var_name); + auto* weight_tensor = var->GetMutable(); + weight_tensor->Resize(phi::make_dim(1, values.size())); + std::copy(begin(values), end(values), + weight_tensor->mutable_data(platform::CPUPlace())); + + auto max_val = *std::max_element(values.begin(), values.end()); + + ComputeVarScales(graph, &scope, ops, weight_name, axis, &var_quant_scales); + + bool is_unsigned; + framework::Tensor result_tensor; + + std::tie(is_unsigned, result_tensor) = var_quant_scales[weight_var_name]; + + ASSERT_EQ(is_unsigned, false); + ASSERT_EQ(result_tensor.numel(), 1); + ASSERT_FLOAT_EQ(result_tensor.data()[0], (1.0 / max_val)); +} + +TEST_F(ComputePropagateScalesMkldnnPassTest, compute_gru_weight_scales) { + ComputeRnnWeightScalesTest("gru", {"fusion_gru", "multi_gru"}, + BuildFusionGruProgramDesc(), gru_scales); +} + +TEST_F(ComputePropagateScalesMkldnnPassTest, compute_lstm_weight_scales) { + ComputeRnnWeightScalesTest("lstm", {"fusion_lstm"}, + BuildFusionLstmProgramDesc(), lstm_scales); +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 410dfbd680286..4aae60b853d4f 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
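For readers skimming the compute_propagate_scales tester above: the per-tensor scale that the get_scales_function case expects is simply the reciprocal of the largest weight value. The function below is a hypothetical sketch mirroring that expectation only (the production pass computes scales per axis and handles more cases):

#include <algorithm>
#include <vector>

// Scale asserted by the get_scales_function test above: 1 / max(values).
float ExpectedTensorScale(const std::vector<float>& values) {
  const float max_val = *std::max_element(values.begin(), values.end());
  return 1.0f / max_val;  // e.g. 1 / 0.0629528 for the sample filter data
}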
-#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" - #include #include #include +#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/string/pretty_log.h" @@ -226,12 +226,21 @@ void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output, bool CPUQuantizePass::AreScalesPresentForVarNames( std::vector names) const { - auto& scales = Get("quant_var_scales"); bool present = true; - for (auto name : names) { - if (scales.find(name) == scales.end()) { - present = false; - LogScaleIsMissingForVarName(name); + if (var_quant_scales_->empty()) { + auto& scales = Get("quant_var_scales"); + for (auto name : names) { + if (scales.find(name) == scales.end()) { + present = false; + LogScaleIsMissingForVarName(name); + } + } + } else { + for (auto name : names) { + if (var_quant_scales_->find(name) == var_quant_scales_->end()) { + present = false; + LogScaleIsMissingForVarName(name); + } } } return present; @@ -239,12 +248,21 @@ bool CPUQuantizePass::AreScalesPresentForVarNames( bool CPUQuantizePass::AreScalesPresentForNodes( std::initializer_list nodes) const { - auto& scales = Get("quant_var_scales"); bool present = true; - for (auto node : nodes) { - if (scales.count(node->Name()) == 0) { - present = false; - LogScaleIsMissingForVarNode(node); + if (var_quant_scales_->empty()) { + auto& scales = Get("quant_var_scales"); + for (auto node : nodes) { + if (scales.count(node->Name()) == 0) { + present = false; + LogScaleIsMissingForVarNode(node); + } + } + } else { + for (auto node : nodes) { + if (var_quant_scales_->count(node->Name()) == 0) { + present = false; + LogScaleIsMissingForVarNode(node); + } } } return present; @@ -252,8 +270,11 @@ bool CPUQuantizePass::AreScalesPresentForNodes( std::pair CPUQuantizePass::GetScaleDataByName( const std::string& name) const { - auto& scales = Get("quant_var_scales"); - return scales.at(name); + if (var_quant_scales_->empty()) { + auto& scales = Get("quant_var_scales"); + return scales.at(name); + } + return var_quant_scales_->at(name); } std::pair CPUQuantizePass::GetScaleDataForNode( @@ -290,6 +311,23 @@ bool CPUQuantizePass::IsOpQuantized(const Node* node) const { }); } +void CPUQuantizePass::GetQuantInfo(Graph* graph) const { + std::unordered_map> info_map{}; + GetInfoFromTheFirstOp(graph, "has_quant_info", "var_quant_scales", &info_map); + + for (auto iter = info_map.begin(); iter != info_map.end(); iter++) { + LoDTensor tensor; + const int size = static_cast(iter->second.size()); + auto* data = tensor.mutable_data({size}, platform::CPUPlace()); + for (int i = 0; i < size; i++) { + data[i] = static_cast(iter->second[i]); + } + + auto pair = std::make_pair(false, tensor); + var_quant_scales_->insert(std::make_pair(iter->first, pair)); + } +} + void CPUQuantizePass::QuantizeConv(Graph* graph, bool with_residual_data) const { GraphPatternDetector gpd; @@ -1138,6 +1176,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL(param_scope(), platform::errors::InvalidArgument( "Scope cannot be nullptr.")); + GetQuantInfo(graph); QuantizeConv(graph, false /* with_residual_data */); QuantizeConv(graph, true /* with_residual_data */); QuantizePool(graph); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index 3a286264e41ff..f1e2527ae6ef0 100644 --- 
a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -95,6 +95,12 @@ class CPUQuantizePass : public FusePassBase { bool IsOpQuantized(const Node* node) const; const std::string name_scope_{"quantize"}; + + private: + VarQuantScale string_pair_map = {}; + VarQuantScale* const var_quant_scales_ = &string_pair_map; + + void GetQuantInfo(Graph* graph) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc index 808d043a4b226..55470db312f81 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -200,10 +200,8 @@ void QuantDequantMkldnnPass::CollectFakeQuantizeOps( for (auto* node_input : op_node->inputs) { if (node_input->Name() == x_var_name) { fake_quant_in = node_input; - break; } else if (node_input->Name() == in_scale_name) { fake_quant_in_scale = node_input; - break; } } @@ -212,10 +210,8 @@ void QuantDequantMkldnnPass::CollectFakeQuantizeOps( for (auto* node_output : op_node->outputs) { if (node_output->Name() == out_var_name) { fake_quant_out = node_output; - break; } else if (node_output->Name() == out_scale_name) { fake_quant_out_scale = node_output; - break; } } diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 74e8ca3f229c6..2336fd1980d2e 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -182,6 +182,8 @@ struct Argument { // A set of op types to enable their bfloat16 kernels DECL_ARGUMENT_FIELD(bfloat16_enabled_op_types, Bfloat16EnabledOpTypes, std::unordered_set); + + DECL_ARGUMENT_FIELD(use_mkldnn_int8, UseMkldnnInt8, bool); #endif // Passed from config. diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index d6eb39e767825..b2d8afaa7b49c 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -107,6 +107,10 @@ void IRPassManager::CreatePasses(Argument *argument, "quantize_excluded_op_ids", new std::unordered_set(argument->quantize_excluded_op_ids())); } else if (pass_name == "cpu_quantize_pass") { + if (argument->quantize_enabled_op_types().count("conv2d") || + argument->quantize_enabled_op_types().count("depthwise_conv2d")) { + pass->Set("data_layout", new std::string("NHWC")); + } pass->Set("quant_var_scales", new VarQuantScale(argument->quant_var_scales())); } else if (pass_name == "cpu_bfloat16_placement_pass") { diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index d08d28a3f6233..4827fe6c1ac97 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -261,6 +261,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_mkldnn_bfloat16_); CP_MEMBER(bfloat16_enabled_op_types_); // Quantization related. 
+ CP_MEMBER(use_mkldnn_int8_); + CP_MEMBER(quantize_enabled_op_types_); + CP_MEMBER(quantize_excluded_op_ids_); CP_MEMBER(use_mkldnn_quantizer_); CP_MEMBER(mkldnn_quantizer_config_); CP_MEMBER(min_input_shape_); @@ -435,6 +438,35 @@ void AnalysisConfig::EnableMkldnnBfloat16() { Update(); } +void AnalysisConfig::EnableMkldnnInt8( + const std::unordered_set &op_list) { +#ifdef PADDLE_WITH_MKLDNN + use_mkldnn_int8_ = true; + use_fc_padding_ = false; + if (!op_list.empty()) { + for (auto &type : op_list) { + if (!quantize_enabled_op_types_.count(type)) { + LOG(ERROR) << "There are unsupported operators in the configured " + "quantization operator list. The unsupported operator " + "is: " + << type; + use_mkldnn_int8_ = false; + break; + } + } + if (use_mkldnn_int8_) { + quantize_enabled_op_types_.clear(); + quantize_enabled_op_types_.insert(op_list.begin(), op_list.end()); + } + } +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnInt8"; + use_mkldnn_int8_ = false; +#endif + + Update(); +} + MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const { PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_, platform::errors::PreconditionNotMet( @@ -632,6 +664,20 @@ void AnalysisConfig::Update() { #endif } + if (use_mkldnn_int8_) { +#ifdef PADDLE_WITH_MKLDNN + if (!enable_ir_optim_) { + LOG(ERROR) << "EnableMkldnnInt8() only works when IR optimization " + "is enabled."; + } else if (!use_mkldnn_) { + LOG(ERROR) << "EnableMkldnnInt8() only works when MKLDNN " + "is enabled."; + } else { + pass_builder()->EnableMkldnnInt8(); + } +#endif + } + #ifdef PADDLE_WITH_MKLDNN // Do not optimize when mkldnn is on if (enable_memory_optim_ && !use_mkldnn_) { @@ -731,6 +777,9 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << use_mkldnn_quantizer_; ss << use_mkldnn_bfloat16_; for (auto &item : bfloat16_enabled_op_types_) ss << item; + ss << use_mkldnn_int8_; + for (auto &item : quantize_enabled_op_types_) ss << item; + for (auto &item : quantize_excluded_op_ids_) ss << item; ss << ";"; ss << model_from_memory_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 99d3f790e253c..f1d56000b03ca 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -949,6 +949,13 @@ void AnalysisPredictor::PrepareArgument() { LOG(INFO) << "Bfloat16 is enabled"; argument_.SetBfloat16EnabledOpTypes(config_.bfloat16_enabled_op_types_); } + + if (config_.use_mkldnn_int8_) { + LOG(INFO) << "Int8 is enabled"; + argument_.SetQuantizeEnabledOpTypes(config_.quantize_enabled_op_types_); + argument_.SetQuantizeExcludedOpIds(config_.quantize_excluded_op_ids_); + argument_.SetQuantVarScales({}); + } #endif auto passes = config_.pass_builder()->AllPasses(); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index bdfe0e46e9ca4..d25f51e4fd41e 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -712,6 +712,20 @@ struct PD_INFER_DECL AnalysisConfig { /// void EnableMkldnnQuantizer(); + /// + /// \brief Turn on MKLDNN int8. + /// + /// \param op_list The operator type list. + /// + void EnableMkldnnInt8(const std::unordered_set& op_list = {}); + + /// + /// \brief A boolean state telling whether to use the MKLDNN Int8. + /// + /// \return bool Whether to use the MKLDNN Int8. 
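Taken together, the pieces above form the user-facing entry point for MKLDNN INT8 inference. A hedged usage sketch of the C++ API added here; as the Update() checks enforce, EnableMkldnnInt8 only takes effect when IR optimization and MKLDNN are also enabled. The model path below is a placeholder, and installed packages normally expose this config through paddle_inference_api.h rather than the in-tree header:

    #include <string>

    #include "paddle/fluid/inference/api/paddle_analysis_config.h"

    paddle::AnalysisConfig MakeInt8Config(const std::string& model_dir) {
      paddle::AnalysisConfig config;
      config.SetModel(model_dir);   // placeholder model directory
      config.DisableGpu();
      config.SwitchIrOptim(true);   // required, see Update() above
      config.EnableMKLDNN();        // required, see Update() above
      // Restrict quantization to a subset of the supported op types; passing
      // an empty set keeps the default list from paddle_analysis_config.h.
      config.EnableMkldnnInt8({"conv2d", "depthwise_conv2d", "fc"});
      return config;
    }
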
+ /// + bool mkldnn_int8_enabled() const { return use_mkldnn_int8_; } + /// /// \brief Turn on MKLDNN bfloat16. /// @@ -981,6 +995,26 @@ struct PD_INFER_DECL AnalysisConfig { std::shared_ptr mkldnn_quantizer_config_; bool use_mkldnn_bfloat16_{false}; std::unordered_set bfloat16_enabled_op_types_; + bool use_mkldnn_int8_{false}; + std::unordered_set quantize_excluded_op_ids_{}; + std::unordered_set quantize_enabled_op_types_{ + "concat", + "conv2d", + "depthwise_conv2d", + "elementwise_add", + "elementwise_mul", + "fc", + "matmul", + "nearest_interp", + "nearest_interp_v2", + "pool2d", + "prior_box", + "reshape2", + "transpose2", + "fusion_gru", + "fusion_lstm", + "multi_gru", + "slice"}; // ipu related. bool use_ipu_{false}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index d0fe3953d00d6..ce733c53059b7 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -220,6 +220,10 @@ void GpuPassStrategy::EnableMkldnnBfloat16() { LOG(ERROR) << "GPU not support MKL-DNN bfloat16"; } +void GpuPassStrategy::EnableMkldnnInt8() { + LOG(ERROR) << "GPU not support MKL-DNN int8"; +} + CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { // NOTE the large fusions should be located in the front, so that they will // not be damaged by smaller ones. @@ -339,6 +343,75 @@ void CpuPassStrategy::EnableMkldnnBfloat16() { #endif } +void CpuPassStrategy::EnableMkldnnInt8() { +#ifdef PADDLE_WITH_MKLDNN + if (!use_mkldnn_int8_) { + passes_.clear(); + passes_.push_back("quant_dequant_mkldnn_pass"); + passes_.push_back("layer_norm_fuse_pass"); + passes_.push_back("attention_lstm_fuse_pass"); + passes_.push_back("seqconv_eltadd_relu_fuse_pass"); + passes_.push_back("fc_lstm_fuse_pass"); + passes_.push_back("mul_lstm_fuse_pass"); + passes_.push_back("fc_gru_fuse_pass"); + passes_.push_back("mul_gru_fuse_pass"); + passes_.push_back("multi_gru_fuse_pass"); + passes_.push_back("multi_gru_seq_fuse_pass"); + passes_.push_back("seq_concat_fc_fuse_pass"); + passes_.push_back("gpu_cpu_squeeze2_matmul_fuse_pass"); + passes_.push_back("gpu_cpu_reshape2_matmul_fuse_pass"); + passes_.push_back("gpu_cpu_flatten2_matmul_fuse_pass"); + passes_.push_back("matmul_v2_scale_fuse_pass"); + passes_.push_back("squared_mat_sub_fuse_pass"); + passes_.push_back("is_test_pass"); + passes_.push_back("gpu_cpu_map_matmul_v2_to_mul_pass"); + passes_.push_back("gpu_cpu_map_matmul_v2_to_matmul_pass"); + passes_.push_back("matmul_scale_fuse_pass"); + passes_.push_back("gpu_cpu_map_matmul_to_mul_pass"); + passes_.push_back("repeated_fc_relu_fuse_pass"); + passes_.push_back("mkldnn_placement_pass"); + passes_.push_back("depthwise_conv_mkldnn_pass"); + passes_.push_back("conv_bn_fuse_pass"); + passes_.push_back("conv_eltwiseadd_bn_fuse_pass"); + passes_.push_back("conv_transpose_bn_fuse_pass"); + passes_.push_back("conv_transpose_eltwiseadd_bn_fuse_pass"); + passes_.push_back("conv_bias_mkldnn_fuse_pass"); + passes_.push_back("conv_transpose_bias_mkldnn_fuse_pass"); + passes_.push_back("conv_elementwise_add_mkldnn_fuse_pass"); + passes_.push_back("conv_concat_relu_mkldnn_fuse_pass"); + passes_.push_back("conv_relu_mkldnn_fuse_pass"); + passes_.push_back("conv_leaky_relu_mkldnn_fuse_pass"); + passes_.push_back("conv_relu6_mkldnn_fuse_pass"); + passes_.push_back("conv_swish_mkldnn_fuse_pass"); + passes_.push_back("conv_hard_swish_mkldnn_fuse_pass"); + passes_.push_back("conv_mish_mkldnn_fuse_pass"); + 
passes_.push_back("conv_hard_sigmoid_mkldnn_fuse_pass"); + passes_.push_back("conv_gelu_mkldnn_fuse_pass"); + passes_.push_back("fc_fuse_pass"); + passes_.push_back("repeated_fc_relu_fuse_pass"); + passes_.push_back("fc_mkldnn_pass"); + passes_.push_back("fc_act_mkldnn_fuse_pass"); + passes_.push_back("matmul_transpose_reshape_fuse_pass"); + passes_.push_back("matmul_v2_transpose_reshape_fuse_pass"); + passes_.push_back("batch_norm_act_fuse_pass"); + passes_.push_back("softplus_activation_mkldnn_fuse_pass"); + passes_.push_back("compute_propagate_scales_mkldnn_pass"); + passes_.push_back("scale_matmul_fuse_pass"); + passes_.push_back("reshape_transpose_matmul_mkldnn_fuse_pass"); + passes_.push_back("reshape_transpose_matmul_v2_mkldnn_fuse_pass"); + passes_.push_back("cpu_quantize_placement_pass"); + passes_.push_back("cpu_quantize_pass"); + passes_.push_back("cpu_quantize_squash_pass"); + passes_.push_back("simplify_with_basic_ops_pass"); + passes_.push_back("mkldnn_inplace_pass"); + passes_.push_back("runtime_context_cache_pass"); + } + use_mkldnn_int8_ = true; +#else + use_mkldnn_int8_ = false; +#endif +} + IpuPassStrategy::IpuPassStrategy() : PassStrategy({}) { passes_.assign({"inference_process_pass"}); } diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 02290ed33ff1c..231ee2cb1e8e6 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -139,6 +139,9 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \brief Enable MKLDNN bfloat16. virtual void EnableMkldnnBfloat16() {} + /// \brief Enable MKLDNN int8. + virtual void EnableMkldnnInt8() {} + /// \brief Check if we are using gpu. /// \return A bool variable implying whether we are in gpu mode. bool use_gpu() const { return use_gpu_; } @@ -189,6 +192,7 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy { use_mkldnn_ = other.use_mkldnn_; use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_; use_mkldnn_bfloat16_ = other.use_mkldnn_bfloat16_; + use_mkldnn_int8_ = other.use_mkldnn_int8_; } /// \brief Default destructor. virtual ~CpuPassStrategy() = default; @@ -205,10 +209,14 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy { /// \brief Enable MKLDNN bfloat16. void EnableMkldnnBfloat16() override; + /// \brief Enable MKLDNN int8. + void EnableMkldnnInt8() override; + protected: /// \cond Protected bool use_mkldnn_quantizer_{false}; bool use_mkldnn_bfloat16_{false}; + bool use_mkldnn_int8_{false}; /// \endcond }; @@ -243,6 +251,9 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { /// \brief Not supported in GPU mode yet. void EnableMkldnnBfloat16() override; + /// \brief Not supported in GPU mode yet. + void EnableMkldnnInt8() override; + /// \brief Default destructor. 
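As with EnableMkldnnBfloat16, the CpuPassStrategy override above guards on use_mkldnn_int8_ so a repeated call does not rebuild or duplicate the pass list. A stripped-down sketch of that enable-once pattern, with a plain vector of pass names standing in for the real builder (the class name and the abbreviated pass list are purely illustrative):

    #include <string>
    #include <vector>

    class TinyPassStrategy {
     public:
      void EnableInt8() {
        if (!int8_enabled_) {
          // The INT8 pipeline replaces the default FP32 pass list entirely.
          passes_ = {"quant_dequant_mkldnn_pass", "cpu_quantize_placement_pass",
                     "cpu_quantize_pass", "cpu_quantize_squash_pass"};
        }
        int8_enabled_ = true;  // later calls are no-ops
      }
      const std::vector<std::string>& passes() const { return passes_; }

     private:
      std::vector<std::string> passes_;
      bool int8_enabled_{false};
    };
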
virtual ~GpuPassStrategy() = default; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 8c96499a022f7..06d1cd0814eb2 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -168,7 +168,7 @@ function(inference_analysis_api_test_with_fake_data_run TARGET_NAME test_binary --disable_mkldnn_fc=${disable_fc}) endfunction() -function(inference_analysis_api_quant_test_run TARGET_NAME test_binary fp32_model_dir int8_model_dir data_path) +function(inference_analysis_api_quant_test_run TARGET_NAME test_binary fp32_model_dir int8_model_dir data_path enable_quant_int8) inference_analysis_test_run(${TARGET_NAME} COMMAND ${test_binary} ARGS --fp32_model=${fp32_model_dir} @@ -176,6 +176,7 @@ function(inference_analysis_api_quant_test_run TARGET_NAME test_binary fp32_mode --infer_data=${data_path} --batch_size=50 --enable_int8=true + --enable_quant_int8=${enable_quant_int8} --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} --with_accuracy_layer=false --iterations=2) @@ -554,7 +555,20 @@ if(WITH_MKLDNN) download_quant_data_without_verify(${QUANT2_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf.tar.gz") endif(NOT LINUX) download_quant_data_without_verify(${QUANT2_INT8_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf_int8.tar.gz") - inference_analysis_api_quant_test_run(test_analyzer_quant_performance_benchmark ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${QUANT2_INT8_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf_int8 ${IMAGENET_DATA_PATH}) + inference_analysis_api_quant_test_run(test_analyzer_quant_performance_benchmark ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${QUANT2_INT8_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf_int8 ${IMAGENET_DATA_PATH} false) + + # Quant2 MobileNetV1 + inference_analysis_api_quant_test_run(test_analyzer_quant2_mobilenetv1_mkldnn ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${IMAGENET_DATA_PATH} true) + + # Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes, + # with weight scales in `fake_channel_wise_dequantize_max_abs` operators + set(QUANT2_RESNET50_CHANNELWISE_MODEL_DIR "${QUANT_DATA_DIR}/ResNet50_quant2_channelwise") + set(QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE "ResNet50_qat_channelwise.tar.gz") + if(NOT LINUX) + download_quant_data_without_verify(${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR} ${QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE}) + endif(NOT LINUX) + set(QUANT2_RESNET50_MODEL ${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR}/ResNet50_qat_channelwise) + inference_analysis_api_quant_test_run(test_analyzer_quant2_resnet50_channelwise_mkldnn ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_RESNET50_MODEL} ${QUANT2_RESNET50_MODEL} ${IMAGENET_DATA_PATH} true) ### Other tests @@ -774,6 +788,8 @@ if(WITH_MKLDNN) set_tests_properties(test_analyzer_int8_mobilenetv2 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_int8_mobilenetv1 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_int8_mobilenetv3_large PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_quant2_mobilenetv1_mkldnn PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_quant2_resnet50_channelwise_mkldnn PROPERTIES TIMEOUT 120) endif() set_tests_properties(lite_resnet50_test PROPERTIES TIMEOUT 120) diff --git 
a/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc index 5e867fc87fea3..4bb59f3c8df42 100644 --- a/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc @@ -26,8 +26,7 @@ namespace analysis { void SetConfig(AnalysisConfig *cfg, std::string model_path) { cfg->SetModel(model_path); cfg->DisableGpu(); - cfg->SwitchIrOptim(false); - cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(true); cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads); if (FLAGS_enable_mkldnn) cfg->EnableMKLDNN(); } @@ -113,9 +112,11 @@ void SetInput(std::vector> *inputs, TEST(Analyzer_quant_image_classification, quantization) { AnalysisConfig fp32_cfg; SetConfig(&fp32_cfg, FLAGS_fp32_model); + fp32_cfg.EnableMKLDNN(); AnalysisConfig int8_cfg; SetConfig(&int8_cfg, FLAGS_int8_model); + if (FLAGS_enable_quant_int8) int8_cfg.EnableMkldnnInt8(); // read data from file and prepare batches with test data std::vector> input_slots_all; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index e63dfd14175b9..f2df018f4978a 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -53,6 +53,7 @@ DEFINE_bool(with_accuracy_layer, true, DEFINE_bool(enable_fp32, true, "Enable FP32 type prediction"); DEFINE_bool(enable_bf16, false, "Enable BF16 type prediction"); DEFINE_bool(enable_int8, false, "Enable INT8 type prediction"); +DEFINE_bool(enable_quant_int8, false, "Enable QUANT INT8 type prediction"); DEFINE_int32(warmup_batch_size, 100, "batch size for quantization warmup"); // setting iterations to 0 means processing the whole dataset DEFINE_int32(iterations, 0, "number of batches to process"); diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 97f3722008769..91d5d39622714 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -695,6 +695,10 @@ void BindAnalysisConfig(py::module *m) { .def("set_mkldnn_cache_capacity", &AnalysisConfig::SetMkldnnCacheCapacity, py::arg("capacity") = 0) .def("set_bfloat16_op", &AnalysisConfig::SetBfloat16Op) + .def("enable_mkldnn_int8", &AnalysisConfig::EnableMkldnnInt8, + py::arg("mkldnn_int8_enabled_op_types") = + std::unordered_set({})) + .def("mkldnn_int8_enabled", &AnalysisConfig::mkldnn_int8_enabled) #endif .def("set_mkldnn_op", &AnalysisConfig::SetMKLDNNOp) .def("set_model_buffer", &AnalysisConfig::SetModelBuffer) From 2ab986aeb7eecc7c28dc5b1907bf3f5ca72911e4 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 14 Apr 2022 11:01:26 +0800 Subject: [PATCH 10/19] [Phi] Unify dispatch macros to visit (#41653) * chnage dispatch to visit * resolve conflict --- paddle/phi/api/ext/dispatch.h | 318 ++-------------- paddle/phi/api/lib/data_transform.cc | 1 - paddle/phi/core/visit_type.h | 338 ++++++++++++++++++ paddle/phi/kernels/cpu/cast_grad_kernel.cc | 2 + paddle/phi/kernels/cpu/cast_impl.h | 2 +- paddle/phi/kernels/cpu/cast_kernel.cc | 1 + .../kernels/cpu/cross_entropy_grad_kernel.cc | 6 +- paddle/phi/kernels/cpu/elementwise_kernel.cc | 1 - paddle/phi/kernels/cpu/reduce.h | 2 +- paddle/phi/kernels/cpu/transpose_kernel.cc | 3 +- paddle/phi/kernels/funcs/reduce_function.h | 1 - paddle/phi/kernels/gpu/cast_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/cast_impl.h | 2 +- 
paddle/phi/kernels/gpu/cast_kernel.cu | 4 +- .../kernels/gpu/cross_entropy_grad_kernel.cu | 6 +- .../phi/kernels/gpu/cross_entropy_kernel.cu | 30 +- paddle/phi/kernels/gpu/reduce.h | 1 + paddle/phi/kernels/gpu/reduce_grad.h | 2 +- paddle/phi/kernels/gpu/transpose_kernel.cu | 1 - .../sparse/cpu/convolution_grad_kernel.cc | 5 +- .../kernels/sparse/cpu/convolution_kernel.cc | 5 +- .../kernels/sparse/cpu/sparse_mask_kernel.cc | 7 +- .../sparse/cpu/sparse_pool_grad_kernel.cc | 5 +- .../kernels/sparse/cpu/sparse_pool_kernel.cc | 5 +- .../phi/kernels/sparse/gpu/convolution.cu.h | 2 +- .../sparse/gpu/convolution_grad_kernel.cu | 5 +- .../kernels/sparse/gpu/convolution_kernel.cu | 5 +- .../kernels/sparse/gpu/sparse_mask_kernel.cu | 7 +- .../sparse/gpu/sparse_pool_grad_kernel.cu | 5 +- .../kernels/sparse/gpu/sparse_pool_kernel.cu | 5 +- paddle/phi/kernels/transfer_layout_kernel.cc | 2 +- paddle/phi/kernels/xpu/full_kernel.cc | 2 +- 32 files changed, 421 insertions(+), 364 deletions(-) create mode 100644 paddle/phi/core/visit_type.h diff --git a/paddle/phi/api/ext/dispatch.h b/paddle/phi/api/ext/dispatch.h index 6b6d0ae7fe723..aa9cd0f53a4c6 100644 --- a/paddle/phi/api/ext/dispatch.h +++ b/paddle/phi/api/ext/dispatch.h @@ -14,327 +14,57 @@ limitations under the License. */ #pragma once -#include "paddle/phi/api/ext/exception.h" -#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/visit_type.h" namespace paddle { -///////// Basic Marco /////////// - -#define PD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, HINT, ...) \ - case enum_type: { \ - using HINT = type; \ - __VA_ARGS__(); \ - break; \ - } - -#define PD_PRIVATE_CASE_TYPE(NAME, enum_type, type, ...) \ - PD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, data_t, __VA_ARGS__) +// Note: Keep this file only for compatibility with custom operators ///////// Floating Dispatch Marco /////////// -#define PD_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() +#define PD_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + PD_VISIT_FLOATING_TYPES(TYPE, NAME, __VA_ARGS__) -#define PD_DISPATCH_FLOATING_AND_HALF_TYPES(TYPE, NAME, ...) \ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT16, paddle::float16, __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() +#define PD_DISPATCH_FLOATING_AND_HALF_TYPES(TYPE, NAME, ...) \ + PD_VISIT_FLOATING_AND_HALF_TYPES(TYPE, NAME, __VA_ARGS__) ///////// Integral Dispatch Marco /////////// -#define PD_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) 
\ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::INT64, int64_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::UINT8, uint8_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::INT16, int16_t, __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() +#define PD_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \ + PD_VISIT_INTEGRAL_TYPES(TYPE, NAME, __VA_ARGS__) ///////// Complex Dispatch Marco /////////// -#define PD_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...) \ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX64, \ - ::paddle::complex64, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX128, \ - ::paddle::complex128, \ - __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() +#define PD_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...) \ + PD_VISIT_COMPLEX_TYPES(TYPE, NAME, __VA_ARGS__) ///////// Floating and Integral Dispatch Marco /////////// -#define PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, ...) \ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::INT64, int64_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::UINT8, uint8_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::INT16, int16_t, __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() +#define PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, ...) \ + PD_VISIT_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, __VA_ARGS__) ///////// Floating and Complex Dispatch Marco /////////// -#define PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...) \ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX64, \ - ::paddle::complex64, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX128, \ - ::paddle::complex128, \ - __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() +#define PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...) \ + PD_VISIT_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, __VA_ARGS__) ///////// Floating and Complex and other type Dispatch Marco /////////// -#define PD_DISPATCH_FLOATING_AND_COMPLEX_AND_1_TYPES( \ - SPECIFIED_TYPE, TYPE, NAME, ...) 
\ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE( \ - NAME, \ - SPECIFIED_TYPE, \ - ::paddle::experimental::DataTypeToCppType::type, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX64, \ - ::paddle::complex64, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX128, \ - ::paddle::complex128, \ - __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() +#define PD_DISPATCH_FLOATING_AND_COMPLEX_AND_1_TYPE( \ + SPECIFIED_TYPE, TYPE, NAME, ...) \ + PD_VISIT_FLOATING_AND_COMPLEX_AND_1_TYPE( \ + SPECIFIED_TYPE, TYPE, NAME, __VA_ARGS__) ///////// Floating and Complex and 2 other type Dispatch Marco /////////// -#define PD_DISPATCH_FLOATING_AND_COMPLEX_AND_2_TYPES( \ - SPECIFIED_TYPE1, SPECIFIED_TYPE2, TYPE, NAME, ...) \ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE( \ - NAME, \ - SPECIFIED_TYPE1, \ - ::paddle::experimental::DataTypeToCppType::type, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, \ - SPECIFIED_TYPE2, \ - ::paddle::experimental::DataTypeToCppType::type, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX64, \ - ::paddle::complex64, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX128, \ - ::paddle::complex128, \ - __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() +#define PD_DISPATCH_FLOATING_AND_COMPLEX_AND_2_TYPES( \ + SPECIFIED_TYPE1, SPECIFIED_TYPE2, TYPE, NAME, ...) \ + PD_VISIT_FLOATING_AND_COMPLEX_AND_2_TYPES( \ + SPECIFIED_TYPE1, SPECIFIED_TYPE2, TYPE, NAME, __VA_ARGS__) ///////// Floating, Integral and Complex Dispatch Marco /////////// -#define PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(TYPE, NAME, ...) \ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::INT64, int64_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::UINT8, uint8_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::INT16, int16_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX64, \ - ::paddle::complex64, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX128, \ - ::paddle::complex128, \ - __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() - -// TODO(chenweihang): Add more Marcos in the future if needed - -#define PD_VISIT_ALL_TYPES(TYPE, NAME, ...) 
\ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::BOOL, bool, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT8, int8_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::UINT8, uint8_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT16, int16_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT32, int32_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT64, int64_t, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::phi::DataType::BFLOAT16, \ - paddle::experimental::bfloat16, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::phi::DataType::FLOAT16, \ - paddle::experimental::float16, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::FLOAT32, float, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::phi::DataType::FLOAT64, double, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::phi::DataType::COMPLEX64, \ - paddle::experimental::complex64, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::phi::DataType::COMPLEX128, \ - paddle::experimental::complex128, \ - __VA_ARGS__) \ - default: \ - PADDLE_THROW(phi::errors::InvalidArgument( \ - "Invalid enum data type `%d`.", static_cast(__dtype__))); \ - } \ - }() - -#define PD_VISIT_BOOL_AND_FLOATING_AND_COMPLEX_AND_3_TYPES( \ - SPECIFIED_TYPE1, SPECIFIED_TYPE2, SPECIFIED_TYPE3, TYPE, NAME, ...) \ - [&] { \ - const auto& __dtype__ = TYPE; \ - switch (__dtype__) { \ - PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::BOOL, bool, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX64, \ - ::paddle::complex64, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, \ - ::paddle::DataType::COMPLEX128, \ - ::paddle::complex128, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, \ - SPECIFIED_TYPE1, \ - ::paddle::experimental::DataTypeToCppType::type, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, \ - SPECIFIED_TYPE2, \ - ::paddle::experimental::DataTypeToCppType::type, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE( \ - NAME, \ - SPECIFIED_TYPE3, \ - ::paddle::experimental::DataTypeToCppType::type, \ - __VA_ARGS__) \ - default: \ - PD_THROW("function " #NAME " is not implemented for data type `", \ - __dtype__, \ - "`"); \ - } \ - }() +#define PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(TYPE, NAME, ...) \ + PD_VISIT_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(TYPE, NAME, __VA_ARGS__) } // namespace paddle diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 82d2e741e9de8..d4e92ded324da 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/phi/api/lib/data_transform.h" -#include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/backends/all_context.h" diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h new file mode 100644 index 0000000000000..bd972c8ceedc7 --- /dev/null +++ b/paddle/phi/core/visit_type.h @@ -0,0 +1,338 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/api/ext/exception.h" +#include "paddle/phi/common/data_type.h" + +namespace phi { + +///////// Basic Marco /////////// + +#define PD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, HINT, ...) \ + case enum_type: { \ + using HINT = type; \ + __VA_ARGS__(); \ + break; \ + } + +#define PD_PRIVATE_CASE_TYPE(NAME, enum_type, type, ...) \ + PD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, data_t, __VA_ARGS__) + +///////// Floating Dispatch Marco /////////// + +#define PD_VISIT_FLOATING_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +#define PD_VISIT_FLOATING_AND_HALF_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT16, paddle::float16, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +///////// Integral Dispatch Marco /////////// + +#define PD_VISIT_INTEGRAL_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT64, int64_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::UINT8, uint8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT16, int16_t, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +///////// Complex Dispatch Marco /////////// + +#define PD_VISIT_COMPLEX_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, \ + __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +///////// Floating and Integral Dispatch Marco /////////// + +#define PD_VISIT_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT64, int64_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::UINT8, uint8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT16, int16_t, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +///////// Floating and Complex Dispatch Marco /////////// + +#define PD_VISIT_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, \ + __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +///////// Floating and Complex and other type Dispatch Marco /////////// + +#define PD_VISIT_FLOATING_AND_COMPLEX_AND_1_TYPE( \ + SPECIFIED_TYPE, TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, \ + SPECIFIED_TYPE, \ + ::paddle::experimental::DataTypeToCppType::type, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, \ + __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +///////// Floating and Complex and 2 other type Dispatch Marco /////////// + +#define PD_VISIT_FLOATING_AND_COMPLEX_AND_2_TYPES( \ + SPECIFIED_TYPE1, SPECIFIED_TYPE2, TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, \ + SPECIFIED_TYPE1, \ + ::paddle::experimental::DataTypeToCppType::type, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, \ + SPECIFIED_TYPE2, \ + ::paddle::experimental::DataTypeToCppType::type, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, \ + __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +///////// Floating, Integral and Complex Dispatch Marco /////////// + +#define PD_VISIT_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT64, int64_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::UINT8, uint8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT16, int16_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, \ + __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +#define PD_VISIT_ALL_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::BOOL, bool, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT8, int8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::UINT8, uint8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT16, int16_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT32, int32_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT64, int64_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::phi::DataType::BFLOAT16, \ + paddle::experimental::bfloat16, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::phi::DataType::FLOAT16, \ + paddle::experimental::float16, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::phi::DataType::FLOAT64, double, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::phi::DataType::COMPLEX64, \ + paddle::experimental::complex64, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::phi::DataType::COMPLEX128, \ + paddle::experimental::complex128, \ + __VA_ARGS__) \ + default: \ + PADDLE_THROW(phi::errors::InvalidArgument( \ + "Invalid enum data type `%d`.", static_cast(__dtype__))); \ + } \ + }() + +#define PD_VISIT_BOOL_AND_FLOATING_AND_COMPLEX_AND_3_TYPES( \ + SPECIFIED_TYPE1, SPECIFIED_TYPE2, SPECIFIED_TYPE3, TYPE, NAME, ...) 
\ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::BOOL, bool, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, \ + ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, \ + SPECIFIED_TYPE1, \ + ::paddle::experimental::DataTypeToCppType::type, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, \ + SPECIFIED_TYPE2, \ + ::paddle::experimental::DataTypeToCppType::type, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, \ + SPECIFIED_TYPE3, \ + ::paddle::experimental::DataTypeToCppType::type, \ + __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/cast_grad_kernel.cc b/paddle/phi/kernels/cpu/cast_grad_kernel.cc index c294c743bd4cf..79f53cbce1a4a 100644 --- a/paddle/phi/kernels/cpu/cast_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cast_grad_kernel.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/cast_grad_kernel.h" + #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/cpu/cast_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/cast_impl.h b/paddle/phi/kernels/cpu/cast_impl.h index d39ef24e7beb1..9648b584243f5 100644 --- a/paddle/phi/kernels/cpu/cast_impl.h +++ b/paddle/phi/kernels/cpu/cast_impl.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "paddle/phi/api/ext/dispatch.h" + #include "paddle/phi/backends/cpu/cpu_context.h" // See Note [ Why still include the fluid headers? ] diff --git a/paddle/phi/kernels/cpu/cast_kernel.cc b/paddle/phi/kernels/cpu/cast_kernel.cc index b53c94eb4cae2..2132f0d5ae86c 100644 --- a/paddle/phi/kernels/cpu/cast_kernel.cc +++ b/paddle/phi/kernels/cpu/cast_kernel.cc @@ -16,6 +16,7 @@ #include "paddle/phi/kernels/cpu/cast_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc b/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc index d4a632b5e6ece..021fdac225330 100644 --- a/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cross_entropy_grad_kernel.cc @@ -16,13 +16,11 @@ limitations under the License. 
*/ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/axis_utils.h" #include "paddle/phi/kernels/funcs/eigen/common.h" -// TODO(chenweihang): move dispatch.h into phi/core -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { template @@ -200,7 +198,7 @@ void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx, axis, logits_grad); } else { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( dtype, "CrossEntropyWithSoftmaxGradCPUKernel", ([&] { CrossEntropyWithSoftmaxGradCPUKernel(dev_ctx, label, diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc index 4ca41de7bb64a..a91ca1ee3244b 100644 --- a/paddle/phi/kernels/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/phi/kernels/cpu/elementwise.h" -#include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" diff --git a/paddle/phi/kernels/cpu/reduce.h b/paddle/phi/kernels/cpu/reduce.h index af67bdf5d624f..06a458832d19f 100644 --- a/paddle/phi/kernels/cpu/reduce.h +++ b/paddle/phi/kernels/cpu/reduce.h @@ -16,8 +16,8 @@ #include -#include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/cast_kernel.h" #include "paddle/phi/api/lib/utils/storage.h" diff --git a/paddle/phi/kernels/cpu/transpose_kernel.cc b/paddle/phi/kernels/cpu/transpose_kernel.cc index 5dc4866e1efc3..a2f5aa2a29795 100644 --- a/paddle/phi/kernels/cpu/transpose_kernel.cc +++ b/paddle/phi/kernels/cpu/transpose_kernel.cc @@ -13,8 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/transpose_kernel.h" + #include -#include "paddle/phi/api/ext/dispatch.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index 4eb6ba0310886..b414dfc5d6e84 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -35,7 +35,6 @@ namespace cub = hipcub; #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/kernels/gpu/cast_grad_kernel.cu b/paddle/phi/kernels/gpu/cast_grad_kernel.cu index 1c1d8cf2c06d4..f4b610301583c 100644 --- a/paddle/phi/kernels/gpu/cast_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cast_grad_kernel.cu @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cast_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/gpu/cast_impl.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/cast_impl.h b/paddle/phi/kernels/gpu/cast_impl.h index 8f6351e675cfa..f73d396572541 100644 --- a/paddle/phi/kernels/gpu/cast_impl.h +++ b/paddle/phi/kernels/gpu/cast_impl.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "paddle/phi/api/ext/dispatch.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" diff --git a/paddle/phi/kernels/gpu/cast_kernel.cu b/paddle/phi/kernels/gpu/cast_kernel.cu index 40a84648e4b16..a879dc3bafd74 100644 --- a/paddle/phi/kernels/gpu/cast_kernel.cu +++ b/paddle/phi/kernels/gpu/cast_kernel.cu @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cast_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/gpu/cast_impl.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu index 215b94c52b395..c66daf4fe64e6 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu @@ -24,15 +24,13 @@ namespace cub = hipcub; #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/axis_utils.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" -// TODO(chenweihang): move dispatch.h into phi/core -#include "paddle/phi/api/ext/dispatch.h" - #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" @@ -267,7 +265,7 @@ void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx, axis, logits_grad); } else { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( dtype, "CrossEntropyWithSoftmaxGradGPUKernel", ([&] { CrossEntropyWithSoftmaxGradGPUKernel(dev_ctx, label, diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu index 055706cffd41e..1908c78060483 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu @@ -24,15 +24,13 @@ namespace cub = hipcub; #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/axis_utils.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" -// TODO(chenweihang): move dispatch.h into phi/core -#include "paddle/phi/api/ext/dispatch.h" - #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" @@ -1529,19 +1527,19 @@ void CrossEntropyWithSoftmaxKernel(const Context& dev_ctx, softmax, loss); } else { 
- PD_DISPATCH_INTEGRAL_TYPES( - dtype, "CrossEntropyWithSoftmaxCUDAKernel", ([&] { - CrossEntropyWithSoftmaxCUDAKernel(dev_ctx, - logits, - label, - soft_label, - use_softmax, - numeric_stable_mode, - ignore_index, - axis, - softmax, - loss); - })); + PD_VISIT_INTEGRAL_TYPES(dtype, "CrossEntropyWithSoftmaxCUDAKernel", ([&] { + CrossEntropyWithSoftmaxCUDAKernel( + dev_ctx, + logits, + label, + soft_label, + use_softmax, + numeric_stable_mode, + ignore_index, + axis, + softmax, + loss); + })); } } diff --git a/paddle/phi/kernels/gpu/reduce.h b/paddle/phi/kernels/gpu/reduce.h index a54669c6e9d42..6fb81edd6bf47 100644 --- a/paddle/phi/kernels/gpu/reduce.h +++ b/paddle/phi/kernels/gpu/reduce.h @@ -18,6 +18,7 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU_KP) +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/reduce_function.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/reduce_grad.h b/paddle/phi/kernels/gpu/reduce_grad.h index 1e39a08e9cbaf..e1f7419fb7a01 100644 --- a/paddle/phi/kernels/gpu/reduce_grad.h +++ b/paddle/phi/kernels/gpu/reduce_grad.h @@ -23,7 +23,7 @@ #include #include -#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/transpose_kernel.cu b/paddle/phi/kernels/gpu/transpose_kernel.cu index 9ea2af292ccf1..203f10e4ddd47 100644 --- a/paddle/phi/kernels/gpu/transpose_kernel.cu +++ b/paddle/phi/kernels/gpu/transpose_kernel.cu @@ -14,7 +14,6 @@ #include -#include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/transpose_kernel.h" diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc index 80693c90d1e7f..216685f0f7191 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -13,13 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/sparse/cpu/convolution.h" -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { namespace sparse { @@ -191,7 +190,7 @@ void Conv3dGradKernel(const Context& dev_ctx, const bool subm, SparseCooTensor* x_grad, DenseTensor* kernel_grad) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Conv3dGradCPUKernel", ([&] { Conv3dGradCPUKernel(dev_ctx, x, diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index a1c8cf014c7fb..c920f3c461287 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -15,10 +15,9 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/sparse/cpu/convolution.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { namespace sparse { @@ -159,7 +158,7 @@ void Conv3dKernel(const Context& dev_ctx, const bool subm, SparseCooTensor* out, DenseTensor* rulebook) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Conv3dCPUKernel", ([&] { Conv3dCPUKernel(dev_ctx, x, diff --git a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc index a07a7fb2ecf44..c10a240c68430 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc @@ -16,13 +16,12 @@ limitations under the License. */ #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/sparse/common_shape.h" -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { namespace sparse { @@ -78,7 +77,7 @@ void SparseMaskKernel(const Context& dev_ctx, const DenseTensor& x, const SparseCooTensor& mask, SparseCooTensor* out) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( mask.non_zero_indices().dtype(), "SparseMaskCPUKernel", ([&] { SparseMaskCPUKernel(dev_ctx, x, mask, out); })); @@ -145,7 +144,7 @@ void SparseMaskHelperKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& mask_indices, DenseTensor* out) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "SparseMaskHelperCPUKernel", ([&] { SparseMaskHelperCPUKernel(dev_ctx, x, mask_indices, out); })); diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc index 30221975e7756..78b6354f44f9e 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc @@ -14,13 +14,12 @@ limitations under the License. */ #include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/sparse/convolution.h" -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { namespace sparse { @@ -82,7 +81,7 @@ void MaxPoolGradKernel(const Context& dev_ctx, const SparseCooTensor& out_grad, const std::vector& kernel_sizes, SparseCooTensor* x_grad) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "MaxPoolGradCPUKernel", ([&] { MaxPoolGradCPUKernel( dev_ctx, x, rulebook, out, out_grad, kernel_sizes, x_grad); diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc index ed6e0200587e8..28211a1cda347 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc @@ -15,12 +15,11 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/sparse/convolution.h" #include "paddle/phi/kernels/sparse/cpu/convolution.h" -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { namespace sparse { @@ -106,7 +105,7 @@ void MaxPoolKernel(const Context& dev_ctx, const std::vector& strides, SparseCooTensor* out, DenseTensor* rulebook) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "MaxPoolCPUKernel", ([&] { MaxPoolCPUKernel(dev_ctx, x, diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 5662a4fac71c5..1bceb767b6708 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -338,7 +338,7 @@ int ProductRuleBook(const Context& dev_ctx, SparseCooTensor* out, std::vector* h_counter, std::vector* h_offsets) { - // TODO(zhangkaihuo): use PD_DISPATCH_INTEGRAL_TYPES for secondary dispatch + // TODO(zhangkaihuo): use PD_VISIT_INTEGRAL_TYPES for secondary dispatch auto indices_dtype = paddle::experimental::CppTypeToDataType::Type(); const int64_t non_zero_num = x.nnz(); const auto& non_zero_indices = x.non_zero_indices(); diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 2b61be7289646..6c37f759923c3 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -18,14 +18,13 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { namespace sparse { @@ -249,7 +248,7 @@ void Conv3dGradKernel(const Context& dev_ctx, const bool subm, SparseCooTensor* x_grad, DenseTensor* kernel_grad) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Conv3dGradGPUKernel", ([&] { Conv3dGradGPUKernel(dev_ctx, x, diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 2d212eadffac1..83f19ce5785df 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -15,12 +15,11 @@ limitations under the License. 
*/ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { namespace sparse { @@ -177,7 +176,7 @@ void Conv3dKernel(const Context& dev_ctx, const bool subm, SparseCooTensor* out, DenseTensor* rulebook) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "Conv3dGPUKernel", ([&] { Conv3dGPUKernel(dev_ctx, x, diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index 96ab56697b9b0..dff1cc2318f13 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -19,14 +19,13 @@ limitations under the License. */ #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/sparse/common_shape.h" #include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { namespace sparse { @@ -118,7 +117,7 @@ void SparseMaskKernel(const Context& dev_ctx, const DenseTensor& x, const SparseCooTensor& mask, SparseCooTensor* out) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( mask.non_zero_indices().dtype(), "SparseMaskGPUKernel", ([&] { SparseMaskGPUKernel(dev_ctx, x, mask, out); })); @@ -265,7 +264,7 @@ void SparseMaskHelperKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& mask_indices, DenseTensor* out) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "SparseMaskHelperGPUKernel", ([&] { SparseMaskHelperGPUKernel(dev_ctx, x, mask_indices, out); })); diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu index 8657e7319d8ca..bd862a44afeeb 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu @@ -18,14 +18,13 @@ limitations under the License. 
*/ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/sparse/convolution.h" -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { namespace sparse { @@ -129,7 +128,7 @@ void MaxPoolGradKernel(const Context& dev_ctx, const SparseCooTensor& out_grad, const std::vector& kernel_sizes, SparseCooTensor* x_grad) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "MaxPoolGradGPUKernel", ([&] { MaxPoolGradGPUKernel( dev_ctx, x, rulebook, out, out_grad, kernel_sizes, x_grad); diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu index a59cd3c7a5a78..b76b61f83bfc9 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu @@ -16,12 +16,11 @@ limitations under the License. */ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/sparse/convolution.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" -#include "paddle/phi/api/ext/dispatch.h" - namespace phi { namespace sparse { @@ -136,7 +135,7 @@ void MaxPoolKernel(const Context& dev_ctx, const std::vector& strides, SparseCooTensor* out, DenseTensor* rulebook) { - PD_DISPATCH_INTEGRAL_TYPES( + PD_VISIT_INTEGRAL_TYPES( x.non_zero_indices().dtype(), "MaxPoolGPUKernel", ([&] { MaxPoolGPUKernel(dev_ctx, x, diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc index 60df877355b82..f7ecf379fdfa9 100644 --- a/paddle/phi/kernels/transfer_layout_kernel.cc +++ b/paddle/phi/kernels/transfer_layout_kernel.cc @@ -14,9 +14,9 @@ limitations under the License. */ #include "paddle/phi/kernels/transfer_layout_kernel.h" -#include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { diff --git a/paddle/phi/kernels/xpu/full_kernel.cc b/paddle/phi/kernels/xpu/full_kernel.cc index 6668ae39cbdbe..978bdb5129c04 100644 --- a/paddle/phi/kernels/xpu/full_kernel.cc +++ b/paddle/phi/kernels/xpu/full_kernel.cc @@ -14,13 +14,13 @@ #include "paddle/phi/kernels/full_kernel.h" -#include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/memory/memcpy.h" From 5a6182b80481542a81b1cfdc7ff3dcdbf1926d4e Mon Sep 17 00:00:00 2001 From: Wilber Date: Thu, 14 Apr 2022 11:08:23 +0800 Subject: [PATCH 11/19] infrt run once (A trick version) (#41634) * temporariliy run once * update * update * update * update * fix ci problem --- paddle/infrt/dialect/phi/ir/phi_base.cc | 2 +- paddle/infrt/dialect/tensorrt/convert.h | 2 +- paddle/infrt/dialect/tensorrt/trt_exec.cc | 2 +- paddle/infrt/host_context/op_executable.cc | 11 ++++- .../infrt/kernel/phi/dense_tensor_kernels.cc | 47 +++++++++++++------ .../infrt/kernel/phi/dense_tensor_kernels.h | 7 +-- paddle/infrt/kernel/tensor_kernels.cc | 7 ++- paddle/infrt/kernel/tensorrt/trt_kernels.cc | 2 +- paddle/infrt/kernel/tensorrt/trt_layers.h | 21 +++++++++ 9 files changed, 76 insertions(+), 25 deletions(-) diff --git a/paddle/infrt/dialect/phi/ir/phi_base.cc b/paddle/infrt/dialect/phi/ir/phi_base.cc index f91381fe72903..1bd6068d3fb96 100644 --- a/paddle/infrt/dialect/phi/ir/phi_base.cc +++ b/paddle/infrt/dialect/phi/ir/phi_base.cc @@ -14,7 +14,7 @@ #include "paddle/infrt/dialect/phi/ir/phi_base.h" -#include +#include #include #include #include diff --git a/paddle/infrt/dialect/tensorrt/convert.h b/paddle/infrt/dialect/tensorrt/convert.h index be363e77848a5..2a242ca285ba8 100644 --- a/paddle/infrt/dialect/tensorrt/convert.h +++ b/paddle/infrt/dialect/tensorrt/convert.h @@ -15,7 +15,7 @@ #include #include -#include +#include #include #include #include diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc index 2682a744bb056..dcb84ceb50edf 100644 --- a/paddle/infrt/dialect/tensorrt/trt_exec.cc +++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc @@ -87,7 +87,7 @@ int main(int argc, char** argv) { std::cout << "\npass failed!\n" << std::endl; return 4; } - // module->dump(); + module->dump(); ::infrt::host_context::TestMlir(module.get(), ®istry); return 0; } diff --git a/paddle/infrt/host_context/op_executable.cc b/paddle/infrt/host_context/op_executable.cc index 59a73e7108328..4d588a9c2b523 100644 --- a/paddle/infrt/host_context/op_executable.cc +++ b/paddle/infrt/host_context/op_executable.cc @@ -16,6 +16,7 @@ #include #include +#include #include "paddle/infrt/host_context/kernel_frame.h" #include "paddle/infrt/host_context/kernel_registry.h" @@ -71,7 +72,15 @@ OpExecutableBuilder::OpExecutableBuilder(const std::string& op_name, // TODO(Superjomn) support other device other than CPU. CHECK(impl_->kernel_impl) << "No CPU kernel called " << op_name; - if (op_name == "dt.get_param") { + // TODO(wilber): Maybe we can use the MLIR trait or other facilities to remove + // the run_once set. 
+ std::unordered_set run_once_set{ + "dt.get_param", + "trt.create_engine", + "phi_dt.create_host_inited_dense_tensor.f32", + "phi_dt.create_context.cpu", + "phi_dt.create_context.gpu"}; + if (run_once_set.count(op_name)) { impl_->run_once = true; } } diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc index fe1cda0e10028..7ffc8de151075 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -22,6 +22,7 @@ #include "paddle/infrt/tensor/tensor_map.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" #ifdef INFRT_WITH_GPU #include @@ -308,34 +309,50 @@ inline size_t SizeOfDataType(::phi::DataType data_type) { } return 0; } -::phi::DenseTensor GpuMemCpy(const ::phi::DenseTensor& input, - const ::phi::GPUContext& context, - bool d2h) { +void GpuMemCpy(const ::phi::DenseTensor& input, + const ::phi::GPUContext& context, + bool d2h, + ::phi::DenseTensor* output) { if (d2h) { - ::phi::DenseTensor ret( - const_cast<::phi::Allocator*>(&context.GetHostAllocator()), - input.meta()); CHECK(input.place().GetType() == ::phi::AllocationType::GPU); - // TODO(wilber): Add sync op and stream. - cudaMemcpyAsync(ret.data(), + + // TODO(wilber): Just a trick to avoid malloc. + if (input.numel() > output->numel()) { + // TODO(wilber): Use pinned memory. + output->Resize(input.dims()); + context.HostAlloc( + output, input.dtype(), input.numel() * SizeOfDataType(input.dtype())); + } + + cudaMemcpyAsync(output->data(), input.data(), SizeOfDataType(input.dtype()) * input.numel(), cudaMemcpyDeviceToHost, - nullptr); - return ret; + context.stream()); + // TODO(wilber): Ir add sync op. + cudaStreamSynchronize(context.stream()); } else { // h2d - ::phi::DenseTensor ret( - const_cast<::phi::Allocator*>(&context.GetAllocator()), input.meta()); CHECK(input.place().GetType() == ::phi::AllocationType::CPU || input.place().GetType() == ::phi::AllocationType::GPUPINNED); + + if (input.numel() > output->numel()) { + output->Resize(input.dims()); + context.Alloc(output, + input.dtype(), + input.numel() * SizeOfDataType(input.dtype()), + false); + + } else { + output->Resize(input.dims()); + } + // TODO(wilber): Add sync op and stream. 
- cudaMemcpyAsync(ret.data(), + cudaMemcpyAsync(output->data(), input.data(), SizeOfDataType(input.dtype()) * input.numel(), cudaMemcpyHostToDevice, - nullptr); - return ret; + context.stream()); } } #endif diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h index b1075444731b5..c401fb99978a3 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -76,9 +76,10 @@ ::infrt::phi::DenseTensorMap LoadCombinedParameters( int32_t TensorMapGetSize(const ::infrt::phi::DenseTensorMap& map); #ifdef INFRT_WITH_GPU -::phi::DenseTensor GpuMemCpy(const ::phi::DenseTensor& input, - const ::phi::GPUContext& context, - bool d2h); +void GpuMemCpy(const ::phi::DenseTensor& input, + const ::phi::GPUContext& context, + bool d2h, + ::phi::DenseTensor* output); #endif } // namespace phi diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index 65e137472b3d6..2e952e77d1f0a 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -119,6 +119,7 @@ void NaiveMatmul(const DenseHostTensor &x, const int N = w.shape().GetDim(1); for (int i = 0; i < M; i++) { for (int j = 0; j < N; j++) { + out_data[i * N + j] = 0; for (int k = 0; k < K; k++) { out_data[i * N + j] += x_data[i * K + k] * w_data[k * N + j]; } @@ -134,9 +135,11 @@ void RegisterTensorKernels(host_context::KernelRegistry *registry) { {"shape"}); registry->AddKernel("dt.print_tensor", INFRT_KERNEL(PrintTensor)); registry->AddKernel("dt.fill_tensor_with_constant.f32", - INFRT_KERNEL(FillTensorWithConstant)); + INFRT_KERNEL(FillTensorWithConstant), + {"value"}); registry->AddKernel("dt.fill_tensor_with_constant.f64", - INFRT_KERNEL(FillTensorWithConstant)); + INFRT_KERNEL(FillTensorWithConstant), + {"value"}); // TensorMap related methods. registry->AddKernel("dt.load_params", INFRT_KERNEL(LoadParams)); diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.cc b/paddle/infrt/kernel/tensorrt/trt_kernels.cc index c182dda2705fd..c0f5ebb4a7657 100644 --- a/paddle/infrt/kernel/tensorrt/trt_kernels.cc +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.cc @@ -57,7 +57,7 @@ ::infrt::backends::tensorrt::TrtEngine CreateTrtEngine( // TODO(wilber): The build option shoule be fiiled from mlir info. backends::tensorrt::BuildOptions options; options.max_batch = 4; - options.workspace = 1024; + options.workspace = 128; // Parse mlir Region which only has one block. mlir::Operation& operation = *create_engine_op.operation; diff --git a/paddle/infrt/kernel/tensorrt/trt_layers.h b/paddle/infrt/kernel/tensorrt/trt_layers.h index 9d8eba0bb31f5..0f2c2c88ca097 100644 --- a/paddle/infrt/kernel/tensorrt/trt_layers.h +++ b/paddle/infrt/kernel/tensorrt/trt_layers.h @@ -115,6 +115,27 @@ inline void PoolFunc(trt::PoolingOp& op, // NOLINT // TODO(Inference) // CHECK(false) << "Not supported adaptive pool"; + // TODO(wilber): Reformat. + // global average pooling. 
+ auto ksize_vec = ArrayAttrToVec(ksize); + if (static_cast(pool_type) == + nvinfer1::PoolingType::kAVERAGE && + ksize_vec.size() == 2 && ksize_vec[0] == 1 && ksize_vec[1] == 1) { + nvinfer1::Dims dims; + dims.nbDims = 2; + dims.d[0] = input_shape.d[1]; + dims.d[1] = input_shape.d[2]; + auto* layer = network->addPoolingNd( + *input_itensor, static_cast(pool_type), dims); + CHECK_NOTNULL(layer); + + mlir::Value out_repr = op.output_tensor(); + nvinfer1::ITensor* out_tensor = layer->getOutput(0); + value_to_trt_tensor_map[out_repr] = out_tensor; + return; + } + + // plugin... std::vector input_shape_v; for (int i = 0; i < input_dims; i++) { input_shape_v.push_back(input_shape.d[i]); From 419d8eb2442ac2f769448e61337466090a5b49bc Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Thu, 14 Apr 2022 11:09:11 +0800 Subject: [PATCH 12/19] support weakref for eager tensor (#41769) --- paddle/fluid/pybind/eager.cc | 3 +++ paddle/fluid/pybind/eager.h | 2 ++ 2 files changed, 5 insertions(+) diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index c600844596d98..74d15b6c0ca79 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -709,6 +709,8 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { } static void TensorDealloc(TensorObject* self) { + if (self->weakrefs != NULL) + PyObject_ClearWeakRefs(reinterpret_cast(self)); self->tensor.~Tensor(); Py_TYPE(self)->tp_free(reinterpret_cast(self)); } @@ -739,6 +741,7 @@ void BindEager(pybind11::module* module) { type->tp_getset = variable_properties; type->tp_init = TensorInit; type->tp_new = TensorNew; + type->tp_weaklistoffset = offsetof(TensorObject, weakrefs); Py_INCREF(&PyBaseObject_Type); type->tp_base = reinterpret_cast(&PyBaseObject_Type); type->tp_flags |= diff --git a/paddle/fluid/pybind/eager.h b/paddle/fluid/pybind/eager.h index bb55ef62ee689..03676a677ac90 100644 --- a/paddle/fluid/pybind/eager.h +++ b/paddle/fluid/pybind/eager.h @@ -22,6 +22,8 @@ namespace pybind { typedef struct { PyObject_HEAD paddle::experimental::Tensor tensor; + // Weak references + PyObject* weakrefs; } TensorObject; typedef struct { From e26e51ba87b343fd63c1bc2a0f8c158f1efd6162 Mon Sep 17 00:00:00 2001 From: xiayanming Date: Thu, 14 Apr 2022 11:21:15 +0800 Subject: [PATCH 13/19] [fix bug] communication op suppport rccl (#41763) --- paddle/fluid/operators/collective/alltoall_op.cu.cc | 6 +++--- .../operators/collective/c_comm_init_multitrainer_op.cc | 7 +++++-- paddle/fluid/operators/collective/global_gather_op.cu.cc | 6 +++--- paddle/fluid/operators/collective/global_scatter_op.cu.cc | 6 +++--- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc index 26fdee200cd84..0e0ea72208488 100644 --- a/paddle/fluid/operators/collective/alltoall_op.cu.cc +++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/alltoall_op.h" -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -26,7 +26,7 @@ template class AllToAllOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto out = ctx.Output("Out"); @@ -43,7 +43,7 @@ class AllToAllOpCUDAKernel : public framework::OpKernel { auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); int nranks = comm->nranks(); - cudaStream_t stream = nullptr; + gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); stream = static_cast(dev_ctx)->stream(); diff --git a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc index f69fe8f1e3f1f..86c966378ccb6 100644 --- a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc @@ -14,6 +14,9 @@ limitations under the License. */ #if defined(PADDLE_WITH_NCCL) #include #endif +#if defined(PADDLE_WITH_RCCL) +#include +#endif #include #include #include @@ -24,7 +27,7 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" // #include "paddle/fluid/operators/distributed/distributed.h" // #include "paddle/fluid/operators/distributed/request_handler_impl.h" -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -51,7 +54,7 @@ class CCommInitMultiTrainerOp : public framework::OperatorBase { auto var = scope.FindVar(Input("X")); PADDLE_ENFORCE_NOT_NULL( var, platform::errors::InvalidArgument("Input X must be provided.")); -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ncclUniqueId* nccl_id = var->GetMutable(); int ntrainers = Attr("ntrainers"); diff --git a/paddle/fluid/operators/collective/global_gather_op.cu.cc b/paddle/fluid/operators/collective/global_gather_op.cu.cc index 4f9725a27062b..6684470e881cb 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cu.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/global_gather_op.h" -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -26,7 +26,7 @@ template class GlobalGatherOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto local_count = ctx.Input("local_count"); @@ -79,7 +79,7 @@ class GlobalGatherOpCUDAKernel : public framework::OpKernel { ring_id)); auto place = ctx.GetPlace(); auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); - cudaStream_t stream = nullptr; + gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); stream = static_cast(dev_ctx)->stream(); diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index 3a7e6a0079ac5..cd3c3a3229ca0 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/global_scatter_op.h" -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -26,7 +26,7 @@ template class GlobalScatterOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto local_count = ctx.Input("local_count"); @@ -78,7 +78,7 @@ class GlobalScatterOpCUDAKernel : public framework::OpKernel { auto place = ctx.GetPlace(); auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); - cudaStream_t stream = nullptr; + gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); stream = static_cast(dev_ctx)->stream(); From 192f6f85fc358d681c9e87db961c14edf7595ca3 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Thu, 14 Apr 2022 11:25:12 +0800 Subject: [PATCH 14/19] fix bug of set NIGHTLY_MODE;test=document_fix;test=windows_ci (#41758) --- paddle/scripts/paddle_build.bat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 21df60e972121..8b9bfcf46042f 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -75,7 +75,7 @@ if not defined WITH_UNITY_BUILD set WITH_UNITY_BUILD=OFF if not defined INFERENCE_DEMO_INSTALL_DIR set INFERENCE_DEMO_INSTALL_DIR=%cache_dir:\=/%/inference_demo if not defined LOG_LEVEL set LOG_LEVEL=normal if not defined PRECISION_TEST set PRECISION_TEST=OFF -if not defined NIGHTLY_MODE set PRECISION_TEST=OFF +if not defined NIGHTLY_MODE set NIGHTLY_MODE=OFF if not defined retry_times set retry_times=1 if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37 if not defined BUILD_DIR set BUILD_DIR=build From de2a3942e96a79ff61c987b874d0757939c6f1bd Mon Sep 17 00:00:00 2001 
From: Chen Weihang Date: Thu, 14 Apr 2022 11:33:13 +0800 Subject: [PATCH 15/19] remove inner_place using (#41768) --- paddle/fluid/eager/amp_auto_cast.h | 2 +- paddle/fluid/eager/eager_amp_auto_cast.h | 2 +- paddle/fluid/eager/grad_node_info.cc | 8 ++++---- paddle/fluid/eager/grad_node_info.h | 4 ++-- paddle/fluid/eager/grad_tensor_holder.cc | 4 ++-- paddle/fluid/eager/pylayer/py_layer_node.h | 2 +- .../data_structure_tests/eager_tensor_test.cc | 2 +- paddle/fluid/pybind/eager.cc | 2 +- paddle/fluid/pybind/eager_functions.cc | 20 +++++++++---------- paddle/fluid/pybind/eager_method.cc | 12 +++++------ paddle/fluid/pybind/eager_properties.cc | 6 +++--- paddle/phi/api/include/tensor.h | 10 ---------- paddle/phi/api/lib/kernel_dispatch.cc | 2 +- paddle/phi/api/lib/tensor.cc | 19 +++--------------- paddle/phi/api/lib/tensor_method.cc | 9 ++++----- 15 files changed, 40 insertions(+), 64 deletions(-) diff --git a/paddle/fluid/eager/amp_auto_cast.h b/paddle/fluid/eager/amp_auto_cast.h index 6d5758adbe526..3a96b23dcebbb 100644 --- a/paddle/fluid/eager/amp_auto_cast.h +++ b/paddle/fluid/eager/amp_auto_cast.h @@ -21,7 +21,7 @@ namespace egr { static inline bool NeedCast(const paddle::experimental::Tensor& tensor, const paddle::experimental::DataType& dst_dtype) { - auto place = tensor.inner_place(); + auto place = tensor.place(); auto data_type = tensor.dtype(); if (paddle::platform::is_gpu_place(place) || paddle::platform::is_cuda_pinned_place(place) || diff --git a/paddle/fluid/eager/eager_amp_auto_cast.h b/paddle/fluid/eager/eager_amp_auto_cast.h index 9bd1ca1f6fe53..ee9da41881b2d 100644 --- a/paddle/fluid/eager/eager_amp_auto_cast.h +++ b/paddle/fluid/eager/eager_amp_auto_cast.h @@ -20,7 +20,7 @@ namespace egr { static inline bool NeedCast(const paddle::experimental::Tensor& tensor, const paddle::experimental::DataType& dst_dtype) { - auto place = tensor.inner_place(); + auto place = tensor.place(); auto data_type = tensor.dtype(); if (paddle::platform::is_gpu_place(place) || paddle::platform::is_cuda_pinned_place(place) || diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 6afdd854344eb..72b84b9db3210 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -151,7 +151,7 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, "which is illegal.")); meta.SetTensorMeta(dense_tensor->meta()); - meta.SetPlace(fwd_out.inner_place()); + meta.SetPlace(fwd_out.place()); if (paddle::framework::IsComplexType( paddle::framework::TransToProtoVarType(dense_tensor->type()))) { @@ -210,7 +210,7 @@ void GradNodeBase::SetGradInMeta( "with phi::DataType::UNDEFINED," "which is illegal.")); meta.SetTensorMeta(dense_tensor->meta()); - meta.SetPlace(fwd_out_tensor.inner_place()); + meta.SetPlace(fwd_out_tensor.place()); if (paddle::framework::IsComplexType( paddle::framework::TransToProtoVarType(dense_tensor->type()))) { @@ -256,7 +256,7 @@ void GradNodeBase::SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, "with phi::DataType::UNDEFINED," "which is illegal.")); meta.SetTensorMeta(dense_tensor->meta()); - meta.SetPlace(fwd_in.inner_place()); + meta.SetPlace(fwd_in.place()); } } else { VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " @@ -301,7 +301,7 @@ void GradNodeBase::SetGradOutMeta( "phi::DataType::UNDEFINED," "which is illegal.")); meta.SetTensorMeta(dense_tensor->meta()); - meta.SetPlace(fwd_in_tensor.inner_place()); + meta.SetPlace(fwd_in_tensor.place()); } } else { 
VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta " diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 6a70a16a2416f..decb682bf4517 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -317,11 +317,11 @@ inline void CheckTensor(const paddle::experimental::Tensor& pre, paddle::framework::DataType2String(pre.dtype()), paddle::framework::DataType2String(post.dtype()))); PADDLE_ENFORCE_EQ( - pre.inner_place(), post.inner_place(), + pre.place(), post.place(), paddle::platform::errors::PermissionDenied( "The place of tensor before(%s) and after(%s) " "hook are not consistent", - pre.inner_place().DebugString(), post.inner_place().DebugString())); + pre.place().DebugString(), post.place().DebugString())); } } diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 183282d6f87b2..27a8c6002e29d 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -53,7 +53,7 @@ void GradTensorHolder::CopyValueFromTensor( paddle::experimental::Tensor& buffer_tensor = buffer_[slot_id][rank]; if ((!buffer_tensor.defined() || !buffer_tensor.initialized())) { // Perform deep copy here - buffer_tensor.copy_(t, t.inner_place(), false); + buffer_tensor.copy_(t, t.place(), false); buffer_tensor.set_autograd_meta(t.mutable_autograd_meta()); } else { @@ -66,7 +66,7 @@ void GradTensorHolder::CopyValueFromTensor( if (t.defined()) { // Fill 1.0, use full to support complex, one_like don't support it. buffer_[slot_id][rank] = - paddle::experimental::full(t.shape(), 1, t.dtype(), t.inner_place()); + paddle::experimental::full(t.shape(), 1, t.dtype(), t.place()); } } } diff --git a/paddle/fluid/eager/pylayer/py_layer_node.h b/paddle/fluid/eager/pylayer/py_layer_node.h index f2e50494467c7..87e8acf88a694 100644 --- a/paddle/fluid/eager/pylayer/py_layer_node.h +++ b/paddle/fluid/eager/pylayer/py_layer_node.h @@ -62,7 +62,7 @@ class GradNodePyLayer : public GradNodeBase { } else { forward_outputs_meta_[i].emplace_back(); } - forward_outputs_place_[i].emplace_back(tensor->inner_place()); + forward_outputs_place_[i].emplace_back(tensor->place()); } } } diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc index de9758b73d250..9afe3962faa29 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc @@ -96,7 +96,7 @@ TEST(Tensor, MemberFunction) { CHECK_EQ(et3.dims(), expected_dim); CHECK_EQ(et3.type(), paddle::experimental::DataType::FLOAT32); CHECK_EQ(et3.layout(), paddle::experimental::DataLayout::NCHW); - CHECK(paddle::platform::is_cpu_place(et3.inner_place())); + CHECK(paddle::platform::is_cpu_place(et3.place())); VLOG(6) << "Get impl"; auto* dt3_ptr = std::dynamic_pointer_cast(et3.impl())->data(); diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 74d15b6c0ca79..c529d121f3945 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -137,7 +137,7 @@ void InitTensorWithTensor(TensorObject* self, const paddle::platform::Place& place, const std::string& name) { self->tensor.set_name(name); - if (place == src.inner_place()) { + if (place == src.place()) { auto impl = std::static_pointer_cast(src.impl()); self->tensor.set_impl(impl); VLOG(4) << "Same place, do ShareDataWith"; diff --git 
a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index fb115455357dd..1073cdc83a428 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -554,32 +554,32 @@ static PyObject* eager_api_async_read(PyObject* self, PyObject* args, src.is_gpu_pinned(), true, platform::errors::InvalidArgument("Required `src` device should be " "CUDAPinnedPlace, but received %d.", - src.inner_place())); + src.place())); PADDLE_ENFORCE_EQ( dst.is_gpu(), true, platform::errors::InvalidArgument( "Required `dst` device should be CUDAPlace, but received %d.", - dst.inner_place())); + dst.place())); PADDLE_ENFORCE_EQ( index.is_cpu(), true, platform::errors::InvalidArgument( "Required `index` device should be CPUPlace, but received %d.", - index.inner_place())); + index.place())); PADDLE_ENFORCE_EQ(buffer.is_gpu_pinned(), true, platform::errors::InvalidArgument( "Required `buffer` device should be CUDAPinnedPlace, " "but received %d.", - buffer.inner_place())); + buffer.place())); PADDLE_ENFORCE_EQ( offset.is_cpu(), true, platform::errors::InvalidArgument( "Required `offset` device should be CPUPlace, but received %d.", - offset.inner_place())); + offset.place())); PADDLE_ENFORCE_EQ( count.is_cpu(), true, platform::errors::InvalidArgument( "Required `count` device should be CPUPlace, but received %d.", - count.inner_place())); + count.place())); auto& src_tensor = src; auto* dst_tensor = &dst; @@ -701,22 +701,22 @@ static PyObject* eager_api_async_write(PyObject* self, PyObject* args, src.is_gpu(), true, platform::errors::InvalidArgument( "Required `src` device should be CUDAPlace, but received %d. ", - src.inner_place())); + src.place())); PADDLE_ENFORCE_EQ(dst.is_gpu_pinned(), true, platform::errors::InvalidArgument( "Required `dst` device should be CUDAPinnedPlace, " "but received %d. ", - dst.inner_place())); + dst.place())); PADDLE_ENFORCE_EQ( offset.is_cpu(), true, platform::errors::InvalidArgument("Required `offset` device should " "be CPUPlace, but received %d. ", - offset.inner_place())); + offset.place())); PADDLE_ENFORCE_EQ( count.is_cpu(), true, platform::errors::InvalidArgument( "Required `count` device should be CPUPlace, but received %d. ", - count.inner_place())); + count.place())); // TODO(daisiming): In future, add index as arguments following // async_read. 
diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 6dbed97a55f40..4610196726e75 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -342,11 +342,11 @@ static PyObject* tensor_method_copy_(TensorObject* self, PyObject* args, ->SetPersistable( egr::EagerUtils::autograd_meta(&(src_tensor))->Persistable()); if (src_tensor.initialized()) { - self->tensor.copy_(src_tensor, src_tensor.inner_place(), blocking); + self->tensor.copy_(src_tensor, src_tensor.place(), blocking); } } else { if (src_tensor.initialized()) { - self->tensor.copy_(src_tensor, self->tensor.inner_place(), blocking); + self->tensor.copy_(src_tensor, self->tensor.place(), blocking); } } @@ -934,7 +934,7 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, } else { SetTensorFromPyArray( static_cast(value_tensor_tmp.impl().get()), - value, value_tensor_tmp.inner_place(), false); + value, value_tensor_tmp.place(), false); } value_tensor = value_tensor_tmp; @@ -1018,7 +1018,7 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, platform::Place(platform::CPUPlace()), false); #endif } else { - SetTensorFromPyArray(self_tensor, self_numpy, self->tensor.inner_place(), + SetTensorFromPyArray(self_tensor, self_numpy, self->tensor.place(), false); } } @@ -1367,7 +1367,7 @@ static PyObject* tensor_method__share_memory(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY #ifndef _WIN32 - PADDLE_ENFORCE_EQ(platform::is_cpu_place(self->tensor.inner_place()), true, + PADDLE_ENFORCE_EQ(platform::is_cpu_place(self->tensor.place()), true, platform::errors::InvalidArgument( "Sharing memory only support CPU Tensor currently")); // 1. get LoDTensor @@ -1419,7 +1419,7 @@ static PyObject* tensor_method__uva(TensorObject* self, PyObject* args, platform::errors::InvalidArgument( "Unified virtual addressing only support " "DenseTensor currently.")); - PADDLE_ENFORCE_EQ(platform::is_cpu_place(self->tensor.inner_place()), true, + PADDLE_ENFORCE_EQ(platform::is_cpu_place(self->tensor.place()), true, platform::errors::InvalidArgument( "Unified virtual addressing only support " "CPU Tensor currently.")); diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index a72ea6c4b02e1..797b68fcb36ea 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -108,7 +108,7 @@ int tensor_properties_set_grad(TensorObject* self, PyObject* value, "Detected NULL grad" "Please check if you have manually cleared" "the grad inside autograd_meta")); - grad->copy_(src, self->tensor.inner_place(), true); + grad->copy_(src, self->tensor.place(), true); return 0; EAGER_CATCH_AND_THROW_RETURN_NEG } @@ -160,14 +160,14 @@ PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) { PyObject* tensor_properties_get_place(TensorObject* self, void* closure) { EAGER_TRY - return ToPyObject(self->tensor.inner_place()); + return ToPyObject(self->tensor.place()); EAGER_CATCH_AND_THROW_RETURN_NULL } PyObject* tensor_properties_get_place_str(TensorObject* self, void* closure) { EAGER_TRY std::stringstream ostr; - ostr << self->tensor.inner_place(); + ostr << self->tensor.place(); return ToPyObject(ostr.str()); EAGER_CATCH_AND_THROW_RETURN_NULL } diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index d3efb7ca1c21e..3c3da4b749ed0 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -249,21 +249,11 
@@ class PADDLE_API Tensor final { /** * @brief Return the place (device) of Tensor. - * This is a deprecated method and may be removed in the future! * * @return Place */ Place place() const; - /** - * @brief Return the place (device) of Tensor. - * - * This is a deprecated method and may be removed in the future!!! - * - * @return Place - */ - Place inner_place() const; - /** * @brief Determine whether the tensor device is CPU * diff --git a/paddle/phi/api/lib/kernel_dispatch.cc b/paddle/phi/api/lib/kernel_dispatch.cc index 6d97dc7657f00..a534f02663dff 100644 --- a/paddle/phi/api/lib/kernel_dispatch.cc +++ b/paddle/phi/api/lib/kernel_dispatch.cc @@ -126,7 +126,7 @@ Backend ParseBackend(const Place& place) { return phi::TransToPhiBackend(place); } Backend ParseBackend(const Tensor& tensor) { - return phi::TransToPhiBackend(tensor.inner_place()); + return phi::TransToPhiBackend(tensor.place()); } Backend ParseBackendWithInputOrder(const Place& place, const Tensor& tensor) { diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 07204b7ffcf61..7eff846bbc1e3 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -163,25 +163,12 @@ Place Tensor::place() const { return impl_->place(); } -Place Tensor::inner_place() const { - PADDLE_ENFORCE_NOT_NULL( - impl_, - phi::errors::PermissionDenied( - "Null pointer error, the impl_ of Tensor should not be " - "Null when calling Tensor::inner_place().")); - return impl_->place(); -} - -bool Tensor::is_cpu() const { - return paddle::platform::is_cpu_place(inner_place()); -} +bool Tensor::is_cpu() const { return paddle::platform::is_cpu_place(place()); } -bool Tensor::is_gpu() const { - return paddle::platform::is_gpu_place(inner_place()); -} +bool Tensor::is_gpu() const { return paddle::platform::is_gpu_place(place()); } bool Tensor::is_gpu_pinned() const { - return paddle::platform::is_cuda_pinned_place(inner_place()); + return paddle::platform::is_cuda_pinned_place(place()); } /* Part 4: Data Access methods */ diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index 46ca457b2c10a..79519f67d2ad3 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -97,16 +97,15 @@ void Tensor::copy_(const Tensor &src, name(), src.name())); PADDLE_ENFORCE_EQ(target_place, - inner_place(), + place(), phi::errors::PreconditionNotMet( "Place is different of dst tensor and args %s, which " "current tensor holds %s " "Copy cannot be performed!", target_place, - inner_place())); - kernel_key_set.backend_set = - kernel_key_set.backend_set | - BackendSet(phi::TransToPhiBackend(inner_place())); + place())); + kernel_key_set.backend_set = kernel_key_set.backend_set | + BackendSet(phi::TransToPhiBackend(place())); } else { // Deep Copy AutoGrad info from src to self. 
*autograd_meta_ = *(src.autograd_meta_); From 4ae76d2179cf9812f76ea91ab8eb6007a5098ec7 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Thu, 14 Apr 2022 12:03:39 +0800 Subject: [PATCH 16/19] [Op]Fix adam/adamw beta1_pow/beta2_pow place while copying (#41732) --- paddle/phi/kernels/gpu/adamw_kernel.cu | 4 ++-- paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu | 4 ++-- paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/phi/kernels/gpu/adamw_kernel.cu b/paddle/phi/kernels/gpu/adamw_kernel.cu index 3555df11b5e1f..4873ba9c13d48 100644 --- a/paddle/phi/kernels/gpu/adamw_kernel.cu +++ b/paddle/phi/kernels/gpu/adamw_kernel.cu @@ -190,8 +190,8 @@ void AdamwDenseKernel(const Context& dev_ctx, phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); - phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out); - phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out); + phi::Copy(dev_ctx, beta1_pow, beta1_pow.place(), false, beta1_pow_out); + phi::Copy(dev_ctx, beta2_pow, beta2_pow.place(), false, beta2_pow_out); return; } diff --git a/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu index 2cb086503283b..31abac149951d 100644 --- a/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu +++ b/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu @@ -139,8 +139,8 @@ void AdamDenseParamSparseGradKernel( phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); - phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out); - phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out); + phi::Copy(dev_ctx, beta1_pow, beta1_pow.place(), false, beta1_pow_out); + phi::Copy(dev_ctx, beta2_pow, beta2_pow.place(), false, beta2_pow_out); return; } diff --git a/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu index 0fc223e081506..b847f48d12267 100644 --- a/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu +++ b/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu @@ -156,8 +156,8 @@ void AdamwDenseParamSparseGradKernel( phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); - phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out); - phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out); + phi::Copy(dev_ctx, beta1_pow, beta1_pow.place(), false, beta1_pow_out); + phi::Copy(dev_ctx, beta2_pow, beta2_pow.place(), false, beta2_pow_out); return; } From e0abb90b666b51c345070a180de4a6c0aab41c8d Mon Sep 17 00:00:00 2001 From: Vigi Zhang Date: Thu, 14 Apr 2022 12:06:45 +0800 Subject: [PATCH 17/19] add security policy (#41749) --- SECURITY.md | 58 +++++++++++++++++++++++++++++++++++++++++++ SECURITY_cn.md | 44 ++++++++++++++++++++++++++++++++ security/README.md | 12 +++++++++ security/README_cn.md | 12 +++++++++ 4 files changed, 126 insertions(+) create mode 100644 SECURITY.md create mode 100644 SECURITY_cn.md create mode 100644 security/README.md create mode 100644 security/README_cn.md diff --git a/SECURITY.md 
b/SECURITY.md new file mode 100644 index 0000000000000..490c804e9de9d --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,58 @@ +# Using PaddlePaddle Securely + +This document describes model security and code security in PaddlePaddle. It also provides guidelines on how to report vulnerabilities in PaddlePaddle. + +## PaddlePaddle Model Security + +PaddlePaddle attaches great importance to security and privacy of model. This includes how to prevent the model from outputting wrong decision results under the interference when it is used in security-related and safety-critical scenarios, and how to avoid leaking data and privacy information from the model itself, the model gradient or the model inference results. + + + +[PaddleSleeve](https://github.com/PaddlePaddle/PaddleSleeve) provides a series of security and privacy tools, which can help model developers and users systematically evaluate and improve the model security and privacy in both development and deployment stages. + + + +These tools include adversarial example evaluation test, pseudo-natural environment robustness evaluation test, model reversing evaluation test, member inference evaluation test, sample denoising, adversarial training, privacy enhancement optimizer, etc. + +## PaddlePaddle Code Security + +PaddlePaddle always take code security seriously. However, due to the complexity of the framework and its dependence on other thirdparty open source libraries, there may still be some security issues undetected. Therefore, we hope that more security researchers and PaddlePaddle developers can participate in the code security program. We encourage responsible disclosure of security issues, as well as contributing code to improve our vulnerability finding tools to make PaddlePaddle safer. + +### Code security tools + +PaddlePaddle security team attaches great importance to the security of the framework. In order to find and fix security issues as soon as possible, we are continuously conducting code security audit and developing automatic vunerability discovery tools. We have already open sourced some of them to the community, hoping this could encourage people to contribute and improve the safety and robustness of PaddlePaddle. [This tool](https://github.com/PaddlePaddle/PaddleSleeve/tree/main/CodeSecurity) includes two parts. The dynamic part includes some op fuzzer samples. And the static part includes some CodeQL samples. Both of them are aim to find vulnerabilities in PaddlePaddle framework codebase. By referring the samples, security researchers can write their own fuzzers or QLs to test more PaddlePaddle modules, and find more code security issues. + +### Reporting vulnerabilities + +We encourage responsible disclosure of security issues to PaddlePaddle and please email reports about any security issues you find to security@paddlepaddle.org. + + + +After the security team receives your email, they will communicate with you in time. The security team will work to keep you informed of an issue fix. + + + +In order to reproduce and identify the issue, please include the following information along with your email: + +- The details of the vulnerability including how to reproduce it. Try to attach a PoC. +- The attack scenario and what an attacker might be able to achieve with this issue. +- Whether this vulnerability has been made public. If it is, please attach details. +- Your name and affiliation. 
+ +We will indicate the bug fix in the release of PaddlePaddle, and publish the vulnerability detail and the reporter in the security advisories (Your name will not be published if you choose to remain anonymous). + +### What is a vulnerability? + +In the process of computation graphs in PaddlePaddle, models can perform arbitrary computations , including reading and writing files, communicating with the network, etc. It may cause memory exhaustion, deadlock, etc., which will lead to unexpected behavior of PaddlePaddle. We consider these behavior to be security vulnerabilities only if they are out of the intention of the operation involved. + + + +Some unexpected parameters and behaviors have been checked in PaddlePaddle by throwing exceptions in Python or return error states in C++. In these cases, denial of service is still possible, but the exit of the PaddlePaddle is clean. Since the error handling of PaddlePaddle is expected and correct, these cases are not security vulnerabilities. + + + +If malicious input can trigger memory corruption or non-clean exit, such bug is considered a security problem. + + + +[security advisories](https://github.com/PaddlePaddle/Paddle/security/README.md) diff --git a/SECURITY_cn.md b/SECURITY_cn.md new file mode 100644 index 0000000000000..e9f503192c1eb --- /dev/null +++ b/SECURITY_cn.md @@ -0,0 +1,44 @@ +# 安全使用飞桨 + + + +本文将对飞桨模型及代码安全进行介绍,并介绍如何向飞桨提报漏洞。 + +## 飞桨模型安全 + +飞桨关注模型的安全性和隐私性。其中包括当模型被用于安全攸关场景时,如何避免模型在干扰下输出错误的决策结果,以及如何避免从模型本身、模型梯度或模型推理结果中泄露数据和隐私信息。 + +飞桨的安全和隐私套件[PaddleSleeve](https://github.com/PaddlePaddle/PaddleSleeve)提供了一系列工具,可帮助模型开发者及使用者在模型的开发或部署阶段,系统性地评估并提升模型的安全性和隐私性。这些工具包括对抗样本评估测试、拟自然环境鲁棒性评估测试、模型逆向评估测试、成员推断评估测试、样本去噪、对抗训练、隐私增强优化器等。 + +## 飞桨代码安全 + +飞桨团队一向非常重视代码安全,但鉴于飞桨框架的实现非常复杂,并且依赖了多个第三方开源库,其中仍可能会存在未被发现的问题。因此,我们希望有更多安全研究人员、飞桨开发者能参与到飞桨代码安全保障项目中来,我们鼓励向飞桨负责任的披露(Responsible Disclosure)安全问题,也鼓励向飞桨贡献代码完善动静态漏洞挖掘工具,让飞桨变得更安全。 + +### 安全工具 + +飞桨安全团队对于飞桨框架自身的安全高度重视,为了尽快地发现和修复安全问题,我们内部在持续地进行代码安全审计和研发自动化漏洞挖掘工具。我们将一些工具和方法开源给社区,希望能抛砖引玉,大家一起来贡献提高飞桨的安全性和鲁棒性。工具开源见[CodeSecurity](https://github.com/PaddlePaddle/PaddleSleeve/tree/main/CodeSecurity)。该开源工具包含两部分内容,分别从动态(模糊测试)和静态(CodeQL)两个角度对飞桨代码进行安全审计和漏洞挖掘。通过参照和添加新的测试模块,可以帮助覆盖更多飞桨代码模块,发现更多的代码安全问题。 + +### 报告安全问题 + +我们鼓励向飞桨负责任地披露安全问题,请将所发现的安全问题发送电子邮件到 security@paddlepaddle.org。 + +在安全团队收到邮件后将会及时与您沟通并反馈问题修复进度。 + +为了更好地复现和认定问题情况,请在邮件中: + +- 详细描述漏洞细节,如何复现,并尽量附上PoC。 +- 描述攻击场景,介绍攻击者可能由此问题所能达到的效果。 +- 该问题是否已公开并描述情况。 +- 署名您的姓名和从属关系。 + +我们会将漏洞修复情况注明在飞桨的发布当中,并在致谢公告中发布漏洞情况和提报人(如果您选择不公开署名将不会发布提报人信息)。 + +### 安全问题认定说明 + +飞桨在计算图的过程中,由于模型可以执行任何计算,操作文件,进行网络通信等功能,可能造成内存耗尽,死锁等情况发生,这将导致飞桨产生一些非预期的行为。我们认为只有当这些行为超出了所涉及的操作意图时才算作是安全问题。 + +飞桨框架代码中对于一些非预期的参数和行为会进行检查,Python代码中以抛出异常为形式,C++代码中以返回错误状态为形式。这些情况下,飞桨代码的退出是干净的,但仍可能会因此造成拒绝服务,然而由于飞桨的处理是预期且正确的,所以造成这些情况并不算作是安全问题。 + +如果输入非预期的参数后,对飞桨代码造成了内存破坏,或者非干净退出,这类行为被认定为存在安全问题。 + +### [安全公告](https://github.com/PaddlePaddle/Paddle/security/README_cn.md) diff --git a/security/README.md b/security/README.md new file mode 100644 index 0000000000000..ab3dab8c0cc70 --- /dev/null +++ b/security/README.md @@ -0,0 +1,12 @@ +# PaddlePaddle Security Advisories + +We regularly publish security advisories about using PaddlePaddle. + + + +*Note*: In conjunction with these security advisories, we strongly encourage PaddlePaddle users to read and understand PaddlePaddle's security model as outlined in [SECURITY.md](https://github.com/PaddlePaddle/Paddle/SECURITY.md). 
+ + +| Advisory Number | Type | Versions affected | Reported by | Additional Information| +| --------------- | ---- | :---------------: | ----------- | ----------------------| +| | | | | | diff --git a/security/README_cn.md b/security/README_cn.md new file mode 100644 index 0000000000000..2ae23046469d4 --- /dev/null +++ b/security/README_cn.md @@ -0,0 +1,12 @@ +# 飞桨安全公告 + +我们在此定期发布飞桨安全公告。 + + + +注:我们非常建议飞桨用户阅读和理解[SECURITY_cn.md](https://github.com/PaddlePaddle/Paddle/SECURITY_cn.md)所介绍的飞桨安全模型,以便更好地了解此安全公告。 + + +| 安全公告编号 | 类型 | 受影响版本 | 报告者 | 备注 | +| --------------- | ---- | :---------------: | ----------- | ----------------------| +| | | | | | From 7f73ef2c7304ea3a4d22659ac8701d36e588c4e3 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Thu, 14 Apr 2022 12:46:30 +0800 Subject: [PATCH 18/19] fix bfgs_doc (#41505) * fix bfgs_doc; test=document_fix * add parameter name; test=document_fix * modify according to chenlong's comments;test=document_fix --- .../incubate/optimizer/functional/bfgs.py | 82 +++++++------------ .../incubate/optimizer/functional/lbfgs.py | 78 ++++++++---------- 2 files changed, 66 insertions(+), 94 deletions(-) diff --git a/python/paddle/incubate/optimizer/functional/bfgs.py b/python/paddle/incubate/optimizer/functional/bfgs.py index abdab457fda00..23fd8dc0825f0 100644 --- a/python/paddle/incubate/optimizer/functional/bfgs.py +++ b/python/paddle/incubate/optimizer/functional/bfgs.py @@ -33,63 +33,43 @@ def minimize_bfgs(objective_func, name=None): r""" Minimizes a differentiable function `func` using the BFGS method. - The BFGS is a quasi-Newton method for solving an unconstrained - optimization problem over a differentiable function. - Closely related is the Newton method for minimization. Consider the iterate - update formula + The BFGS is a quasi-Newton method for solving an unconstrained optimization problem over a differentiable function. + Closely related is the Newton method for minimization. Consider the iterate update formula: + .. math:: - x_{k+1} = x_{k} + H \nabla{f}, - If $H$ is the inverse Hessian of $f$ at $x_{k}$, then it's the Newton method. - If $H$ is symmetric and positive definite, used as an approximation of the inverse Hessian, then + x_{k+1} = x_{k} + H_k \nabla{f_k} + + If :math:`H_k` is the inverse Hessian of :math:`f` at :math:`x_k`, then it's the Newton method. + If :math:`H_k` is symmetric and positive definite, used as an approximation of the inverse Hessian, then it's a quasi-Newton. In practice, the approximated Hessians are obtained by only using the gradients, over either whole or part of the search - history, the former is BFGS. - - Reference: - Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. - pp140: Algorithm 6.1 (BFGS Method). - - Following summarizes the the main logic of the program based on BFGS. Note: _k represents value of - k_th iteration, ^T represents the transposition of a vector or matrix. - repeat - p_k = H_k * g_k - alpha = strong_wolfe(f, x_k, p_k) - x_k+1 = x_k + alpha * p_k - s_k = x_k+1 - x_k - y_k = g_k+1 - g_k - rho_k = 1 / (s_k^T * y_k) - V_k^T = I - rho_k * s_k * y_k^T - V_k = I - rho_k * y_k * s_k^T - H_k+1 = V_k^T * H_k * V_k + rho_k * s_k * s_k^T - check_converge - end + history, the former is BFGS, the latter is L-BFGS. + + Reference: + Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp140: Algorithm 6.1 (BFGS Method). Args: - objective_func: the objective function to minimize. 
``func`` accepts - a multivariate input and returns a scalar. - initial_position (Tensor): the starting point of the iterates. For methods like Newton and quasi-Newton - the initial trial step length should always be 1.0. - max_iters (int): the maximum number of minimization iterations. - tolerance_grad (float): terminates if the gradient norm is smaller than this. Currently gradient norm uses inf norm. - tolerance_change (float): terminates if the change of function value/position/parameter between - two iterations is smaller than this value. - initial_inverse_hessian_estimate (Tensor): the initial inverse hessian approximation at initial_position. - It must be symmetric and positive definite. - line_search_fn (str): indicate which line search method to use, only support 'strong wolfe' right now. May support - 'Hager Zhang' in the futrue. - max_line_search_iters (int): the maximum number of line search iterations. - initial_step_length (float): step length used in first iteration of line search. different initial_step_length - may cause different optimal result. - dtype ('float32' | 'float64'): In static graph, float64 will be convert to float32 due to paddle.assign limit. - + objective_func: the objective function to minimize. ``objective_func`` accepts a multivariate input and returns a scalar. + initial_position (Tensor): the starting point of the iterates. + max_iters (int, optional): the maximum number of minimization iterations. Default value: 50. + tolerance_grad (float, optional): terminates if the gradient norm is smaller than this. Currently gradient norm uses inf norm. Default value: 1e-7. + tolerance_change (float, optional): terminates if the change of function value/position/parameter between two iterations is smaller than this value. Default value: 1e-9. + initial_inverse_hessian_estimate (Tensor, optional): the initial inverse hessian approximation at initial_position. It must be symmetric and positive definite. Default value: None. + line_search_fn (str, optional): indicate which line search method to use, only support 'strong wolfe' right now. May support 'Hager Zhang' in the futrue. Default value: 'strong wolfe'. + max_line_search_iters (int, optional): the maximum number of line search iterations. Default value: 50. + initial_step_length (float, optional): step length used in first iteration of line search. different initial_step_length may cause different optimal result. For methods like Newton and quasi-Newton the initial trial step length should always be 1.0. Default value: 1.0. + dtype ('float32' | 'float64', optional): data type used in the algorithm. Default value: 'float32'. + name (str, optional): Name for the operation. For more information, please refer to :ref:`api_guide_Name`. Default value: None. + Returns: - is_converge (bool): Indicates whether found the minimum within tolerance. - num_func_calls (int): number of objective function called. - position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of - the objective function regrading to the initial position. - objective_value (Tensor): objective function value at the `position`. - objective_gradient (Tensor): objective function gradient at the `position`. - inverse_hessian_estimate (Tensor): the estimate of inverse hessian at the `position`. + output(tuple): + + - is_converge (bool): Indicates whether found the minimum within tolerance. + - num_func_calls (int): number of objective function called. 
+ - position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of the objective function regrading to the initial position. + - objective_value (Tensor): objective function value at the `position`. + - objective_gradient (Tensor): objective function gradient at the `position`. + - inverse_hessian_estimate (Tensor): the estimate of inverse hessian at the `position`. Examples: .. code-block:: python diff --git a/python/paddle/incubate/optimizer/functional/lbfgs.py b/python/paddle/incubate/optimizer/functional/lbfgs.py index d4bf511f85a99..f283381597733 100644 --- a/python/paddle/incubate/optimizer/functional/lbfgs.py +++ b/python/paddle/incubate/optimizer/functional/lbfgs.py @@ -32,54 +32,46 @@ def minimize_lbfgs(objective_func, initial_step_length=1.0, dtype='float32', name=None): - r"""Minimizes a differentiable function `func` using the L-BFGS method. - The L-BFGS is simalar as BFGS, the only difference is that L-BFGS use historical - sk, yk, rhok rather than H_k-1 to compute Hk. + r""" + Minimizes a differentiable function `func` using the L-BFGS method. + The L-BFGS is a quasi-Newton method for solving an unconstrained optimization problem over a differentiable function. + Closely related is the Newton method for minimization. Consider the iterate update formula: + + .. math:: + x_{k+1} = x_{k} + H_k \nabla{f_k} + + If :math:`H_k` is the inverse Hessian of :math:`f` at :math:`x_k`, then it's the Newton method. + If :math:`H_k` is symmetric and positive definite, used as an approximation of the inverse Hessian, then + it's a quasi-Newton. In practice, the approximated Hessians are obtained + by only using the gradients, over either whole or part of the search + history, the former is BFGS, the latter is L-BFGS. + Reference: - Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. - pp179: Algorithm 7.5 (L-BFGS). - - Following summarizes the the main logic of the program based on L-BFGS.Note: _k represents - value of k_th iteration, ^T represents the transposition of a vector or matrix. - repeat - compute p_k by two-loop recursion - alpha = strong_wolfe(f, x_k, p_k) - x_k+1 = x_k + alpha * p_k - s_k = x_k+1 - x_k - y_k = g_k+1 - g_k - rho_k = 1 / (s_k^T * y_k) - update sk_vec, yk_vec, rhok_vec - check_converge - end + Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp179: Algorithm 7.5 (L-BFGS). Args: - objective_func: the objective function to minimize. ``func`` accepts - a multivariate input and returns a scalar. - initial_position (Tensor): the starting point of the iterates. For methods like Newton and quasi-Newton - the initial trial step length should always be 1.0 . - history_size (Scalar): the number of stored vector pairs {si,yi}. - max_iters (Scalar): the maximum number of minimization iterations. - tolerance_grad (Scalar): terminates if the gradient norm is smaller than - this. Currently gradient norm uses inf norm. - tolerance_change (Scalar): terminates if the change of function value/position/parameter between - two iterations is smaller than this value. - initial_inverse_hessian_estimate (Tensor): the initial inverse hessian approximation. - line_search_fn (str): indicate which line search method to use, only support 'strong wolfe' right now. May support - 'Hager Zhang' in the futrue. - max_line_search_iters (Scalar): the maximum number of line search iterations. - initial_step_length: step length used in first iteration of line search. 
different initial_step_length
-            may cause different optimal result.
-        dtype ('float' | 'float32' | 'float64' | 'double'): the data
-            type to be used.
-
+        objective_func: the objective function to minimize. ``objective_func`` accepts a multivariate input and returns a scalar.
+        initial_position (Tensor): the starting point of the iterates.
+        history_size (int, optional): the number of stored vector pairs {si,yi}. Default value: 100.
+        max_iters (int, optional): the maximum number of minimization iterations. Default value: 50.
+        tolerance_grad (float, optional): terminates if the gradient norm is smaller than this. Currently gradient norm uses inf norm. Default value: 1e-7.
+        tolerance_change (float, optional): terminates if the change of function value/position/parameter between two iterations is smaller than this value. Default value: 1e-9.
+        initial_inverse_hessian_estimate (Tensor, optional): the initial inverse hessian approximation at initial_position. It must be symmetric and positive definite. Default value: None.
+        line_search_fn (str, optional): indicates which line search method to use; only 'strong wolfe' is supported right now. May support 'Hager Zhang' in the future. Default value: 'strong wolfe'.
+        max_line_search_iters (int, optional): the maximum number of line search iterations. Default value: 50.
+        initial_step_length (float, optional): step length used in the first iteration of line search. Different initial_step_length values may lead to different optimal results. For methods like Newton and quasi-Newton the initial trial step length should always be 1.0. Default value: 1.0.
+        dtype ('float32' | 'float64', optional): data type used in the algorithm. Default value: 'float32'.
+        name (str, optional): Name for the operation. For more information, please refer to :ref:`api_guide_Name`. Default value: None.
+
     Returns:
-        is_converge (bool): Indicates whether found the minimum within tolerance.
-        num_func_calls (int): number of objective function called.
-        position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of
-            the objective function regrading to the initial position.
-        objective_value (Tensor): objective function value at the `position`.
-        objective_gradient (Tensor): objective function gradient at the `position`.
+        output(tuple):
+            - is_converge (bool): Indicates whether the minimum was found within the tolerance.
+            - num_func_calls (int): number of objective function calls.
+            - position (Tensor): the position of the last iteration. If the search converged, this value is the argmin of the objective function regarding the initial position.
+            - objective_value (Tensor): objective function value at the `position`.
+            - objective_gradient (Tensor): objective function gradient at the `position`.
+
     Examples:
         ..
code-block:: python From ad9585b6697f749fae479ad103bb18b549446255 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Thu, 14 Apr 2022 13:42:09 +0800 Subject: [PATCH 19/19] [DoubleGrad] Enabled test_autograd_functional_dynamic.py under eager mode (#41668) * [DoubleGrad] Enabled double grad test cases in eager_mode for test_imperative_double_grad * Fixed elementwise issue * Addressed CI failures * [DoubleGrad] Enabled test_imperative_triple_grad test cases under eager_mode * [DoubleGrad] Enabled test_autograd_functional_dynamic.py under eager mode * Enabled more test cases * Fixed performance issues * Fixed minor issue --- .../final_state_generator/codegen_utils.py | 13 +- paddle/fluid/eager/autograd_meta.h | 2 + paddle/fluid/eager/backward.cc | 20 +-- paddle/fluid/eager/grad_node_info.h | 2 +- paddle/fluid/eager/tensor_wrapper.h | 31 ++-- paddle/phi/infermeta/backward.h | 4 +- python/paddle/autograd/functional.py | 4 + .../test_autograd_functional_dynamic.py | 146 ++++++++++++++---- python/paddle/tensor/linalg.py | 10 +- python/paddle/utils/code_gen/backward.yaml | 44 ++++++ 10 files changed, 212 insertions(+), 64 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 6219ecee17f30..96af7dfc4fe65 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -22,9 +22,16 @@ ### Global Variables ### ######################## ops_to_fill_zero_for_empty_grads = set([ - "split_grad", "rnn_grad", "matmul_double_grad", "matmul_triple_grad", - "sigmoid_double_grad", "sigmoid_triple_grad", "add_double_grad", - "add_triple_grad" + "split_grad", + "rnn_grad", + "matmul_double_grad", + "matmul_triple_grad", + "sigmoid_double_grad", + "sigmoid_triple_grad", + "add_double_grad", + "add_triple_grad", + "multiply_double_grad", + "multiply_triple_grad", ]) # For API dispatch used at python-level diff --git a/paddle/fluid/eager/autograd_meta.h b/paddle/fluid/eager/autograd_meta.h index dca76d3b8a0db..2241ccca81ca4 100644 --- a/paddle/fluid/eager/autograd_meta.h +++ b/paddle/fluid/eager/autograd_meta.h @@ -107,6 +107,8 @@ class AutogradMeta : public AbstractAutogradMeta { GradNodeBase* GradNode() const { return grad_node_.get(); } + void ResetGradNode() { grad_node_.reset(); } + void SetSingleOutRankWithSlot(size_t slot_id, size_t rank) { out_slot_id_ = slot_id; out_rank_ = rank; diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 3b555eda8fff7..6db606edf6f4c 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -53,7 +53,7 @@ class GeneralGrad { auto* target_node = auto_grad_meta->GetMutableGradNode().get(); if (orig_to_copied_node_mapping_.count(target_node)) { - target_node = orig_to_copied_node_mapping_[target_node]; + target_node = orig_to_copied_node_mapping_[target_node].get(); } else { VLOG(6) << "Unable to find target node in " "orig_to_copied_node_mapping_, likely indicating an " @@ -261,7 +261,7 @@ class GeneralGrad { auto* target_node = auto_grad_meta->GetMutableGradNode().get(); if (orig_to_copied_node_mapping_.count(target_node)) { - target_node = orig_to_copied_node_mapping_[target_node]; + target_node = orig_to_copied_node_mapping_[target_node].get(); } else { VLOG(6) << "Unable to find target node in " "orig_to_copied_node_mapping_, likely indicating an unused " @@ -349,12 +349,12 @@ 
class GeneralGrad { GradNodeBase* CopyGradNode(const std::shared_ptr& orig_node) { if (orig_to_copied_node_mapping_.count(orig_node.get())) { - return orig_to_copied_node_mapping_[orig_node.get()]; + return orig_to_copied_node_mapping_[orig_node.get()].get(); } std::shared_ptr copied_node = orig_node->Copy(); // Save node and update mapping - orig_to_copied_node_mapping_[orig_node.get()] = copied_node.get(); + orig_to_copied_node_mapping_[orig_node.get()] = copied_node; copied_grad_nodes_.push_back(copied_node); return copied_node.get(); @@ -379,7 +379,7 @@ class GeneralGrad { paddle::platform::errors::Fatal( "Cannot reconstruct backward graph," "unable to find copied target for certain grad node.")); - GradNodeBase* copied_node = orig_to_copied_node_mapping_[orig_node]; + GradNodeBase* copied_node = orig_to_copied_node_mapping_[orig_node].get(); const std::vector>& orig_edges = orig_node->GetEdges(); std::vector>& copied_edges = @@ -397,13 +397,12 @@ class GeneralGrad { std::shared_ptr copied_next_node; if (orig_to_copied_node_mapping_.count(orig_next_node.get())) { copied_next_node = - orig_to_copied_node_mapping_[orig_next_node.get()] - ->shared_from_this(); + orig_to_copied_node_mapping_[orig_next_node.get()]; } else { copied_next_node = orig_next_node->Copy(); orig_to_copied_node_mapping_[orig_next_node.get()] = - copied_next_node.get(); + copied_next_node; copied_grad_nodes_.push_back(copied_next_node); } @@ -436,7 +435,8 @@ class GeneralGrad { std::unordered_map results_map; std::vector> copied_grad_nodes_; - std::unordered_map orig_to_copied_node_mapping_; + std::unordered_map> + orig_to_copied_node_mapping_; DISABLE_COPY_AND_ASSIGN(GeneralGrad); }; @@ -534,6 +534,7 @@ std::vector RunBackward( // GeneralGrad bool is_general_grad = !inputs.empty(); + if (is_general_grad) GeneralGrad::Instance().Clear(); /* --- Initialization --- */ // 1. 
Init queue with starting nodes @@ -746,6 +747,7 @@ std::vector RunBackward( VLOG(6) << "We get grad_output_tensor with slot: " << i << ", rank: " << j << " as uninitialized or undefined tensor"; } + VLOG(6) << "Get Edge and grad_output_tensor with slot: " << i << ", rank: " << j << " 's name is: " << grad_output_tensor.name(); diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index decb682bf4517..201aae294f928 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -87,7 +87,7 @@ class GradSlotMeta { std::shared_ptr meta_ = nullptr; }; -class GradNodeBase : public std::enable_shared_from_this { +class GradNodeBase { public: GradNodeBase() { VLOG(6) << "Construct GradNodeBase"; } GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num); diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index b5dd6b960b23a..e42e04a266b46 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -79,9 +79,9 @@ class TensorWrapper { auto* tensor_autograd_meta = EagerUtils::nullable_autograd_meta(tensor); if (tensor_autograd_meta) { - auto autograd_meta = std::make_shared( - Edge(nullptr, EagerUtils::OutRankInfo(tensor))); - autograd_meta->SetStopGradient(tensor_autograd_meta->StopGradient()); + auto autograd_meta = + std::make_shared(*tensor_autograd_meta); + autograd_meta->ResetGradNode(); intermidiate_tensor_.set_autograd_meta(autograd_meta); weak_grad_node_ = tensor_autograd_meta->GetMutableGradNode(); } @@ -98,8 +98,11 @@ class TensorWrapper { check_inplace_version(); // if it's full_reserved just return the full copy of tensor - paddle::experimental::Tensor recovered_tensor = intermidiate_tensor_; - if (!full_reserved_) { + if (full_reserved_) { + return intermidiate_tensor_; + } else { + paddle::experimental::Tensor recovered_tensor = intermidiate_tensor_; + std::shared_ptr new_grad_node = weak_grad_node_.lock(); if (new_grad_node) { VLOG(3) << "Recovered TensorWrapper with GradNode " @@ -109,17 +112,15 @@ class TensorWrapper { } auto* intermediate_autograd_meta = EagerUtils::unsafe_autograd_meta(intermidiate_tensor_); - auto p_ab_autograd_meta = std::make_shared( - Edge(new_grad_node, intermediate_autograd_meta->OutRankInfo())); - p_ab_autograd_meta->SetStopGradient( - intermediate_autograd_meta->StopGradient()); - - recovered_tensor.set_autograd_meta( - std::static_pointer_cast( - p_ab_autograd_meta)); - } + auto p_ab_autograd_meta = + std::make_shared(*intermediate_autograd_meta); + if (new_grad_node) { + p_ab_autograd_meta->SetGradNode(new_grad_node); + } + recovered_tensor.set_autograd_meta(p_ab_autograd_meta); - return recovered_tensor; + return recovered_tensor; + } } void check_inplace_version() { diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 6e730c83d1d50..c51708bb54394 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -100,6 +100,8 @@ void GatherNdGradInferMeta(const MetaTensor& x, const MetaTensor& out_grad, MetaTensor* x_grad); +void GeneralUnaryGradInferMeta(const MetaTensor& x, MetaTensor* dx); + void GeneralBinaryGradInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* dx, @@ -132,8 +134,6 @@ void GeneralQuinaryGradInferMeta(const MetaTensor& x, MetaTensor* dk, MetaTensor* dl); -void GeneralUnaryGradInferMeta(const MetaTensor& x, MetaTensor* dx); - void GumbelSoftmaxGradInferMeta(const MetaTensor& out, const MetaTensor& dout, int axis, diff --git 
a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py index 8e027c270b700..93142c9112fd0 100644 --- a/python/paddle/autograd/functional.py +++ b/python/paddle/autograd/functional.py @@ -943,8 +943,10 @@ def func(x, y): # [0., 1., 0., 1., 0., 1., 0., 1.]])) ''' + inputs = _as_tensors(inputs) outputs = _as_tensors(func(*inputs)) + batch_size = inputs[0].shape[0] for input in inputs: assert input.shape[ @@ -961,12 +963,14 @@ def func(x, y): for i, flat_output in enumerate(flat_outputs): jac_i = list([] for _ in range(fin_size)) for k in range(flat_output.shape[1]): + row_k = paddle.grad( flat_output[:, k], inputs, create_graph=create_graph, retain_graph=True, allow_unused=allow_unused) + for j in range(fin_size): jac_i[j].append( paddle.reshape( diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py index 8c725fe24e59c..40aead9076569 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py @@ -205,7 +205,7 @@ def func_vjp_aliased_input(self): self.check_results(ref_result, aliased_result) def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): self.func_vjp_i1o1() self.func_vjp_i2o1() self.func_vjp_i2o2() @@ -213,6 +213,13 @@ def test_all_cases(self): self.func_vjp_nested() self.func_vjp_aliased_input() + self.func_vjp_i1o1() + self.func_vjp_i2o1() + self.func_vjp_i2o2() + self.func_vjp_i2o2_omitting_v() + self.func_vjp_nested() + self.func_vjp_aliased_input() + @utils.place(config.DEVICES) @utils.parameterize( @@ -227,8 +234,9 @@ def func_vjp(self): paddle.to_tensor(self.v)) def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): self.func_vjp() + self.func_vjp() def jac(grad_fn, f, inputs): @@ -303,11 +311,15 @@ def func_jvp_i2o2_omitting_v(self): self.check_results(results_omitting_v, results_with_v) def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): self.func_jvp_i1o1() self.func_jvp_i2o1() self.func_jvp_i2o2() self.func_jvp_i2o2_omitting_v() + self.func_jvp_i1o1() + self.func_jvp_i2o1() + self.func_jvp_i2o2() + self.func_jvp_i2o2_omitting_v() @utils.place(config.DEVICES) @@ -328,12 +340,12 @@ def setUp(self): self._atol = config.TOLERANCE.get(str(self._dtype)).get( "first_order_grad").get("atol") - self.xs = [paddle.to_tensor(x) for x in self.xs] if isinstance( + def func_jacobian(self): + xs = [paddle.to_tensor(x) for x in self.xs] if isinstance( self.xs, typing.Sequence) else paddle.to_tensor(self.xs) - self._actual = paddle.autograd.Jacobian(self.func, self.xs, False) - self._expected = self._expected() + self._actual = paddle.autograd.Jacobian(self.func, xs, False) + self._expected = self._get_expected() - def func_jacobian(self): Index = collections.namedtuple('Index', ('type', 'value')) indexes = (Index('all', (slice(0, None, None), slice(0, None, None))), Index('row', (0, slice(0, None, None))), @@ -349,14 +361,17 @@ def func_jacobian(self): err_msg=f'Testcase {index.type} index not passed, value is {index.value}' ) - def _expected(self): - jac = utils._compute_numerical_jacobian(self.func, self.xs, self._eps, + def _get_expected(self): + xs = [paddle.to_tensor(x) for x in self.xs] if isinstance( + self.xs, typing.Sequence) else paddle.to_tensor(self.xs) + jac = utils._compute_numerical_jacobian(self.func, xs, self._eps, self._dtype) 
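As a reading aid for the Jacobian test classes touched above, the following self-contained sketch (not part of the patch) shows the paddle.autograd.Jacobian API they exercise: build the Jacobian object, index it with slices, and compare against a known reference. The toy function, values, and tolerances are illustrative assumptions only; for an elementwise square the analytic Jacobian is diag(2x).

import numpy as np
import paddle

# Illustrative only: mirrors how the tests construct the Jacobian and index it.
x = paddle.to_tensor([1.0, 2.0, 3.0])
jac = paddle.autograd.Jacobian(lambda t: t * t, x, False)  # third argument: is_batched
expected = np.diag(2.0 * x.numpy())                        # analytic Jacobian of t * t
np.testing.assert_allclose(jac[:, :].numpy(), expected, rtol=1e-5, atol=1e-6)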
return utils._np_concat_matrix_sequence(jac, utils.MatrixFormat.NM) def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): self.func_jacobian() + self.func_jacobian() @utils.place(config.DEVICES) @@ -375,12 +390,12 @@ def setUp(self): self._atol = config.TOLERANCE.get(str(self._dtype)).get( "first_order_grad").get("atol") - self.xs = [paddle.to_tensor(x) for x in self.xs] if isinstance( + def func_jacobian(self): + xs = [paddle.to_tensor(x) for x in self.xs] if isinstance( self.xs, typing.Sequence) else paddle.to_tensor(self.xs) - self._actual = paddle.autograd.Jacobian(self.func, self.xs, True) - self._expected = self._expected() + self._actual = paddle.autograd.Jacobian(self.func, xs, True) + self._expected = self._get_expected() - def func_jacobian(self): Index = collections.namedtuple('Index', ('type', 'value')) indexes = ( Index('all', (slice(0, None, None), slice(0, None, None), @@ -402,16 +417,19 @@ def func_jacobian(self): err_msg=f'Testcase {index.type} index not passed, value is {index.value}' ) - def _expected(self): - jac = utils._compute_numerical_batch_jacobian( - self.func, self.xs, self._eps, self._dtype, False) + def _get_expected(self): + xs = [paddle.to_tensor(x) for x in self.xs] if isinstance( + self.xs, typing.Sequence) else paddle.to_tensor(self.xs) + jac = utils._compute_numerical_batch_jacobian(self.func, xs, self._eps, + self._dtype, False) jac = utils._np_concat_matrix_sequence(jac, utils.MatrixFormat.NBM) return utils._np_transpose_matrix_format(jac, utils.MatrixFormat.NBM, utils.MatrixFormat.BNM) def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): self.func_jacobian() + self.func_jacobian() class TestHessianClassNoBatch(unittest.TestCase): @@ -492,12 +510,19 @@ def func(x): paddle.autograd.Hessian(func, paddle.ones([3])) def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): + self.setUpClass() self.func_single_input() self.func_multi_input() self.func_allow_unused_true() self.func_create_graph_true() self.func_out_not_single() + self.setUpClass() + self.func_single_input() + self.func_multi_input() + self.func_allow_unused_true() + self.func_create_graph_true() + self.func_out_not_single() class TestHessianClassBatchFirst(unittest.TestCase): @@ -599,12 +624,19 @@ def func(x): paddle.autograd.Hessian(func, paddle.ones((3, 3)), is_batched=True) def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): + self.setUpClass() self.func_single_input() self.func_multi_input() self.func_allow_unused() self.func_stop_gradient() self.func_out_not_single() + self.setUpClass() + self.func_single_input() + self.func_multi_input() + self.func_allow_unused() + self.func_stop_gradient() + self.func_out_not_single() class TestHessian(unittest.TestCase): @@ -619,6 +651,7 @@ def setUpClass(self): "second_order_grad").get("rtol") self.atol = config.TOLERANCE.get(self.dtype).get( "second_order_grad").get("atol") + self.x = paddle.rand(shape=self.shape, dtype=self.dtype) self.y = paddle.rand(shape=self.shape, dtype=self.dtype) @@ -694,9 +727,10 @@ def func(x): self.rtol, self.atol) try: paddle.grad(hessian, self.x) - except RuntimeError as e: + except Exception as e: error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 + assert error_msg.find("has no gradient") > 0 or error_msg.find( + "does not appear") > 0 def func_create_graph_true(self): def func(x): @@ -713,13 +747,21 @@ def func(x): assert triple_grad is not None def test_all_cases(self): - 
if _in_legacy_dygraph(): + with _test_eager_guard(): + self.setUpClass() self.func_single_input() self.func_multi_input() self.func_allow_unused_false() self.func_allow_unused_true() self.func_create_graph_false() self.func_create_graph_true() + self.setUpClass() + self.func_single_input() + self.func_multi_input() + self.func_allow_unused_false() + self.func_allow_unused_true() + self.func_create_graph_false() + self.func_create_graph_true() class TestHessianFloat64(TestHessian): @@ -830,9 +872,10 @@ def func(x): self.rtol, self.atol) try: paddle.grad(hessian, self.x) - except RuntimeError as e: + except Exception as e: error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 + assert error_msg.find("has no gradient") > 0 or error_msg.find( + "does not appear") > 0 def func_create_graph_true(self): def func(x): @@ -849,13 +892,21 @@ def func(x): assert triple_grad is not None def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): + self.setUpClass() self.func_single_input() self.func_multi_input() self.func_allow_unused_false() self.func_allow_unused_true() self.func_create_graph_false() self.func_create_graph_true() + self.setUpClass() + self.func_single_input() + self.func_multi_input() + self.func_allow_unused_false() + self.func_allow_unused_true() + self.func_create_graph_false() + self.func_create_graph_true() class TestBatchHessianFloat64(TestBatchHessian): @@ -985,12 +1036,19 @@ def func(x): assert triple_grad is not None def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): + self.setUpClass() self.func_v_default() self.func_multi_input() self.func_single_input() self.func_allow_unused_true() self.func_create_graph_true() + self.setUpClass() + self.func_v_default() + self.func_multi_input() + self.func_single_input() + self.func_allow_unused_true() + self.func_create_graph_true() class TestJacobian(unittest.TestCase): @@ -1100,9 +1158,10 @@ def func(x, y): self.atol) try: paddle.grad(jacobian[0], [self.x, self.y]) - except RuntimeError as e: + except Exception as e: error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 + assert error_msg.find("has no gradient") > 0 or error_msg.find( + "does not appear") > 0 def func_create_graph_true(self): def func(x, y): @@ -1123,7 +1182,8 @@ def func(x, y): assert double_grad is not None def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): + self.setUpClass() self.func_multi_input_and_multi_output() self.func_multi_input_and_single_output() self.func_single_input_and_multi_output() @@ -1132,6 +1192,15 @@ def test_all_cases(self): self.func_allow_unused_true() self.func_create_graph_false() self.func_create_graph_true() + self.setUpClass() + self.func_multi_input_and_multi_output() + self.func_multi_input_and_single_output() + self.func_single_input_and_multi_output() + self.func_single_input_and_single_output() + self.func_allow_unused_false() + self.func_allow_unused_true() + self.func_create_graph_false() + self.func_create_graph_true() class TestJacobianFloat64(TestJacobian): @@ -1269,9 +1338,10 @@ def func(x, y): self.atol) try: paddle.grad(jacobian[0], [self.x, self.y]) - except RuntimeError as e: + except Exception as e: error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 + assert error_msg.find("has no gradient") > 0 or error_msg.find( + "does not appear") > 0 def func_create_graph_true(self): def func(x, y): @@ -1292,7 +1362,8 @@ def func(x, y): assert 
double_grad is not None def test_all_cases(self): - if _in_legacy_dygraph(): + with _test_eager_guard(): + self.setUpClass() self.func_batch_single_input_and_batch_single_output() self.func_batch_single_input_and_batch_multi_output() self.func_batch_multi_input_and_batch_single_output() @@ -1301,6 +1372,15 @@ def test_all_cases(self): self.func_allow_unused_true() self.func_create_graph_false() self.func_create_graph_true() + self.setUpClass() + self.func_batch_single_input_and_batch_single_output() + self.func_batch_single_input_and_batch_multi_output() + self.func_batch_multi_input_and_batch_single_output() + self.func_batch_multi_input_and_batch_multi_output() + self.func_allow_unused_false() + self.func_allow_unused_true() + self.func_create_graph_false() + self.func_create_graph_true() class TestJacobianBatchFloat64(TestJacobianBatch): diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 4af4ac52209ef..9c2074bbe3cda 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1195,7 +1195,15 @@ def t(input, name=None): "Input(input) only support N-D (N<=2) tensor, but received " "length of Input(input) is %s. Perhaps you can use paddle." "tensor.transpose() instead." % len(input.shape)) - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + if len(input.shape) == 1: + return input + # 2-D tensor + perm = [1, 0] + out = _C_ops.final_state_transpose(input, perm) + return out + + if _in_legacy_dygraph(): if len(input.shape) == 1: return input # 2-D tensor diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 97c9c7ddf1584..a7b29b9f5aefc 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -1097,6 +1097,7 @@ kernel : func : multiply_double_grad optional : grad_x_grad, grad_y_grad + backward : multiply_triple_grad - backward_api : multiply_grad forward : multiply (Tensor x, Tensor y) -> Tensor(out) @@ -1109,6 +1110,17 @@ func : multiply_grad backward : multiply_double_grad +- backward_api : multiply_triple_grad + forward : multiply_double_grad (Tensor x, Tensor y, Tensor fwd_grad_out, Tensor fwd_grad_grad_x, Tensor fwd_grad_grad_y, int aixs = -1) -> Tensor(grad_x), Tensor(grad_y), Tensor(grad_grad_out) + args : (Tensor x, Tensor y, Tensor fwd_grad_out, Tensor fwd_grad_grad_x, Tensor fwd_grad_grad_y, Tensor grad_x_grad, Tensor grad_y_grad, Tensor grad_grad_out_grad, int axis = -1) + output : Tensor(x_grad), Tensor(y_grad), Tensor(fwd_grad_out_grad), Tensor(fwd_grad_grad_x_grad), Tensor(fwd_grad_grad_y_grad) + infer_meta : + func : GeneralQuinaryGradInferMeta + param : [x, y, fwd_grad_out, x, y] + kernel : + func : multiply_triple_grad + optional : fwd_grad_grad_x, fwd_grad_grad_y, grad_grad_out_grad + - backward_api : mv_grad forward : mv (Tensor x, Tensor vec) -> Tensor(out) args : (Tensor x, Tensor vec, Tensor out_grad) @@ -1286,6 +1298,16 @@ func : relu_grad backward: relu_double_grad +- backward_api : reshape_double_grad + forward : reshape_grad (Tensor xshape, Tensor grad_out) -> Tensor(grad_x) + args : (Tensor grad_out, Tensor grad_x_grad) + output : Tensor(grad_out_grad) + infer_meta : + func : UnchangedInferMeta + param : [grad_out] + kernel : + func : reshape_double_grad + - backward_api : reshape_grad forward : reshape_with_xshape (Tensor x, IntArray shape) -> Tensor(out), Tensor(xshape) args : (Tensor xshape, Tensor out_grad) @@ -1299,6 +1321,7 @@ data_type: out_grad backend: out_grad layout: out_grad + backward : 
reshape_double_grad - backward_api : roi_align_grad forward : roi_align (Tensor x, Tensor boxes, Tensor boxes_num, int pooled_height, int pooled_width, float spatial_scale, int sampling_ratio, bool aligned) -> Tensor(out) @@ -1592,6 +1615,13 @@ func : subtract_grad no_need_buffer : x, y +- backward_api : sum_double_grad + forward : sum_grad (Tensor x, Tensor grad_out, int64_t[] dims, bool keep_dim, bool reduce_all=false) -> Tensor(grad_x) + args : (Tensor grad_x_grad, int64_t[] dims={}, bool keep_dim=false) + output : Tensor(grad_out_grad) + invoke : sum(grad_x_grad, dims, grad_x_grad.dtype(), keep_dim) + backward : sum_triple_grad + - backward_api : sum_grad forward : sum (Tensor x, int64_t[] dims={}, DataType out_dtype=paddle::experimental::DataType::UNDEFINED, bool keep_dim=false) -> Tensor(out) args : (Tensor x, Tensor out_grad, int64_t[] dims, bool keep_dim, bool reduce_all=false) @@ -1601,6 +1631,13 @@ param : [x] kernel : func : sum_grad + backward : sum_double_grad + +- backward_api : sum_triple_grad + forward : sum_double_grad (Tensor grad_grad_x, int64_t[] dims={}, bool keep_dim=false) -> Tensor(grad_grad_out) + args : (Tensor grad_grad_x, Tensor grad_grad_out_grad, int64_t[] dims={}, bool keep_dim=false, bool reduce_all=false) + output : Tensor(grad_grad_x_grad) + invoke : sum_grad(grad_grad_x, grad_grad_out_grad, dims, keep_dim, reduce_all) no_need_buffer : x - backward_api : swish_grad @@ -1695,6 +1732,12 @@ func : trace_grad no_need_buffer : x +- backward_api : transpose_double_grad + forward : transpose_grad (Tensor grad_out, int[] axis) -> Tensor(grad_x) + args : (Tensor grad_x_grad, int[] axis) + output : Tensor(grad_out_grad) + invoke : transpose(grad_x_grad, axis) + - backward_api : transpose_grad forward : transpose (Tensor x, int[] axis) -> Tensor(out) args : (Tensor out_grad, int[] axis) @@ -1704,6 +1747,7 @@ param : [out_grad, axis] kernel : func : transpose_grad + backward : transpose_double_grad - backward_api : triangular_solve_grad forward : triangular_solve (Tensor x, Tensor y, bool upper, bool tranpose, bool unitriangular) -> Tensor(out)
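The double- and triple-grad registrations added to backward.yaml above (for example sum_double_grad, transpose_double_grad and multiply_triple_grad) are what let a second paddle.grad pass differentiate through the corresponding first-order grad ops in eager mode. The smoke test below is a hypothetical sketch, not part of the patch, and assumes _test_eager_guard is importable as in the patched unit tests; for y = sum(t * t) with t = transpose(x), the first-order gradient is 2x and the second-order gradient is 2 everywhere.

import paddle
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():  # eager (final-state) mode, as in the test changes above
    x = paddle.rand([2, 3])
    x.stop_gradient = False
    t = paddle.transpose(x, perm=[1, 0])
    y = (t * t).sum()                                  # transpose -> multiply -> sum
    dx = paddle.grad([y], [x], create_graph=True)[0]   # first order: dx == 2 * x
    ddx = paddle.grad([dx.sum()], [x])[0]              # second order: ddx == 2 everywhere
    print(ddx.numpy())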