diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bf9aa6e915a46..dfe1e3848ee5a 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -83,65 +83,34 @@ repos: # exclude files which need to be fixed exclude: | (?x)^( - cmake/generic.cmake| CMakeLists.txt| - paddle/fluid/pybind/CMakeLists.txt| - python/paddle/fluid/tests/unittests/CMakeLists.txt| - paddle/fluid/eager/auto_code_generator/CMakeLists.txt| - paddle/fluid/framework/CMakeLists.txt| - paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt| - cmake/third_party.cmake| + python/paddle/fluid/tests/unittests/CMakeLists.txt| paddle/fluid/inference/tests/infer_ut/CMakeLists.txt| cmake/configure.cmake| paddle/fluid/inference/api/demo_ci/CMakeLists.txt| cmake/flags.cmake| cmake/inference_lib.cmake| cmake/external/protobuf.cmake| - cmake/system.cmake| - cmake/cudnn.cmake| - cmake/external/mkldnn.cmake| - cmake/unity_build.cmake| paddle/fluid/framework/fleet/CMakeLists.txt| paddle/fluid/inference/CMakeLists.txt| paddle/fluid/inference/tests/api/CMakeLists.txt| paddle/fluid/operators/CMakeLists.txt| - paddle/phi/api/lib/CMakeLists.txt| - cmake/external/gflags.cmake| cmake/external/lite.cmake| cmake/external/poplar.cmake| cmake/python_module.cmake| python/paddle/fluid/tests/unittests/asp/CMakeLists.txt| cmake/cuda.cmake| cmake/FindNumPy.cmake| - cmake/phi.cmake| - paddle/fluid/framework/ir/CMakeLists.txt| - paddle/fluid/platform/CMakeLists.txt| - python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt| - python/paddle/tests/CMakeLists.txt| - cmake/ccache.cmake| cmake/coveralls.cmake| cmake/external/glog.cmake| cmake/external/onnxruntime.cmake| cmake/external/openblas.cmake| cmake/external/xpu.cmake| cmake/hip.cmake| - paddle/fluid/distributed/CMakeLists.txt| - paddle/fluid/framework/details/CMakeLists.txt| - paddle/fluid/imperative/CMakeLists.txt| paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt| paddle/fluid/inference/api/CMakeLists.txt| paddle/fluid/operators/controlflow/CMakeLists.txt| python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt| - cmake/cblas.cmake| - cmake/coverallsGcovJsons.cmake| - cmake/external/brpc.cmake| - cmake/external/cryptopp.cmake| - cmake/external/gtest.cmake| - cmake/external/llvm.cmake| - cmake/external/utf8proc.cmake| - cmake/external/warpctc.cmake| - cmake/external/zlib.cmake| - cmake/FindGperftools.cmake| cmake/operators.cmake| cmake/tensorrt.cmake| paddle/fluid/inference/api/details/CMakeLists.txt| @@ -154,28 +123,13 @@ repos: cmake/miopen.cmake| cmake/nccl.cmake| cmake/simd.cmake| - paddle/fluid/distributed/fleet_executor/CMakeLists.txt| - paddle/fluid/eager/api/generated/fluid_generated/forwards/CMakeLists.txt| - paddle/fluid/framework/io/CMakeLists.txt| - paddle/fluid/imperative/tests/CMakeLists.txt| paddle/fluid/inference/analysis/CMakeLists.txt| paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake| paddle/fluid/memory/allocation/CMakeLists.txt| paddle/fluid/memory/CMakeLists.txt| paddle/fluid/operators/cinn/CMakeLists.txt| - paddle/fluid/operators/collective/CMakeLists.txt| - paddle/fluid/operators/ipu/CMakeLists.txt| - paddle/fluid/operators/jit/CMakeLists.txt| - paddle/fluid/operators/pscore/CMakeLists.txt| - paddle/fluid/platform/device/ipu/CMakeLists.txt| - paddle/fluid/platform/dynload/CMakeLists.txt| paddle/infrt/external_kernels/CMakeLists.txt| paddle/infrt/kernel/phi/CMakeLists.txt| - paddle/phi/backends/dynload/CMakeLists.txt| - paddle/phi/CMakeLists.txt| - paddle/phi/kernels/CMakeLists.txt| - 
paddle/phi/tests/core/CMakeLists.txt| - python/CMakeLists.txt| python/paddle/fluid/contrib/slim/tests/CMakeLists.txt| python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt| python/paddle/fluid/tests/unittests/distribution/CMakeLists.txt| diff --git a/CMakeLists.txt b/CMakeLists.txt index ba438a74718f2..a3e0b64e97b25 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,7 @@ option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF) +option(WITH_CUSPARSELT "Compile PaddlePaddle with CUSPARSELT" OFF) # Note(zhouwei): It use option above, so put here include(init) include(generic) # simplify cmake module diff --git a/cmake/ccache.cmake b/cmake/ccache.cmake index 85bc0e987a6b6..08b6720416fe2 100644 --- a/cmake/ccache.cmake +++ b/cmake/ccache.cmake @@ -13,7 +13,7 @@ if(NOT WIN32) ${cache_directory}) set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PATH}) set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_PATH}) - endif(CCACHE_PATH) + endif() elseif("${CMAKE_GENERATOR}" STREQUAL "Ninja") # (Note:zhouwei25) Only Ninja Generator can support sccache now find_program(SCCACHE_PATH sccache) @@ -30,5 +30,5 @@ elseif("${CMAKE_GENERATOR}" STREQUAL "Ninja") # (Note:zhouwei25) sccache for cuda compiler has bug so that it can't be hit # refer to https://github.com/mozilla/sccache/issues/1017, so we fix it set(CMAKE_CUDA_COMPILER_LAUNCHER ${SCCACHE_PATH}) - endif(SCCACHE_PATH) + endif() endif() diff --git a/cmake/coverallsGcovJsons.cmake b/cmake/coverallsGcovJsons.cmake index 6c1186f69f14d..c31b2457c1742 100644 --- a/cmake/coverallsGcovJsons.cmake +++ b/cmake/coverallsGcovJsons.cmake @@ -141,9 +141,9 @@ file(GLOB_RECURSE ALL_GCOV_FILES "${COV_PATH}" "*.gcov") # Get only the filenames to use for filtering. 
#set(COVERAGE_SRCS_NAMES "") #foreach (COVSRC ${COVERAGE_SRCS}) -# get_filename_component(COVSRC_NAME ${COVSRC} NAME) -# message("${COVSRC} -> ${COVSRC_NAME}") -# list(APPEND COVERAGE_SRCS_NAMES "${COVSRC_NAME}") +# get_filename_component(COVSRC_NAME ${COVSRC} NAME) +# message("${COVSRC} -> ${COVSRC_NAME}") +# list(APPEND COVERAGE_SRCS_NAMES "${COVSRC_NAME}") #endforeach() # @@ -155,15 +155,15 @@ file(GLOB_RECURSE ALL_GCOV_FILES "${COV_PATH}" "*.gcov") # # Example: # COVERAGE_SRCS = -# /path/to/project/root/subdir/the_file.c +# /path/to/project/root/subdir/the_file.c # # ALL_GCOV_FILES = -# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov -# /path/to/project/root/build/#path#to#project#root#subdir#other_file.c.gcov +# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov +# /path/to/project/root/build/#path#to#project#root#subdir#other_file.c.gcov # # Result should be: # GCOV_FILES = -# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov +# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov # set(GCOV_FILES "") #message("Look in coverage sources: ${COVERAGE_SRCS}") diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 2e5131d217a50..34cc9c8199b37 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -4,11 +4,11 @@ endif() if(WIN32) set(CUDNN_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) -else(WIN32) +else() set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT") -endif(WIN32) +endif() find_path( CUDNN_INCLUDE_DIR cudnn.h @@ -41,16 +41,16 @@ set(CUDNN_LIB_NAME "") if(LINUX) set(CUDNN_LIB_NAME "libcudnn.so") -endif(LINUX) +endif() if(WIN32) # only support cudnn7 set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll") -endif(WIN32) +endif() if(APPLE) set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so") -endif(APPLE) +endif() find_library( CUDNN_LIBRARY diff --git a/cmake/external/cryptopp.cmake b/cmake/external/cryptopp.cmake index ff4d3b5c9ea9e..9daa4be7468e4 100644 --- a/cmake/external/cryptopp.cmake +++ b/cmake/external/cryptopp.cmake @@ -35,11 +35,11 @@ if(WIN32) ${CMAKE_COMMAND} -E copy_if_different "${PADDLE_SOURCE_DIR}/patches/cryptopp/CMakeLists.txt" "/") endif() -else(WIN32) +else() set(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/libcryptopp.a" CACHE FILEPATH "cryptopp library." FORCE) -endif(WIN32) +endif() if(APPLE AND WITH_ARM) set(CMAKE_CXX_FLAGS "-DCRYPTOPP_ARM_CRC32_AVAILABLE=0") diff --git a/cmake/external/cusparselt.cmake b/cmake/external/cusparselt.cmake new file mode 100644 index 0000000000000..8ab1275cb62f0 --- /dev/null +++ b/cmake/external/cusparselt.cmake @@ -0,0 +1,61 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
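+#
+# Descriptive note on this new module (summary of the code below): when both
+# WITH_CUSPARSELT and WITH_TENSORRT are ON (Linux, non-ARM only), it downloads
+# the prebuilt cuSPARSELt 0.2.0.1 tarball, copies its include/ and lib64/
+# contents into third_party/install/cusparselt, defines
+# PADDLE_WITH_CUSPARSELT, and exposes an INTERFACE target named `cusparselt`
+# that other targets can depend on.
+#
+# Illustrative usage sketch only; `demo_sparse_op` is a hypothetical target
+# name, not part of this patch:
+#   if(WITH_CUSPARSELT)
+#     target_link_libraries(demo_sparse_op cusparselt)
+#     target_link_directories(demo_sparse_op PRIVATE ${CUSPARSELT_LIB_DIR})
+#   endif()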
+ +if(NOT (WITH_CUSPARSELT AND WITH_TENSORRT)) + return() +endif() + +if(WITH_ARM OR WIN32) + message(SEND_ERROR "The current sparselt support linux only") + return() +endif() + +include(ExternalProject) + +set(CUSPARSELT_PROJECT "extern_cusparselt") +set(CUSPARSELT_P "https://developer.download.nvidia.com/compute") +set(CUSPARSELT_F "libcusparse_lt-linux-x86_64-0.2.0.1.tar.gz") +set(CUSPARSELT_URL + "${CUSPARSELT_P}/libcusparse-lt/0.2.0/local_installers/${CUSPARSELT_F}" + CACHE STRING "" FORCE) +set(CUSPARSELT_PREFIX_DIR ${THIRD_PARTY_PATH}/cusparselt) +set(CUSPARSELT_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cusparselt) +set(CUSPARSELT_INC_DIR + "${CUSPARSELT_INSTALL_DIR}/include" + CACHE PATH "sparselt include directory." FORCE) +set(CUSPARSELT_LIB_DIR + "${CUSPARSELT_INSTALL_DIR}/lib64" + CACHE PATH "sparselt lib directory." FORCE) +set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1) +include_directories(${CUSPARSELT_INC_DIR}) + +ExternalProject_Add( + ${CUSPARSELT_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${CUSPARSELT_URL} + PREFIX ${CUSPARSELT_PREFIX_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND + ${CMAKE_COMMAND} -E copy_directory + ${CUSPARSELT_PREFIX_DIR}/src/extern_cusparselt/lib64 ${CUSPARSELT_LIB_DIR} + && ${CMAKE_COMMAND} -E copy_directory + ${CUSPARSELT_PREFIX_DIR}/src/extern_cusparselt/include ${CUSPARSELT_INC_DIR} + UPDATE_COMMAND "") + +add_library(cusparselt INTERFACE) +add_dependencies(cusparselt ${CUSPARSELT_PROJECT}) +set(CUSPARSELT_FOUND ON) +add_definitions(-DPADDLE_WITH_CUSPARSELT) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 783e1c0d442f7..68255b4d60057 100755 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -25,13 +25,13 @@ if(WIN32) set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) -else(WIN32) +else() set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) set(BUILD_COMMAND $(MAKE) --silent) set(INSTALL_COMMAND $(MAKE) install) -endif(WIN32) +endif() include_directories(${GFLAGS_INCLUDE_DIR}) @@ -101,5 +101,5 @@ if(WIN32) check_include_file_cxx("shlwapi.h" HAVE_SHLWAPI) if(HAVE_SHLWAPI) set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib) - endif(HAVE_SHLWAPI) -endif(WIN32) + endif() +endif() diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 00527ceecdc1f..3833a6ca868c7 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -42,7 +42,7 @@ if(WIN32) string(REPLACE "/w " "" GTEST_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "/W0 " "" GTEST_CMAKE_C_FLAGS "${GTEST_CMAKE_C_FLAGS}") string(REPLACE "/W0 " "" GTEST_CMAKE_CXX_FLAGS "${GTEST_CMAKE_CXX_FLAGS}") -else(WIN32) +else() set(GTEST_LIBRARIES "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE) @@ -51,7 +51,7 @@ else(WIN32) CACHE FILEPATH "gtest main libraries." FORCE) set(GTEST_CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") set(GTEST_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -endif(WIN32) +endif() if(WITH_MKLML) # wait for mklml downloading completed diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index dfa20dd631fc6..909d223060f52 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -59,7 +59,7 @@ else() set(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/bin/mkldnn.lib" CACHE FILEPATH "mkldnn library." 
FORCE) -endif(NOT WIN32) +endif() ExternalProject_Add( ${MKLDNN_PROJECT} @@ -121,7 +121,7 @@ if(WIN32) DEPENDS ${MKLDNN_PROJECT} VERBATIM) add_custom_target(mkldnn_cmd ALL DEPENDS ${MKLDNN_LIB}) -else(WIN32) +else() set(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) set(MKLDNN_SHARED_LIB_1 ${MKLDNN_INSTALL_DIR}/libdnnl.so.1) set(MKLDNN_SHARED_LIB_2 ${MKLDNN_INSTALL_DIR}/libdnnl.so.2) @@ -132,7 +132,7 @@ else(WIN32) COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB_2} DEPENDS ${MKLDNN_PROJECT}) add_custom_target(mkldnn_cmd ALL DEPENDS ${MKLDNN_SHARED_LIB_2}) -endif(WIN32) +endif() # generate a static dummy target to track mkldnn dependencies # for cc_library(xxx SRCS xxx.c DEPS mkldnn) diff --git a/cmake/external/utf8proc.cmake b/cmake/external/utf8proc.cmake index 13107c03cf171..753c22fe2655e 100644 --- a/cmake/external/utf8proc.cmake +++ b/cmake/external/utf8proc.cmake @@ -23,9 +23,9 @@ set(UTF8PROC_TAG v2.6.1) if(WIN32) set(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/utf8proc_static.lib") add_definitions(-DUTF8PROC_STATIC) -else(WIN32) +else() set(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/libutf8proc.a") -endif(WIN32) +endif() include_directories(${UTF8PROC_INSTALL_DIR}/include) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index d38636c9c23a8..c7a4e1d99bff1 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -37,11 +37,11 @@ if(WIN32) set(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) -else(WIN32) +else() set(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) -endif(WIN32) +endif() if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index 2cef053e32547..c6ad1506bbe48 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -34,11 +34,11 @@ if(WIN32) set(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE) -else(WIN32) +else() set(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." 
FORCE) -endif(WIN32) +endif() ExternalProject_Add( extern_zlib diff --git a/cmake/generic.cmake b/cmake/generic.cmake index a6a7ab983b9f6..838fbd71c25cd 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -116,7 +116,7 @@ function(find_fluid_modules TARGET_NAME) set(fluid_modules ${fluid_modules} ${TARGET_NAME}) set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}") endif() -endfunction(find_fluid_modules) +endfunction() set_property(GLOBAL PROPERTY PHI_MODULES "") # find all phi modules is used for paddle static library @@ -131,7 +131,7 @@ function(find_phi_modules TARGET_NAME) set(phi_modules ${phi_modules} ${TARGET_NAME}) set_property(GLOBAL PROPERTY PHI_MODULES "${phi_modules}") endif() -endfunction(find_phi_modules) +endfunction() function(common_link TARGET_NAME) if(WITH_PROFILER) @@ -152,7 +152,7 @@ function(find_fluid_thirdparties TARGET_NAME) set(fluid_third_partys ${fluid_third_partys} ${TARGET_NAME}) set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY "${fluid_third_partys}") endif() -endfunction(find_fluid_thirdparties) +endfunction() function(create_static_lib TARGET_NAME) set(libs ${ARGN}) @@ -315,7 +315,7 @@ function(check_coverage_opt TARGET_NAME SRCS) set(use_coverage_opt TRUE) break() endif() - endforeach(cc_file) + endforeach() if(use_coverage_opt) message(STATUS "cc changed, add coverage opt for ${TARGET_NAME}") @@ -331,7 +331,7 @@ function(check_coverage_opt TARGET_NAME SRCS) endif() endif() endif() -endfunction(check_coverage_opt) +endfunction() function(cc_library TARGET_NAME) set(options STATIC static SHARED shared INTERFACE interface) @@ -344,7 +344,7 @@ function(cc_library TARGET_NAME) set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}") - endif(WIN32) + endif() if(cc_library_SRCS) if(cc_library_SHARED OR cc_library_shared) # build *.so add_library(${TARGET_NAME} SHARED ${cc_library_SRCS}) @@ -372,10 +372,10 @@ function(cc_library TARGET_NAME) add_dependencies(${TARGET_NAME} mklml) if(WIN32) target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB}) - else(WIN32) + else() target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") - endif(WIN32) + endif() endif() # remove link to python, see notes at: # https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually @@ -386,7 +386,7 @@ function(cc_library TARGET_NAME) target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES}) else() target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup") - endif(WIN32) + endif() endif() target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) common_link(${TARGET_NAME}) @@ -402,7 +402,7 @@ function(cc_library TARGET_NAME) check_coverage_opt(${TARGET_NAME} ${cc_library_SRCS}) - else(cc_library_SRCS) + else() if(cc_library_DEPS) list(REMOVE_DUPLICATES cc_library_DEPS) @@ -417,8 +417,8 @@ function(cc_library TARGET_NAME) "Please specify source files or libraries in cc_library(${TARGET_NAME} ...)." 
) endif() - endif(cc_library_SRCS) -endfunction(cc_library) + endif() +endfunction() function(cc_binary TARGET_NAME) set(options "") @@ -440,7 +440,7 @@ function(cc_binary TARGET_NAME) check_coverage_opt(${TARGET_NAME} ${cc_binary_SRCS}) -endfunction(cc_binary) +endfunction() function(cc_test_build TARGET_NAME) if(WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") @@ -454,7 +454,7 @@ function(cc_test_build TARGET_NAME) list(REMOVE_ITEM cc_test_DEPS python) target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES}) endif() - endif(WIN32) + endif() get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries( ${TARGET_NAME} @@ -539,7 +539,7 @@ function(cc_test TARGET_NAME) add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E echo CI skip ${TARGET_NAME}.) endif() -endfunction(cc_test) +endfunction() function(nv_library TARGET_NAME) if(WITH_GPU) @@ -572,7 +572,7 @@ function(nv_library TARGET_NAME) ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) endif() endforeach() - else(nv_library_SRCS) + else() if(nv_library_DEPS) list(REMOVE_DUPLICATES nv_library_DEPS) generate_dummy_static_lib( @@ -584,7 +584,7 @@ function(nv_library TARGET_NAME) else() message(FATAL "Please specify source file or library in nv_library.") endif() - endif(nv_library_SRCS) + endif() if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910)) @@ -592,7 +592,7 @@ function(nv_library TARGET_NAME) ${WIN_PROPS}) endif() endif() -endfunction(nv_library) +endfunction() function(nv_binary TARGET_NAME) if(WITH_GPU) @@ -608,13 +608,13 @@ function(nv_binary TARGET_NAME) common_link(${TARGET_NAME}) endif() if((CUDA_VERSION GREATER 9.2) - AND (CUDA_VERSION LESS 11.0) - AND (MSVC_VERSION LESS 1910)) + AND (CUDA_VERSION LESS 11.0) + AND (MSVC_VERSION LESS 1910)) set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS}) endif() endif() -endfunction(nv_binary) +endfunction() function(nv_test TARGET_NAME) # The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation @@ -667,7 +667,7 @@ function(nv_test TARGET_NAME) ${WIN_PROPS}) endif() endif() -endfunction(nv_test) +endfunction() function(hip_library TARGET_NAME) if(WITH_ROCM) @@ -702,7 +702,7 @@ function(hip_library TARGET_NAME) ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) endif() endforeach() - else(hip_library_SRCS) + else() if(hip_library_DEPS) list(REMOVE_DUPLICATES hip_library_DEPS) generate_dummy_static_lib( @@ -714,9 +714,9 @@ function(hip_library TARGET_NAME) else() message(FATAL "Please specify source file or library in hip_library.") endif() - endif(hip_library_SRCS) + endif() endif() -endfunction(hip_library) +endfunction() function(hip_binary TARGET_NAME) if(WITH_ROCM) @@ -733,7 +733,7 @@ function(hip_binary TARGET_NAME) common_link(${TARGET_NAME}) endif() endif() -endfunction(hip_binary) +endfunction() function(hip_test TARGET_NAME) # The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation @@ -746,7 +746,8 @@ function(hip_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - # FindHIP.cmake defined hip_add_executable, HIP_SOURCE_PROPERTY_FORMAT is requried for .cc files + # FindHIP.cmake defined hip_add_executable, + # HIP_SOURCE_PROPERTY_FORMAT is required for .cc files hip_add_executable(${TARGET_NAME} ${hip_test_SRCS}) # "-pthread -ldl -lrt" is defined in CMAKE_CXX_LINK_EXECUTABLE target_link_options(${TARGET_NAME} PRIVATE -pthread -ldl -lrt) @@ -785,7 +786,7 @@
function(hip_test TARGET_NAME) "LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/python/paddle/libs:$LD_LIBRARY_PATH" ) endif() -endfunction(hip_test) +endfunction() function(xpu_library TARGET_NAME) if(WITH_XPU_KP) @@ -817,7 +818,7 @@ function(xpu_library TARGET_NAME) ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) endif() endforeach() - else(xpu_library_SRCS) + else() if(xpu_library_DEPS) list(REMOVE_DUPLICATES xpu_library_DEPS) generate_dummy_static_lib( @@ -828,9 +829,9 @@ function(xpu_library TARGET_NAME) else() message(FATAL "Please specify source file or library in xpu_library.") endif() - endif(xpu_library_SRCS) + endif() endif() -endfunction(xpu_library) +endfunction() function(xpu_binary TARGET_NAME) if(WITH_XPU_KP) @@ -846,7 +847,7 @@ function(xpu_binary TARGET_NAME) common_link(${TARGET_NAME}) endif() endif() -endfunction(xpu_binary) +endfunction() function(xpu_test TARGET_NAME) # The environment variable `CI_SKIP_CPP_TEST` is used to skip the compilation @@ -891,7 +892,7 @@ function(xpu_test TARGET_NAME) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) endif() -endfunction(xpu_test) +endfunction() function(go_library TARGET_NAME) set(options STATIC static SHARED shared) @@ -934,7 +935,7 @@ function(go_library TARGET_NAME) if(go_library_DEPS) add_dependencies(${TARGET_NAME} ${go_library_DEPS}) common_link(${TARGET_NAME}) - endif(go_library_DEPS) + endif() # The "source file" of the library is `${dummyfile}` which never # change, so the target will never rebuild. Make the target depends @@ -965,7 +966,7 @@ function(go_library TARGET_NAME) # must run under GOPATH WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") add_dependencies(${TARGET_NAME} go_vendor) -endfunction(go_library) +endfunction() function(go_binary TARGET_NAME) set(options OPTIONAL) @@ -990,7 +991,7 @@ function(go_binary TARGET_NAME) check_coverage_opt(${TARGET_NAME} ${go_binary_SRCS}) -endfunction(go_binary) +endfunction() function(go_test TARGET_NAME) set(options OPTIONAL) @@ -1013,7 +1014,7 @@ function(go_test TARGET_NAME) NAME ${TARGET_NAME} COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) -endfunction(go_test) +endfunction() # Modification of standard 'protobuf_generate_cpp()' with protobuf-lite support # Usage: @@ -1146,7 +1147,7 @@ function(grpc_library TARGET_NAME) get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE) get_filename_component(PROTO_PATH ${ABS_PROTO} PATH) - #FIXME(putcn): the follwoing line is supposed to generate *.pb.h and cc, but + # FIXME(putcn): the follwoing line is supposed to generate *.pb.h and cc, but # somehow it didn't. line 602 to 604 is to patching this. Leaving this here # for now to enable dist CI. 
paddle_protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}") diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 14ae8efb5b4f8..a8e3696418bd4 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -108,6 +108,14 @@ function(copy_part_of_thrid_party TARGET DST) SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include DSTS ${dst_dir} ${dst_dir}) endif() + + if(WITH_SPARSELT) + set(dst_dir "${DST}/third_party/install/cusparselt") + copy( + ${TARGET} + SRCS ${CUSPARSELT_INC_DIR} ${CUSPARSELT_LIB_DIR} + DSTS ${dst_dir} ${dst_dir}) + endif() endif() if(WITH_MKLDNN) diff --git a/cmake/phi.cmake b/cmake/phi.cmake index 4555d892f11ce..82d04f0c4695d 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -61,7 +61,9 @@ endfunction() # call kernel_declare need to make sure whether the target of input exists function(kernel_declare TARGET_LIST) + # message("TARGET LIST ${TARGET_LIST}") foreach(kernel_path ${TARGET_LIST}) + # message("kernel path ${kernel_path}" ) file(READ ${kernel_path} kernel_impl) string( REGEX @@ -111,6 +113,7 @@ function(kernel_declare TARGET_LIST) endfunction() function(kernel_library TARGET) + return() set(common_srcs) set(cpu_srcs) set(gpu_srcs) diff --git a/cmake/system.cmake b/cmake/system.cmake index 0562077eae187..7df5f8a4b6c12 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -23,11 +23,11 @@ if(UNIX AND NOT APPLE) # except apple from nix*Os family set(LINUX TRUE) -endif(UNIX AND NOT APPLE) +endif() if(WIN32) set(HOST_SYSTEM "win32") -else(WIN32) +else() if(APPLE) set(HOST_SYSTEM "macosx") exec_program( @@ -45,7 +45,7 @@ else(WIN32) ) endif() set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security") - else(APPLE) + else() if(EXISTS "/etc/issue") file(READ "/etc/issue" LINUX_ISSUE) @@ -63,21 +63,21 @@ else(WIN32) string(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" HOST_SYSTEM_VERSION "${LINUX_ISSUE}") - endif(EXISTS "/etc/issue") + endif() if(EXISTS "/etc/redhat-release") file(READ "/etc/redhat-release" LINUX_ISSUE) if(LINUX_ISSUE MATCHES "CentOS") set(HOST_SYSTEM "centos") endif() - endif(EXISTS "/etc/redhat-release") + endif() if(NOT HOST_SYSTEM) set(HOST_SYSTEM ${CMAKE_SYSTEM_NAME}) endif() - endif(APPLE) -endif(WIN32) + endif() +endif() # query number of logical cores cmake_host_system_information(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 2004241ab1a76..af76f3ffbe027 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -356,17 +356,17 @@ if(WITH_GPU) COMMAND ${CMAKE_COMMAND} -E copy_directory ${SRC_DIR} ${DST_DIR2} COMMENT "copy_directory from ${SRC_DIR} to ${DST_DIR}") endif() -endif(WITH_GPU) +endif() if(WITH_XPU) include(external/xpu) # download, build, install xpu list(APPEND third_party_deps extern_xpu) -endif(WITH_XPU) +endif() if(WITH_MLU) include(external/concurrentqueue) # download, build, install concurrentqueue list(APPEND third_party_deps extern_concurrentqueue) -endif(WITH_MLU) +endif() if(WITH_PSLIB) include(external/pslib) # download, build, install pslib @@ -389,7 +389,7 @@ if(WITH_PSLIB) list(APPEND third_party_deps extern_brpc) endif() endif() -endif(WITH_PSLIB) +endif() if(NOT WIN32 AND NOT APPLE) include(external/gloo) @@ -399,7 +399,7 @@ endif() if(WITH_BOX_PS) include(external/box_ps) list(APPEND third_party_deps extern_box_ps) -endif(WITH_BOX_PS) +endif() if(WITH_ASCEND OR WITH_ASCEND_CL) include(external/ascend) @@ -453,7 +453,7 @@ endif() if(WITH_LITE) message(STATUS "Compile 
Paddle with Lite Engine.") include(external/lite) -endif(WITH_LITE) +endif() if(WITH_CINN) message(STATUS "Compile Paddle with CINN.") @@ -462,29 +462,29 @@ if(WITH_CINN) if(WITH_GPU) add_definitions(-DCINN_WITH_CUDA) add_definitions(-DCINN_WITH_CUDNN) - endif(WITH_GPU) + endif() if(WITH_MKL) add_definitions(-DCINN_WITH_MKL_CBLAS) add_definitions(-DCINN_WITH_MKLDNN) - endif(WITH_MKL) -endif(WITH_CINN) + endif() +endif() if(WITH_CRYPTO) include(external/cryptopp) # download, build, install cryptopp list(APPEND third_party_deps extern_cryptopp) add_definitions(-DPADDLE_WITH_CRYPTO) -endif(WITH_CRYPTO) +endif() if(WITH_POCKETFFT) include(external/pocketfft) list(APPEND third_party_deps extern_pocketfft) add_definitions(-DPADDLE_WITH_POCKETFFT) -endif(WITH_POCKETFFT) +endif() if(WIN32) include(external/dirent) list(APPEND third_party_deps extern_dirent) -endif(WIN32) +endif() if(WITH_INFRT) include(external/llvm) @@ -496,4 +496,9 @@ if(WITH_IPU) list(APPEND third_party_deps extern_poplar) endif() +if(WITH_CUSPARSELT) + include(external/cusparselt) # download, build, install cusparselt + list(APPEND third_party_deps extern_cusparselt) +endif() + add_custom_target(third_party ALL DEPENDS ${third_party_deps}) diff --git a/cmake/unity_build.cmake b/cmake/unity_build.cmake index e18b2ef1ee686..d1b97cf08f60c 100644 --- a/cmake/unity_build.cmake +++ b/cmake/unity_build.cmake @@ -67,7 +67,7 @@ function(register_unity_group TYPE) math(EXPR unity_group_index "${unity_group_index} + 1") set_property(GLOBAL PROPERTY ${UNITY_TARGET}_${TYPE}_group_index ${unity_group_index}) -endfunction(register_unity_group) +endfunction() # Combine the original source files used by `TARGET`, then use # `unity_target_${TYPE}_sources` to get the combined source files. @@ -145,7 +145,7 @@ function(compose_unity_target_sources TARGET TYPE) set(unity_target_${TYPE}_sources ${unity_target_sources} PARENT_SCOPE) -endfunction(compose_unity_target_sources) +endfunction() # Write the unity files used by `UNITY_TARGET`. # Write dependent on whether the contents of the unity file have changed, which @@ -179,4 +179,4 @@ function(finish_unity_target TYPE) endif() endforeach() endif() -endfunction(finish_unity_target) +endfunction() diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 304a764f5b87c..0b5f608122683 100755 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -13,7 +13,7 @@ if(WITH_PYTHON) POST_BUILD COMMAND mv the_one_ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/) - else(NOT WIN32) + else() string( REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/") @@ -24,7 +24,7 @@ if(WITH_PYTHON) COMMENT "Copy generated python the_one_ps_pb2 into directory ${fleet_proto_dstpath}." 
) - endif(NOT WIN32) + endif() endif() if(NOT WITH_PSCORE) diff --git a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc index 89466076b23d0..47e3476036d7e 100644 --- a/paddle/fluid/distributed/ps/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/ps/service/brpc_ps_client.cc @@ -197,7 +197,7 @@ int32_t BrpcPsClient::Initialize() { // 异步push 请求队列初始化 const auto &worker_param = _config.worker_param().downpour_worker_param(); - for (size_t i = 0; i < worker_param.downpour_table_param_size(); ++i) { + for (int i = 0; i < worker_param.downpour_table_param_size(); ++i) { auto type = worker_param.downpour_table_param(i).type(); auto table_id = worker_param.downpour_table_param(i).table_id(); if (type == PS_DENSE_TABLE) { @@ -662,7 +662,7 @@ std::future BrpcPsClient::PushSparseParam(size_t table_id, char *push_data_ptr = const_cast(push_data->data()); memcpy(push_data_ptr, kvs.data(), kv_size * sizeof(uint64_t)); push_data_ptr += kv_size * sizeof(uint64_t); - for (int i = 0; i < kv_size; ++i) { + for (size_t i = 0; i < kv_size; ++i) { memcpy(push_data_ptr, value_ptr[i], value_size); push_data_ptr += value_size; } @@ -882,7 +882,7 @@ std::future BrpcPsClient::PushSparseRawGradient( memcpy(push_data_ptr, kvs.data(), kv_size * sizeof(uint64_t)); push_data_ptr += kv_size * sizeof(uint64_t); - for (int i = 0; i < kv_size; ++i) { + for (size_t i = 0; i < kv_size; ++i) { memcpy(push_data_ptr, value_ptr[i], value_size); push_data_ptr += value_size; } @@ -1237,7 +1237,7 @@ std::future BrpcPsClient::PushSparseRawGradientPartial( char *push_data_ptr = const_cast(push_data->data()); memcpy(push_data_ptr, keys, num * sizeof(uint64_t)); push_data_ptr += num * sizeof(uint64_t); - for (int i = 0; i < num; ++i) { + for (uint32_t i = 0; i < num; ++i) { memcpy(push_data_ptr, update_values[i], value_size); push_data_ptr += value_size; } @@ -1257,7 +1257,7 @@ int32_t BrpcPsClient::RecvAndSaveTable(const uint64_t table_id, int64_t var_shape = 0; std::string table_class; const auto &worker_param = _config.worker_param().downpour_worker_param(); - for (size_t i = 0; i < worker_param.downpour_table_param_size(); ++i) { + for (int i = 0; i < worker_param.downpour_table_param_size(); ++i) { if (worker_param.downpour_table_param(i).table_id() == table_id) { var_name = worker_param.downpour_table_param(i).common().table_name(); var_num = worker_param.downpour_table_param(i).common().table_num(); @@ -1481,13 +1481,13 @@ void BrpcPsClient::PushSparseTaskConsume() { closure->add_timer(rpc_timer); std::vector> merge_status(request_call_num); - for (int shard_idx = 0; shard_idx < request_call_num; ++shard_idx) { + for (size_t shard_idx = 0; shard_idx < request_call_num; ++shard_idx) { merge_status[shard_idx] = async_push_sparse_shard_threads.enqueue(std::bind( &BrpcPsClient::PushSparseAsyncShardPush, this, task_list, request_kv_num, table_id, shard_idx, closure, accessor)); } - for (int shard_idx = 0; shard_idx < request_call_num; ++shard_idx) { + for (size_t shard_idx = 0; shard_idx < request_call_num; ++shard_idx) { merge_status[shard_idx].wait(); } merge_status.clear(); @@ -1497,13 +1497,13 @@ void BrpcPsClient::PushSparseTaskConsume() { auto queue_size = task_queue->Size(); } else { // 未达到阈值 只做多路归并 std::vector> merge_status(request_call_num); - for (int shard_idx = 0; shard_idx < request_call_num; ++shard_idx) { + for (size_t shard_idx = 0; shard_idx < request_call_num; ++shard_idx) { merge_status[shard_idx] = async_push_sparse_shard_threads.enqueue(std::bind( 
&BrpcPsClient::PushSparseAsyncShardMerge, this, task_list, request_kv_num, table_id, shard_idx, accessor)); } - for (int shard_idx = 0; shard_idx < request_call_num; ++shard_idx) { + for (size_t shard_idx = 0; shard_idx < request_call_num; ++shard_idx) { merge_status[shard_idx].wait(); } @@ -1529,7 +1529,7 @@ void sparse_local_merge(ValueAccessor *accessor, float *merge_data, size_t col_num = accessor->GetAccessorInfo().update_dim; float *merge_data_shell[col_num]; const float *another_data_shell[col_num]; - for (int i = 0; i < col_num; ++i) { + for (size_t i = 0; i < col_num; ++i) { merge_data_shell[i] = merge_data + i; another_data_shell[i] = another_data + i; } @@ -1546,12 +1546,12 @@ int BrpcPsClient::PushSparseAsyncShardMerge( thread_local std::vector> sorted_kv_list; sorted_kv_list.clear(); - for (int i = 1; i < task_list.size(); ++i) { + for (size_t i = 1; i < task_list.size(); ++i) { size_t kv_num = task_list[i]->data()->shared_data[shard_idx].kv_num; auto &key_list = task_list[i]->data()->shared_data[shard_idx].key_list; auto &value_list = task_list[i]->data()->shared_data[shard_idx].value_list; - for (int j = 0; j < kv_num; ++j) { + for (size_t j = 0; j < kv_num; ++j) { if (value_list[j].size() < value_size) { LOG(WARNING) << "value_list[" << j << "]: " << value_list[j].c_str() << "is invalid."; @@ -1654,7 +1654,7 @@ int BrpcPsClient::PushSparseAsyncShardPush( memcpy(push_data_ptr, merged_key_list.data(), merged_kv_count * sizeof(uint64_t)); push_data_ptr += merged_kv_count * sizeof(uint64_t); - for (int i = 0; i < merged_kv_count; ++i) { + for (size_t i = 0; i < merged_kv_count; ++i) { const char *task_data_ptr = merged_value_list[i].data(); memcpy(push_data_ptr, (float *)(task_data_ptr), // NOLINT @@ -1778,7 +1778,7 @@ void BrpcPsClient::PushDenseTaskConsume() { }); ++merge_count; } - for (int i = 0; i < merge_count; ++i) { + for (uint32_t i = 0; i < merge_count; ++i) { merge_status[i].wait(); } diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc index ff9680044dd6b..65b3cc9d0f892 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc @@ -60,7 +60,7 @@ std::future GraphBrpcClient::get_node_feat( std::vector> &res) { std::vector request2server; std::vector server2request(server_size, -1); - for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + for (size_t query_idx = 0; query_idx < node_ids.size(); ++query_idx) { int server_index = get_server_index_by_id(node_ids[query_idx]); if (server2request[server_index] == -1) { server2request[server_index] = request2server.size(); @@ -70,7 +70,7 @@ std::future GraphBrpcClient::get_node_feat( size_t request_call_num = request2server.size(); std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); - for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + for (size_t query_idx = 0; query_idx < node_ids.size(); ++query_idx) { int server_index = get_server_index_by_id(node_ids[query_idx]); int request_idx = server2request[server_index]; node_id_buckets[request_idx].push_back(node_ids[query_idx]); @@ -83,7 +83,7 @@ std::future GraphBrpcClient::get_node_feat( int ret = 0; auto *closure = (DownpourBrpcClosure *)done; size_t fail_num = 0; - for (int request_idx = 0; request_idx < request_call_num; + for (size_t request_idx = 0; request_idx < request_call_num; ++request_idx) { if (closure->check_response(request_idx, 
PS_GRAPH_GET_NODE_FEAT) != 0) { @@ -122,7 +122,7 @@ std::future GraphBrpcClient::get_node_feat( closure->add_promise(promise); std::future fut = promise->get_future(); - for (int request_idx = 0; request_idx < request_call_num; ++request_idx) { + for (size_t request_idx = 0; request_idx < request_call_num; ++request_idx) { int server_index = request2server[request_idx]; closure->request(request_idx)->set_cmd_id(PS_GRAPH_GET_NODE_FEAT); closure->request(request_idx)->set_table_id(table_id); @@ -271,7 +271,7 @@ std::future GraphBrpcClient::remove_graph_node( request_call_num, [&, request_call_num](void *done) { int ret = 0; auto *closure = (DownpourBrpcClosure *)done; - int fail_num = 0; + size_t fail_num = 0; for (size_t request_idx = 0; request_idx < request_call_num; ++request_idx) { if (closure->check_response(request_idx, @@ -378,7 +378,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( std::vector server2request(server_size, -1); res.clear(); res_weight.clear(); - for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + for (size_t query_idx = 0; query_idx < node_ids.size(); ++query_idx) { int server_index = get_server_index_by_id(node_ids[query_idx]); if (server2request[server_index] == -1) { server2request[server_index] = request2server.size(); @@ -393,7 +393,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( size_t request_call_num = request2server.size(); std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); - for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + for (size_t query_idx = 0; query_idx < node_ids.size(); ++query_idx) { int server_index = get_server_index_by_id(node_ids[query_idx]); int request_idx = server2request[server_index]; node_id_buckets[request_idx].push_back(node_ids[query_idx]); @@ -454,7 +454,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( closure->add_promise(promise); std::future fut = promise->get_future(); - for (int request_idx = 0; request_idx < request_call_num; ++request_idx) { + for (size_t request_idx = 0; request_idx < request_call_num; ++request_idx) { int server_index = request2server[request_idx]; closure->request(request_idx)->set_cmd_id(PS_GRAPH_SAMPLE_NEIGHBORS); closure->request(request_idx)->set_table_id(table_id); @@ -492,7 +492,7 @@ std::future GraphBrpcClient::random_sample_nodes( size_t bytes_size = io_buffer_itr.bytes_left(); char *buffer = new char[bytes_size]; auto size = io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); - int index = 0; + size_t index = 0; while (index < bytes_size) { ids.push_back(*(int64_t *)(buffer + index)); index += GraphNode::id_size; @@ -534,7 +534,7 @@ std::future GraphBrpcClient::pull_graph_list( size_t bytes_size = io_buffer_itr.bytes_left(); char *buffer = new char[bytes_size]; io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); - int index = 0; + size_t index = 0; while (index < bytes_size) { FeatureNode node; node.recover_from_buffer(buffer + index); @@ -570,7 +570,7 @@ std::future GraphBrpcClient::set_node_feat( const std::vector> &features) { std::vector request2server; std::vector server2request(server_size, -1); - for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + for (size_t query_idx = 0; query_idx < node_ids.size(); ++query_idx) { int server_index = get_server_index_by_id(node_ids[query_idx]); if (server2request[server_index] == -1) { server2request[server_index] = request2server.size(); @@ -582,7 +582,7 @@ std::future GraphBrpcClient::set_node_feat( 
std::vector> query_idx_buckets(request_call_num); std::vector>> features_idx_buckets( request_call_num); - for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + for (size_t query_idx = 0; query_idx < node_ids.size(); ++query_idx) { int server_index = get_server_index_by_id(node_ids[query_idx]); int request_idx = server2request[server_index]; node_id_buckets[request_idx].push_back(node_ids[query_idx]); @@ -590,7 +590,7 @@ std::future GraphBrpcClient::set_node_feat( if (features_idx_buckets[request_idx].size() == 0) { features_idx_buckets[request_idx].resize(feature_names.size()); } - for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + for (size_t feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { features_idx_buckets[request_idx][feat_idx].push_back( features[feat_idx][query_idx]); } @@ -602,7 +602,7 @@ std::future GraphBrpcClient::set_node_feat( int ret = 0; auto *closure = (DownpourBrpcClosure *)done; size_t fail_num = 0; - for (int request_idx = 0; request_idx < request_call_num; + for (size_t request_idx = 0; request_idx < request_call_num; ++request_idx) { if (closure->check_response(request_idx, PS_GRAPH_SET_NODE_FEAT) != 0) { @@ -619,7 +619,7 @@ std::future GraphBrpcClient::set_node_feat( closure->add_promise(promise); std::future fut = promise->get_future(); - for (int request_idx = 0; request_idx < request_call_num; ++request_idx) { + for (size_t request_idx = 0; request_idx < request_call_num; ++request_idx) { int server_index = request2server[request_idx]; closure->request(request_idx)->set_cmd_id(PS_GRAPH_SET_NODE_FEAT); closure->request(request_idx)->set_table_id(table_id); diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc index 5ce26b4525041..ce9397e511eb0 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc @@ -516,7 +516,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( std::vector local_id; std::vector local_query_idx; size_t rank = GetRank(); - for (int query_idx = 0; query_idx < node_num; ++query_idx) { + for (size_t query_idx = 0; query_idx < node_num; ++query_idx) { int server_index = ((GraphTable *)table)->get_server_index_by_id(node_data[query_idx]); if (server2request[server_index] == -1) { @@ -538,7 +538,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( std::vector seq; std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); - for (int query_idx = 0; query_idx < node_num; ++query_idx) { + for (size_t query_idx = 0; query_idx < node_num; ++query_idx) { int server_index = ((GraphTable *)table)->get_server_index_by_id(node_data[query_idx]); int request_idx = server2request[server_index]; @@ -614,7 +614,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( closure->add_promise(promise); std::future fut = promise->get_future(); - for (int request_idx = 0; request_idx < remote_call_num; ++request_idx) { + for (size_t request_idx = 0; request_idx < remote_call_num; ++request_idx) { int server_index = request2server[request_idx]; closure->request(request_idx)->set_cmd_id(PS_GRAPH_SAMPLE_NEIGHBORS); closure->request(request_idx)->set_table_id(request.table_id()); diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_accessor.cc index 254bbb96cad62..98c1baf6befaa 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.cc +++ 
b/paddle/fluid/distributed/ps/table/ctr_accessor.cc @@ -196,7 +196,7 @@ bool CtrCommonAccessor::NeedExtendMF(float* value) { return score >= _config.embedx_threshold(); } -bool CtrCommonAccessor::HasMF(size_t size) { +bool CtrCommonAccessor::HasMF(int size) { return size > common_feature_value.EmbedxG2SumIndex(); } @@ -227,11 +227,11 @@ int32_t CtrCommonAccessor::Merge(float** update_values, const float** other_update_values, size_t num) { auto embedx_dim = _config.embedx_dim(); - size_t total_dim = CtrCommonPushValue::Dim(embedx_dim); + int total_dim = CtrCommonPushValue::Dim(embedx_dim); for (size_t value_item = 0; value_item < num; ++value_item) { float* update_value = update_values[value_item]; const float* other_update_value = other_update_values[value_item]; - for (auto i = 0u; i < total_dim; ++i) { + for (int i = 0; i < total_dim; ++i) { if (i != CtrCommonPushValue::SlotIndex()) { update_value[i] += other_update_value[i]; } diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.h b/paddle/fluid/distributed/ps/table/ctr_accessor.h index 96ec5b8398d13..6bc3e53401ed8 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_accessor.h @@ -143,7 +143,7 @@ class CtrCommonAccessor : public ValueAccessor { // 判断该value是否保存到ssd // virtual bool save_ssd(float* value); virtual bool NeedExtendMF(float* value); - virtual bool HasMF(size_t size); + virtual bool HasMF(int size); // 判断该value是否在save阶段dump, // param作为参数用于标识save阶段,如downpour的xbox与batch_model // param = 0, save all feature diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc index 2bde5271a0c43..44c672eff61c0 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.cc @@ -139,7 +139,7 @@ bool CtrDoubleAccessor::Save(float* value, int param) { } default: return true; - }; + } } void CtrDoubleAccessor::UpdateStatAfterSave(float* value, int param) { @@ -166,7 +166,7 @@ void CtrDoubleAccessor::UpdateStatAfterSave(float* value, int param) { return; default: return; - }; + } } int32_t CtrDoubleAccessor::Create(float** values, size_t num) { @@ -175,7 +175,7 @@ int32_t CtrDoubleAccessor::Create(float** values, size_t num) { float* value = values[value_item]; value[CtrDoubleFeatureValue::UnseenDaysIndex()] = 0; value[CtrDoubleFeatureValue::DeltaScoreIndex()] = 0; - *(double*)(value + CtrDoubleFeatureValue::ShowIndex()) = 0; + *reinterpret_cast(value + CtrDoubleFeatureValue::ShowIndex()) = 0; *(double*)(value + CtrDoubleFeatureValue::ClickIndex()) = 0; value[CtrDoubleFeatureValue::SlotIndex()] = -1; _embed_sgd_rule->InitValue( @@ -233,7 +233,7 @@ int32_t CtrDoubleAccessor::Merge(float** update_values, for (auto i = 3u; i < total_dim; ++i) { update_value[i] += other_update_value[i]; }*/ - for (auto i = 0u; i < total_dim; ++i) { + for (size_t i = 0; i < total_dim; ++i) { if (i != CtrDoublePushValue::SlotIndex()) { update_value[i] += other_update_value[i]; } @@ -320,7 +320,7 @@ std::string CtrDoubleAccessor::ParseToString(const float* v, int param_size) { auto score = ShowClickScore(show, click); if (score >= _config.embedx_threshold() && param_size > 9) { os << " " << v[9]; - for (auto i = 0; i < _config.embedx_dim(); ++i) { + for (size_t i = 0; i < _config.embedx_dim(); ++i) { os << " " << v[10 + i]; } } diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc index 
6fb6675edde8d..a3b2c28842749 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc @@ -198,7 +198,7 @@ bool CtrDymfAccessor::NeedExtendMF(float* value) { return score >= _config.embedx_threshold(); } -bool CtrDymfAccessor::HasMF(size_t size) { +bool CtrDymfAccessor::HasMF(int size) { return size > common_feature_value.EmbedxG2SumIndex(); } diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h index c4bcd2bb3c98a..f2041e60a2c06 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h @@ -158,7 +158,7 @@ class CtrDymfAccessor : public ValueAccessor { // 判断该value是否保存到ssd // virtual bool save_ssd(float* value); virtual bool NeedExtendMF(float* value); - virtual bool HasMF(size_t size); + virtual bool HasMF(int size); // 判断该value是否在save阶段dump, // param作为参数用于标识save阶段,如downpour的xbox与batch_model // param = 0, save all feature diff --git a/paddle/fluid/distributed/ps/table/memory_dense_table.cc b/paddle/fluid/distributed/ps/table/memory_dense_table.cc index 58ec8503c8156..ab1361eba050f 100644 --- a/paddle/fluid/distributed/ps/table/memory_dense_table.cc +++ b/paddle/fluid/distributed/ps/table/memory_dense_table.cc @@ -41,7 +41,7 @@ void MemoryDenseTable::CreateInitializer(const std::string& attr, int32_t MemoryDenseTable::Initialize() { _shards_task_pool.resize(task_pool_size_); - for (int i = 0; i < _shards_task_pool.size(); ++i) { + for (size_t i = 0; i < _shards_task_pool.size(); ++i) { _shards_task_pool[i].reset(new ::ThreadPool(1)); } @@ -74,14 +74,14 @@ int32_t MemoryDenseTable::InitializeValue() { values_[x].resize(dim); names_index_[varname] = x; - for (int y = 0; y < dim; ++y) { + for (size_t y = 0; y < dim; ++y) { values_[x][y] = initializers_[varname]->GetValue(); } } fixed_len_params_dim_ = 0; for (int x = 0; x < size; ++x) { - auto& dim = common.dims()[x]; + int dim = common.dims()[x]; if (dim != param_dim_) { fixed_len_params_dim_ += dim; } else { @@ -245,14 +245,14 @@ int32_t MemoryDenseTable::Load(const std::string& path, do { is_read_failed = false; try { - size_t dim_idx = 0; + int dim_idx = 0; float data_buffer[5]; float* data_buff_ptr = data_buffer; std::string line_data; int size = static_cast(values_.size()); auto common = _config.common(); - for (int i = start_file_idx; i < end_file_idx + 1; ++i) { + for (size_t i = start_file_idx; i < end_file_idx + 1; ++i) { channel_config.path = file_list[i]; err_no = 0; auto read_channel = _afs_client.open_r(channel_config, 0, &err_no); @@ -271,12 +271,12 @@ int32_t MemoryDenseTable::Load(const std::string& path, if (file_dim_idx < file_start_idx) { continue; } - auto str_len = + size_t str_len = paddle::string::str_to_float(line_data.data(), data_buff_ptr); CHECK(str_len == param_col_ids_.size()) << "expect " << param_col_ids_.size() << " float, but got " << str_len; - for (size_t col_idx = 0; col_idx < str_len; ++col_idx) { + for (int col_idx = 0; col_idx < str_len; ++col_idx) { if (param_col_ids_[col_idx] < 0) { continue; } @@ -355,7 +355,7 @@ int32_t MemoryDenseTable::Save(const std::string& path, std::ostringstream os; for (int x = 0; x < size; ++x) { auto& varname = common.params()[x]; - auto& dim = common.dims()[x]; + int dim = common.dims()[x]; VLOG(3) << "MemoryDenseTable::save dim " << x << " size: " << dim; for (int y = 0; y < dim; ++y) { os.clear(); diff --git 
a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.cc index 1567d31d0f3ee..e839bf6c151df 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.cc +++ b/paddle/fluid/distributed/ps/table/memory_sparse_geo_table.cc @@ -49,7 +49,7 @@ int32_t MemorySparseGeoTable::PushSparseParam(const uint64_t* keys, std::vector> offset_bucket; offset_bucket.resize(shard_num); - for (int x = 0; x < num; ++x) { + for (size_t x = 0; x < num; ++x) { auto y = keys[x] % shard_num; offset_bucket[y].push_back(x); if (x < 10) { @@ -66,7 +66,7 @@ int32_t MemorySparseGeoTable::PushSparseParam(const uint64_t* keys, auto& local_shard = _local_shards[shard_id]; auto& offsets = offset_bucket[shard_id]; - for (int i = 0; i < offsets.size(); ++i) { + for (size_t i = 0; i < offsets.size(); ++i) { auto offset = offsets[i]; auto id = keys[offset]; auto& feature_value = local_shard[id]; @@ -132,7 +132,7 @@ int32_t MemorySparseGeoTable::Initialize() { _dim = _config.common().dims()[0]; _shards_task_pool.resize(_task_pool_size); - for (int i = 0; i < _shards_task_pool.size(); ++i) { + for (size_t i = 0; i < _shards_task_pool.size(); ++i) { _shards_task_pool[i].reset(new ::ThreadPool(1)); } @@ -200,14 +200,14 @@ int32_t MemorySparseGeoTable::_PushSparse(const uint64_t* keys, task_keys[shard_id].push_back({keys[i], i}); } - for (size_t shard_id = 0; shard_id < shard_num; ++shard_id) { + for (int shard_id = 0; shard_id < shard_num; ++shard_id) { tasks[shard_id] = _shards_task_pool[shard_id]->enqueue( [this, shard_id, values, &task_keys]() -> int { auto& keys = task_keys[shard_id]; auto& local_shard = _local_shards[shard_id]; auto blas = GetBlas(); - for (int i = 0; i < keys.size(); ++i) { + for (size_t i = 0; i < keys.size(); ++i) { uint64_t key = keys[i].first; uint64_t push_data_idx = keys[i].second; const float* update_data = values + push_data_idx * _dim; diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc index 464f788b454e8..171853f96672d 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc @@ -37,7 +37,7 @@ namespace distributed { int32_t MemorySparseTable::Initialize() { _shards_task_pool.resize(_task_pool_size); - for (int i = 0; i < _shards_task_pool.size(); ++i) { + for (size_t i = 0; i < _shards_task_pool.size(); ++i) { _shards_task_pool[i].reset(new ::ThreadPool(1)); } auto& profiler = CostProfiler::instance(); @@ -79,7 +79,7 @@ int32_t MemorySparseTable::Load(const std::string& path, } int load_param = atoi(param.c_str()); - auto expect_shard_num = _sparse_table_shard_num; + size_t expect_shard_num = _sparse_table_shard_num; if (file_list.size() != expect_shard_num) { LOG(WARNING) << "MemorySparseTable file_size:" << file_list.size() << " not equal to expect_shard_num:" << expect_shard_num; @@ -98,7 +98,7 @@ int32_t MemorySparseTable::Load(const std::string& path, int thread_num = _real_local_shard_num < 15 ? 
_real_local_shard_num : 15; omp_set_num_threads(thread_num); #pragma omp parallel for schedule(dynamic) - for (size_t i = 0; i < _real_local_shard_num; ++i) { + for (int i = 0; i < _real_local_shard_num; ++i) { FsChannelConfig channel_config; channel_config.path = file_list[file_start_idx + i]; VLOG(1) << "MemorySparseTable::load begin load " << channel_config.path @@ -164,7 +164,7 @@ int32_t MemorySparseTable::LoadLocalFS(const std::string& path, auto file_list = paddle::framework::localfs_list(table_path); int load_param = atoi(param.c_str()); - auto expect_shard_num = _sparse_table_shard_num; + size_t expect_shard_num = _sparse_table_shard_num; if (file_list.size() != expect_shard_num) { LOG(WARNING) << "MemorySparseTable file_size:" << file_list.size() << " not equal to expect_shard_num:" << expect_shard_num; @@ -183,7 +183,7 @@ int32_t MemorySparseTable::LoadLocalFS(const std::string& path, int thread_num = _real_local_shard_num < 15 ? _real_local_shard_num : 15; omp_set_num_threads(thread_num); #pragma omp parallel for schedule(dynamic) - for (size_t i = 0; i < _real_local_shard_num; ++i) { + for (int i = 0; i < _real_local_shard_num; ++i) { bool is_read_failed = false; int retry_num = 0; int err_no = 0; @@ -244,7 +244,7 @@ int32_t MemorySparseTable::Save(const std::string& dirname, int thread_num = _real_local_shard_num < 20 ? _real_local_shard_num : 20; omp_set_num_threads(thread_num); #pragma omp parallel for schedule(dynamic) - for (size_t i = 0; i < _real_local_shard_num; ++i) { + for (int i = 0; i < _real_local_shard_num; ++i) { FsChannelConfig channel_config; if (_config.compress_in_save() && (save_param == 0 || save_param == 3)) { channel_config.path = paddle::string::format_string( @@ -326,7 +326,7 @@ int32_t MemorySparseTable::SaveLocalFS(const std::string& dirname, omp_set_num_threads(thread_num); #pragma omp parallel for schedule(dynamic) - for (size_t i = 0; i < _real_local_shard_num; ++i) { + for (int i = 0; i < _real_local_shard_num; ++i) { feasign_cnt = 0; auto& shard = _local_shards[i]; std::string file_name = paddle::string::format_string( @@ -354,7 +354,7 @@ int32_t MemorySparseTable::SaveLocalFS(const std::string& dirname, int64_t MemorySparseTable::LocalSize() { int64_t local_size = 0; - for (size_t i = 0; i < _real_local_shard_num; ++i) { + for (int i = 0; i < _real_local_shard_num; ++i) { local_size += _local_shards[i].size(); } return local_size; @@ -364,7 +364,7 @@ int64_t MemorySparseTable::LocalMFSize() { std::vector size_arr(_real_local_shard_num, 0); std::vector> tasks(_real_local_shard_num); int64_t ret_size = 0; - for (size_t shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { + for (int shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { tasks[shard_id] = _shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( [this, shard_id, &size_arr]() -> int { @@ -378,7 +378,7 @@ int64_t MemorySparseTable::LocalMFSize() { return 0; }); } - for (size_t i = 0; i < _real_local_shard_num; ++i) { + for (int i = 0; i < _real_local_shard_num; ++i) { tasks[i].wait(); } for (auto x : size_arr) { @@ -469,7 +469,7 @@ int32_t MemorySparseTable::PullSparse(float* pull_values, memcpy(data_buffer_ptr, itr.value().data(), data_size * sizeof(float)); } - for (int mf_idx = data_size; mf_idx < value_size; ++mf_idx) { + for (size_t mf_idx = data_size; mf_idx < value_size; ++mf_idx) { data_buffer[mf_idx] = 0.0; } auto offset = keys[i].second; @@ -503,7 +503,7 @@ int32_t MemorySparseTable::PullSparsePtr(char** pull_values, 
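+  // NOTE: loop indices in this function use int when they are bounded by int
+  // counters (e.g. _real_local_shard_num) and size_t when they are bounded by
+  // container sizes (e.g. keys.size()); the intent is presumably to avoid
+  // signed/unsigned comparison warnings rather than to change behaviour.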
task_keys[shard_id].push_back({keys[i], i}); } // std::atomic missed_keys{0}; - for (size_t shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { + for (int shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { tasks[shard_id] = _shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( [this, shard_id, &task_keys, pull_values, value_size, @@ -512,7 +512,7 @@ int32_t MemorySparseTable::PullSparsePtr(char** pull_values, auto& local_shard = _local_shards[shard_id]; float data_buffer[value_size]; float* data_buffer_ptr = data_buffer; - for (int i = 0; i < keys.size(); ++i) { + for (size_t i = 0; i < keys.size(); ++i) { uint64_t key = keys[i].first; auto itr = local_shard.find(key); size_t data_size = value_size - mf_value_size; @@ -558,7 +558,7 @@ int32_t MemorySparseTable::PushSparse(const uint64_t* keys, const float* values, size_t update_value_col = _value_accesor->GetAccessorInfo().update_size / sizeof(float); - for (size_t shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { + for (int shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { tasks[shard_id] = _shards_task_pool[shard_id % _task_pool_size]->enqueue( [this, shard_id, value_col, mf_value_col, update_value_col, values, &task_keys]() -> int { @@ -566,7 +566,7 @@ int32_t MemorySparseTable::PushSparse(const uint64_t* keys, const float* values, auto& local_shard = _local_shards[shard_id]; float data_buffer[value_col]; // NOLINT float* data_buffer_ptr = data_buffer; - for (int i = 0; i < keys.size(); ++i) { + for (size_t i = 0; i < keys.size(); ++i) { uint64_t key = keys[i].first; uint64_t push_data_idx = keys[i].second; const float* update_data = @@ -639,7 +639,7 @@ int32_t MemorySparseTable::PushSparse(const uint64_t* keys, auto& local_shard = _local_shards[shard_id]; float data_buffer[value_col]; // NOLINT float* data_buffer_ptr = data_buffer; - for (int i = 0; i < keys.size(); ++i) { + for (size_t i = 0; i < keys.size(); ++i) { uint64_t key = keys[i].first; uint64_t push_data_idx = keys[i].second; const float* update_data = values[push_data_idx]; diff --git a/paddle/fluid/distributed/ps/table/sparse_accessor.cc b/paddle/fluid/distributed/ps/table/sparse_accessor.cc index 772ff5d1fc5cc..081a77cedf792 100644 --- a/paddle/fluid/distributed/ps/table/sparse_accessor.cc +++ b/paddle/fluid/distributed/ps/table/sparse_accessor.cc @@ -171,7 +171,7 @@ bool SparseAccessor::NeedExtendMF(float* value) { return score >= _config.embedx_threshold(); } -bool SparseAccessor::HasMF(size_t size) { +bool SparseAccessor::HasMF(int size) { return size > sparse_feature_value.EmbedxG2SumIndex(); } @@ -201,7 +201,7 @@ int32_t SparseAccessor::Merge(float** update_values, for (size_t value_item = 0; value_item < num; ++value_item) { float* update_value = update_values[value_item]; const float* other_update_value = other_update_values[value_item]; - for (auto i = 0u; i < total_dim; ++i) { + for (size_t i = 0; i < total_dim; ++i) { if (i != SparsePushValue::SlotIndex()) { update_value[i] += other_update_value[i]; } diff --git a/paddle/fluid/distributed/ps/table/sparse_accessor.h b/paddle/fluid/distributed/ps/table/sparse_accessor.h index 5e76365901c27..dd5d3f3425aee 100644 --- a/paddle/fluid/distributed/ps/table/sparse_accessor.h +++ b/paddle/fluid/distributed/ps/table/sparse_accessor.h @@ -130,7 +130,7 @@ class SparseAccessor : public ValueAccessor { // 判断该value是否保存到ssd // virtual bool save_ssd(float* value); virtual bool NeedExtendMF(float* value); - virtual bool HasMF(size_t size); + virtual bool HasMF(int size); // 
判断该value是否在save阶段dump, // param作为参数用于标识save阶段,如downpour的xbox与batch_model // param = 0, save all feature diff --git a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc index a9a4c9beae22c..ff9acef121a4a 100644 --- a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc +++ b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc @@ -90,7 +90,7 @@ void SparseAdaGradSGDRule::UpdateValueWork(float* w, float* sgd, float& g2sum = sgd[G2SumIndex()]; double add_g2sum = 0; - for (int i = 0; i < _embedding_dim; i++) { + for (size_t i = 0; i < _embedding_dim; i++) { double scaled_grad = grad[i] / scale; w[i] -= learning_rate_ * scaled_grad * sqrt(_initial_g2sum / (_initial_g2sum + g2sum)); @@ -103,7 +103,7 @@ void SparseAdaGradSGDRule::UpdateValueWork(float* w, float* sgd, void SparseAdaGradSGDRule::InitValueWork(float* value, float* sgd, bool zero_init) { - for (int i = 0; i < _embedding_dim; ++i) { + for (size_t i = 0; i < _embedding_dim; ++i) { if (zero_init) { value[i] = 0.0; BoundValue(value[i]); @@ -141,7 +141,7 @@ void StdAdaGradSGDRule::LoadConfig(const SparseCommonSGDRuleParameter& param, void StdAdaGradSGDRule::UpdateValueWork(float* w, float* sgd, const float* grad, float scale) { - for (int i = 0; i < _embedding_dim; i++) { + for (size_t i = 0; i < _embedding_dim; i++) { float& g2sum = sgd[G2SumIndex() + i]; double scaled_grad = grad[i] / scale; w[i] -= learning_rate_ * scaled_grad * @@ -153,7 +153,7 @@ void StdAdaGradSGDRule::UpdateValueWork(float* w, float* sgd, const float* grad, void StdAdaGradSGDRule::InitValueWork(float* value, float* sgd, bool zero_init) { - for (int i = 0; i < _embedding_dim; ++i) { + for (size_t i = 0; i < _embedding_dim; ++i) { if (zero_init) { value[i] = 0.0; BoundValue(value[i]); @@ -204,7 +204,7 @@ void SparseAdamSGDRule::UpdateValueWork(float* w, float* sgd, const float* grad, // lr not change in one update lr *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_); - for (int i = 0; i < _embedding_dim; i++) { + for (size_t i = 0; i < _embedding_dim; i++) { // Calculation gsum[i] = _beta1_decay_rate * gsum[i] + (1 - _beta1_decay_rate) * g[i]; g2sum[i] = @@ -219,7 +219,7 @@ void SparseAdamSGDRule::UpdateValueWork(float* w, float* sgd, const float* grad, void SparseAdamSGDRule::InitValueWork(float* value, float* sgd, bool zero_init) { - for (int i = 0; i < _embedding_dim; ++i) { + for (size_t i = 0; i < _embedding_dim; ++i) { if (zero_init) { value[i] = 0.0; BoundValue(value[i]); @@ -233,7 +233,7 @@ void SparseAdamSGDRule::InitValueWork(float* value, float* sgd, } } // init rule gsum and g2sum - for (int i = GSumIndex(); i < Beta1PowIndex(); i++) { + for (size_t i = GSumIndex(); i < Beta1PowIndex(); i++) { sgd[i] = 0.0; } // init beta1_pow and beta2_pow diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc index 7e1128baa0cd6..237d0c9424b81 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc @@ -58,7 +58,7 @@ int32_t SSDSparseTable::PullSparse(float* pull_values, const uint64_t* keys, } std::atomic missed_keys{0}; - for (size_t shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { + for (int shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { tasks[shard_id] = _shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( [this, shard_id, &task_keys, value_size, mf_value_size, @@ -67,7 +67,7 @@ int32_t SSDSparseTable::PullSparse(float* pull_values, const 
uint64_t* keys, auto& local_shard = _local_shards[shard_id]; float data_buffer[value_size]; float* data_buffer_ptr = data_buffer; - for (int i = 0; i < keys.size(); ++i) { + for (size_t i = 0; i < keys.size(); ++i) { uint64_t key = keys[i].first; auto itr = local_shard.find(key); size_t data_size = value_size - mf_value_size; @@ -105,7 +105,8 @@ int32_t SSDSparseTable::PullSparse(float* pull_values, const uint64_t* keys, memcpy(data_buffer_ptr, itr.value().data(), data_size * sizeof(float)); } - for (int mf_idx = data_size; mf_idx < value_size; ++mf_idx) { + for (size_t mf_idx = data_size; mf_idx < value_size; + ++mf_idx) { data_buffer[mf_idx] = 0.0; } int pull_data_idx = keys[i].second; @@ -117,7 +118,7 @@ int32_t SSDSparseTable::PullSparse(float* pull_values, const uint64_t* keys, return 0; }); } - for (size_t i = 0; i < _real_local_shard_num; ++i) { + for (int i = 0; i < _real_local_shard_num; ++i) { tasks[i].wait(); } if (FLAGS_pserver_print_missed_key_num_every_push) { @@ -145,7 +146,7 @@ int32_t SSDSparseTable::PushSparse(const uint64_t* keys, const float* values, int shard_id = (keys[i] % _sparse_table_shard_num) % _avg_local_shard_num; task_keys[shard_id].push_back({keys[i], i}); } - for (size_t shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { + for (int shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { tasks[shard_id] = _shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( [this, shard_id, value_col, mf_value_col, update_value_col, @@ -154,7 +155,7 @@ int32_t SSDSparseTable::PushSparse(const uint64_t* keys, const float* values, auto& local_shard = _local_shards[shard_id]; float data_buffer[value_col]; float* data_buffer_ptr = data_buffer; - for (int i = 0; i < keys.size(); ++i) { + for (size_t i = 0; i < keys.size(); ++i) { uint64_t key = keys[i].first; uint64_t push_data_idx = keys[i].second; const float* update_data = @@ -196,7 +197,7 @@ int32_t SSDSparseTable::PushSparse(const uint64_t* keys, const float* values, return 0; }); } - for (size_t i = 0; i < _real_local_shard_num; ++i) { + for (int i = 0; i < _real_local_shard_num; ++i) { tasks[i].wait(); } } @@ -228,7 +229,7 @@ int32_t SSDSparseTable::Shrink(const std::string& param) { int thread_num = _real_local_shard_num < 20 ? 
_real_local_shard_num : 20; omp_set_num_threads(thread_num); #pragma omp parallel for schedule(dynamic) - for (size_t i = 0; i < _real_local_shard_num; ++i) { + for (int i = 0; i < _real_local_shard_num; ++i) { uint64_t mem_count = 0; uint64_t ssd_count = 0; @@ -264,7 +265,7 @@ int32_t SSDSparseTable::Shrink(const std::string& param) { int32_t SSDSparseTable::UpdateTable() { // TODO implement with multi-thread int count = 0; - for (size_t i = 0; i < _real_local_shard_num; ++i) { + for (int i = 0; i < _real_local_shard_num; ++i) { auto& shard = _local_shards[i]; // from mem to ssd for (auto it = shard.begin(); it != shard.end();) { @@ -285,7 +286,7 @@ int32_t SSDSparseTable::UpdateTable() { int64_t SSDSparseTable::LocalSize() { int64_t local_size = 0; - for (size_t i = 0; i < _real_local_shard_num; ++i) { + for (int i = 0; i < _real_local_shard_num; ++i) { local_size += _local_shards[i].size(); } // TODO rocksdb size @@ -328,7 +329,7 @@ int32_t SSDSparseTable::Save(const std::string& path, omp_set_num_threads(thread_num); #pragma omp parallel for schedule(dynamic) - for (size_t i = 0; i < _real_local_shard_num; ++i) { + for (int i = 0; i < _real_local_shard_num; ++i) { FsChannelConfig channel_config; if (_config.compress_in_save() && (save_param == 0 || save_param == 3)) { channel_config.path = paddle::string::format_string( @@ -484,14 +485,14 @@ int64_t SSDSparseTable::CacheShuffle( int feasign_size = 0; std::vector>> tmp_channels; - for (size_t i = 0; i < _real_local_shard_num; ++i) { + for (int i = 0; i < _real_local_shard_num; ++i) { tmp_channels.push_back( paddle::framework::MakeChannel>()); } omp_set_num_threads(thread_num); #pragma omp parallel for schedule(dynamic) - for (size_t i = 0; i < _real_local_shard_num; ++i) { + for (int i = 0; i < _real_local_shard_num; ++i) { paddle::framework::ChannelWriter>& writer = writers[i]; // std::shared_ptr> local_datas; - for (size_t idx_shard = 0; idx_shard < _real_local_shard_num; ++idx_shard) { + for (int idx_shard = 0; idx_shard < _real_local_shard_num; ++idx_shard) { paddle::framework::ChannelWriter>& writer = writers[idx_shard]; auto channel = writer.channel(); @@ -543,8 +544,8 @@ int64_t SSDSparseTable::CacheShuffle( send_index[i] = i; } std::random_shuffle(send_index.begin(), send_index.end()); - for (auto index = 0u; index < shuffle_node_num; ++index) { - int i = send_index[index]; + for (int index = 0; index < shuffle_node_num; ++index) { + size_t i = send_index[index]; if (i == _shard_idx) { continue; } @@ -624,7 +625,7 @@ int32_t SSDSparseTable::Load(const std::string& path, } //加载path目录下数据[start_idx, end_idx) -int32_t SSDSparseTable::Load(size_t start_idx, size_t end_idx, +int32_t SSDSparseTable::Load(size_t start_idx, int end_idx, const std::vector& file_list, const std::string& param) { if (start_idx >= file_list.size()) { @@ -688,7 +689,7 @@ int32_t SSDSparseTable::Load(size_t start_idx, size_t end_idx, continue; } } - int value_size = + size_t value_size = _value_accesor->ParseFromString(++end, data_buffer_ptr); // ssd or mem if (_value_accesor->SaveSSD(data_buffer_ptr)) { diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h index 2a43a27c229d1..e6be77a4ba924 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h @@ -55,7 +55,7 @@ class SSDSparseTable : public MemorySparseTable { int32_t Flush() override { return 0; } virtual int32_t Shrink(const std::string& param) override; virtual void 
Clear() override { - for (size_t i = 0; i < _real_local_shard_num; ++i) { + for (int i = 0; i < _real_local_shard_num; ++i) { _local_shards[i].clear(); } } @@ -79,7 +79,7 @@ class SSDSparseTable : public MemorySparseTable { virtual int32_t Load(const std::string& path, const std::string& param) override; //加载path目录下数据[start_idx, end_idx) - virtual int32_t Load(size_t start_idx, size_t end_idx, + virtual int32_t Load(size_t start_idx, int end_idx, const std::vector& file_list, const std::string& param); int64_t LocalSize(); diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc index b9754d7b9debb..8d6276733e0e5 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.cc +++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc @@ -536,8 +536,8 @@ void FleetWrapper::PushSparseFromTensorAsync( output_len = 0; if (tensor->lod().size() > 0) { - for (size_t i = 0; i < tensor->lod()[0].size() - 1; ++i) { - for (int j = tensor->lod()[0][i]; j < tensor->lod()[0][i + 1]; + for (int i = 0; i < tensor->lod()[0].size() - 1; ++i) { + for (size_t j = tensor->lod()[0][i]; j < tensor->lod()[0][i + 1]; ++j, output_len += fea_dim) { uint64_t real_id = static_cast(ids[j]); if (real_id == padding_id) { @@ -566,7 +566,7 @@ void FleetWrapper::PushSparseFromTensorAsync( } } } else { - for (size_t i = 0; i < len; ++i, output_len += fea_dim) { + for (int i = 0; i < len; ++i, output_len += fea_dim) { uint64_t real_id = static_cast(ids[i]); if (real_id == padding_id) { continue; diff --git a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc index c1467dae9a7e2..b87f308aa6bee 100644 --- a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc +++ b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc @@ -222,7 +222,7 @@ void RunBrpcPushDense() { worker_ptr_->PullDense(temp_region.data(), temp_region.size(), 0); pull_status.wait(); - for (size_t idx = 0; idx < tensor->numel(); ++idx) { + for (int64_t idx = 0; idx < tensor->numel(); ++idx) { EXPECT_FLOAT_EQ(temp[idx], 1.0); } @@ -236,7 +236,7 @@ void RunBrpcPushDense() { pull_status = worker_ptr_->PullDense(regions.data(), regions.size(), 0); pull_status.wait(); - for (size_t idx = 0; idx < tensor->numel(); ++idx) { + for (int64_t idx = 0; idx < tensor->numel(); ++idx) { EXPECT_FLOAT_EQ(w[idx], float(idx)); } @@ -265,7 +265,7 @@ void RunBrpcPushDense() { worker_ptr_->PullDense(regions.data(), regions.size(), 0); pull_update_status.wait(); - for (size_t idx = 0; idx < tensor->numel(); ++idx) { + for (int64_t idx = 0; idx < tensor->numel(); ++idx) { EXPECT_FLOAT_EQ(w[idx], float(idx) - 1.0); } diff --git a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc index 3a9a8d0b39ccd..f050abe2e8725 100644 --- a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc +++ b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc @@ -89,25 +89,25 @@ TEST(downpour_sparse_adagrad_test, test_init_and_update) { rule.InitValue(w, w + 10, true); - for (auto i = 0u; i < kEmbSize; ++i) { + for (int i = 0; i < kEmbSize; ++i) { ASSERT_FLOAT_EQ(w[i], 0); } ASSERT_FLOAT_EQ(w[kEmbSize], 0); // check init_value for random rule.InitValue(w, w + 10, false); - for (auto i = 0u; i < kEmbSize; ++i) { + for (int i = 0; i < kEmbSize; ++i) { ASSERT_TRUE(w[i] >= rule.MinBound() && w[i] <= rule.MaxBound()); } ASSERT_FLOAT_EQ(w[kEmbSize], 0); // check update_value for one field - for (auto i = 0u; i < kEmbSize; ++i) { + for 
(int i = 0; i < kEmbSize; ++i) { w[i] = 0; } w[kEmbSize] = 0; float grad[kEmbSize]; - for (auto i = 0u; i < kEmbSize; ++i) { + for (int i = 0; i < kEmbSize; ++i) { grad[i] = (i + 1) * 1.0; } @@ -185,7 +185,7 @@ TEST(downpour_sparse_adam_test, test_init_and_update) { rule.UpdateValue(value, value + embed_dim, grad); - for (auto i = 0u; i < value_dim; ++i) { // check update + for (int i = 0; i < value_dim; ++i) { // check update ASSERT_FLOAT_EQ(value[i], label[i]) << "i is " << i; } } diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 09db68399f332..0cfe8942ae0f9 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/platform/errors.h" #include "paddle/phi/api/all.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" namespace egr { @@ -49,6 +50,22 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, paddle::imperative::SelectedRowsAddTensor(*tensor, t, &new_buffer); tensor->set_impl(new_buffer.impl()); } + } else if (LIKELY(t.is_sparse_coo_tensor())) { + // In fact, the gradient of SparseTensor is still a SparseTensor + if (LIKELY(tensor->is_sparse_coo_tensor())) { + auto t_sparse = + std::dynamic_pointer_cast(t.impl()); + paddle::experimental::Tensor t_values( + std::make_shared( + t_sparse->non_zero_elements())); + auto tensor_sparse = + std::dynamic_pointer_cast(tensor->impl()); + paddle::experimental::Tensor tensor_values( + std::make_shared( + tensor_sparse->non_zero_elements())); + paddle::imperative::TensorAdd( + t_values, &tensor_values); + } } else { // TODO(jiabin): Support Other TensorBase later // TODO(zhanlve): Replace SelectedRowsAddTensor with diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h index 6a6a443f69333..44ea47a257632 100644 --- a/paddle/fluid/eager/api/utils/global_utils.h +++ b/paddle/fluid/eager/api/utils/global_utils.h @@ -57,7 +57,7 @@ class Controller { } bool HasGrad() const { return tracer_->HasGrad(); } void SetHasGrad(bool has_grad) { tracer_->SetHasGrad(has_grad); } - std::string GenerateUniqueName(std::string key = "eager_tmp") { + std::string GenerateUniqueName(std::string key = "eager_in_tmp") { return tracer_->GenerateUniqueName(key); } const std::shared_ptr& GetCurrentTracer() { diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt index aff7f057f4601..ecfb40e947f91 100644 --- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt @@ -73,7 +73,7 @@ if(WIN32) ${eager_generator_path} DEPENDS mklml) list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/libiomp5md.dll) - else(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS) + else() message("Copied openblas.dll for Eager AutoCodeGen") add_custom_command( OUTPUT ${eager_generator_path}/openblas.dll diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 6abf759cdba7a..af2fcee6084f1 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/imperative/gradient_accumulator.h" +#include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/kernels/funcs/math_function.h" 
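Both the accumulation-node hunk above and the grad-tensor-holder hunk below handle a sparse COO gradient by extracting non_zero_elements() from each tensor and adding the two value buffers with TensorAdd, which only works out when the two gradients share the same sparsity pattern. A minimal standalone sketch of that idea, using an illustrative SparseCoo struct rather than the real phi::SparseCooTensor API:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative stand-in for a COO tensor: non-zero coordinates plus values.
struct SparseCoo {
  std::vector<int64_t> indices;  // flattened non-zero coordinates
  std::vector<float> values;     // what non_zero_elements() exposes
};

// Accumulate t into buffer, mirroring the values-only add performed above.
void AccumulateCooGrad(const SparseCoo& t, SparseCoo* buffer) {
  // Only valid when both gradients have identical non-zero positions.
  assert(t.indices == buffer->indices);
  assert(t.values.size() == buffer->values.size());
  for (size_t i = 0; i < t.values.size(); ++i) {
    buffer->values[i] += t.values[i];
  }
}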
namespace egr { @@ -130,6 +131,25 @@ void GradTensorHolder::add(size_t slot_id, size_t rank, &new_buffer); buffer_tensor.set_impl(new_buffer.impl()); } + } else if (t.is_sparse_coo_tensor()) { + auto t_sparse = std::dynamic_pointer_cast(t.impl()); + paddle::experimental::Tensor t_values( + std::make_shared(t_sparse->non_zero_elements())); + // In fact, the gradient of SparseTensor is still a SparseTensor + if (buffer_tensor.is_sparse_coo_tensor()) { + auto buffer_sparse = std::dynamic_pointer_cast( + buffer_tensor.impl()); + paddle::experimental::Tensor buffer_values( + std::make_shared( + buffer_sparse->non_zero_elements())); + if (create_graph) { + buffer_values = + add_final_state_dygraph_function(t_values, buffer_values); + } else { + paddle::imperative::TensorAdd( + t_values, &buffer_values); + } + } } else { // TODO(jiabin): Support Other TensorBase later // TODO(zhanlve): Replace SelectedRowsAddTensor with add_dygraph_function diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 5402beb49e69d..9fef2394c06a3 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -57,7 +57,7 @@ foreach(OP_DEF_FILE ${OP_DEF_FILES}) get_filename_component(OP_NAME ${OP_DEF_FILE} NAME_WE) file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt "{\"${OP_NAME}\",R\"(${OP_DEF_CONTENT})\"},\n") -endforeach(OP_DEF_FILE) +endforeach() file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt "{\"\",\"\"}};\n}") proto_library(heter_service_proto SRCS heter_service.proto) @@ -91,7 +91,7 @@ if(WITH_GPU) tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler dense_tensor) - endif(WIN32) + endif() elseif(WITH_ROCM) hip_library( tensor @@ -284,7 +284,7 @@ if(WITH_GPU) data_type_transform SRCS data_type_transform.cu DEPS tensor) - endif(WIN32) + endif() nv_test( data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu @@ -575,7 +575,7 @@ if(WITH_PYTHON) COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto." ) - else(NOT WIN32) + else() string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/") string( @@ -592,7 +592,7 @@ if(WITH_PYTHON) COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - endif(NOT WIN32) + endif() endif() if(WITH_PSCORE) @@ -606,7 +606,7 @@ if(WITH_PSCORE) COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto." 
) -endif(WITH_PSCORE) +endif() cc_library( lod_rank_table @@ -651,7 +651,7 @@ else() feed_fetch_method graph_to_program_pass variable_helper) -endif(TENSORRT_FOUND) +endif() cc_library( executor_gc_helper diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 8b5c3c1798780..ca1ebe18b44d2 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -39,9 +39,11 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( local_exec_scopes_(local_exec_scopes), places_(places), graph_(graph), - fetch_ctxs_(places), // add one more thread for generate op_deps prepare_pool_(1) { + platform::EmplaceDeviceContexts( + &fetch_ctxs_, places, + /*disable_setting_default_stream_for_allocator=*/true); if (ir::IsTopologySortOperationsUnique(*graph_)) { VLOG(10) << "Change thread number to 1 because the toposort order is unique"; @@ -144,7 +146,7 @@ FetchResultType FastThreadedSSAGraphExecutor::Run( ClearFetchOp(graph_, &fetch_ops); for (auto &place : places_) { - fetch_ctxs_.Get(place)->Wait(); + fetch_ctxs_[place].get().get()->Wait(); } } @@ -195,7 +197,7 @@ void FastThreadedSSAGraphExecutor::InsertFetchOps( fetch_ops->emplace_back(op); for (auto &p : places_) { - op->SetDeviceContext(p, fetch_ctxs_.Get(p)); + op->SetDeviceContext(p, fetch_ctxs_[p].get().get()); } for (auto *var : vars) { diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index 19b0061571596..f535a888b4e36 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -54,7 +54,8 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { std::unordered_map op_deps_; std::vector bootstrap_ops_; - platform::DeviceContextPool fetch_ctxs_; + std::map>> + fetch_ctxs_; std::atomic remaining_; std::future< diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 39683c9a0d868..ef9b309c8d80f 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -32,11 +32,14 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( local_scopes_(local_scopes), local_exec_scopes_(local_exec_scopes), places_(places), - fetch_ctxs_(places), strategy_(strategy), prepare_pool_(1), pool_(strategy.num_threads_ >= 2 ? 
new ::ThreadPool(strategy.num_threads_) : nullptr) { + platform::EmplaceDeviceContexts( + &fetch_ctxs_, places, + /*disable_setting_default_stream_for_allocator=*/true); + if (strategy_.num_iteration_per_run_ > 1) { int read_op_num = 0; for (auto *node : graph_->Nodes()) { @@ -207,7 +210,7 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( fetch_ops->emplace_back(op); for (auto &p : places_) { - op->SetDeviceContext(p, fetch_ctxs_.Get(p)); + op->SetDeviceContext(p, fetch_ctxs_[p].get().get()); } for (auto *var : vars) { diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 45fa3adbf1408..c9a2a7eccdcde 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -77,7 +77,9 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { std::vector local_exec_scopes_; std::vector places_; - platform::DeviceContextPool fetch_ctxs_; + std::map>> + fetch_ctxs_; + ExceptionHolder exception_holder_; std::unique_ptr op_deps_; std::future> op_deps_futures_; diff --git a/paddle/fluid/framework/io/CMakeLists.txt b/paddle/fluid/framework/io/CMakeLists.txt index 0033e825172bb..4d21c6a892349 100644 --- a/paddle/fluid/framework/io/CMakeLists.txt +++ b/paddle/fluid/framework/io/CMakeLists.txt @@ -13,4 +13,4 @@ cc_test( DEPS fs shell) if(WITH_CRYPTO) add_subdirectory(crypto) -endif(WITH_CRYPTO) +endif() diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 374b5490d5da1..ce522917090d8 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -84,7 +84,7 @@ cc_library( set(GRAPH_PATTERN_DETECTOR_DEPS graph graph_helper graph_traits) if(WITH_TESTING) set(GRAPH_PATTERN_DETECTOR_DEPS ${GRAPH_PATTERN_DETECTOR_DEPS} gtest) -endif(WITH_TESTING) +endif() cc_library( graph_pattern_detector SRCS graph_pattern_detector.cc @@ -156,6 +156,8 @@ pass_library(add_support_int8_pass inference) pass_library(matmul_scale_fuse_pass inference) pass_library(gpu_cpu_map_matmul_to_mul_pass inference) pass_library(mixed_precision_configure_pass inference) +pass_library(dense_fc_to_sparse_pass inference) +pass_library(dense_multihead_matmul_to_sparse_pass inference) pass_library(generate_pass DEPS pass_desc_proto) target_link_libraries(generate_pass pass_desc_proto) @@ -379,6 +381,14 @@ if(NOT WIN32) test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass) + cc_test( + test_dense_fc_to_sparse_pass_cc + SRCS dense_fc_to_sparse_pass_tester.cc + DEPS fc_fuse_pass dense_fc_to_sparse_pass framework_proto) + cc_test( + test_dense_multihead_matmul_to_sparse_pass + SRCS dense_multihead_matmul_to_sparse_pass_tester.cc + DEPS multihead_matmul_fuse_pass dense_multihead_matmul_to_sparse_pass) endif() if(WITH_MKLDNN) cc_test( diff --git a/paddle/fluid/framework/ir/dense_fc_to_sparse_pass.cc b/paddle/fluid/framework/ir/dense_fc_to_sparse_pass.cc new file mode 100644 index 0000000000000..f1a8d63c72241 --- /dev/null +++ b/paddle/fluid/framework/ir/dense_fc_to_sparse_pass.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/dense_fc_to_sparse_pass.h" + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +PDNode *patterns::DenseFC::operator()() { + auto *fc = pattern->NewNode(fc_repr())->assert_is_op("fc"); + // Input + auto *fc_input = pattern->NewNode(fc_input_repr()) + ->AsInput() + ->assert_is_op_input("fc", "Input"); + // Filter + auto *fc_weights = pattern->NewNode(fc_weights_repr()) + ->AsInput() + ->assert_is_op_input("fc", "W"); + // Bias + auto *fc_bias = pattern->NewNode(fc_bias_repr()) + ->AsInput() + ->assert_is_op_input("fc", "Bias"); + // Output + auto *fc_out = pattern->NewNode(fc_out_repr()) + ->AsOutput() + ->assert_is_op_output("fc", "Out") + ->assert_is_only_output_of_op("fc"); + + fc->LinksFrom({fc_input, fc_weights, fc_bias}).LinksTo({fc_out}); + + return fc_out; +} +} // namespace patterns + +DenseFCToSparsePass::DenseFCToSparsePass() { + AddOpCompat(OpCompat("fc")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("W") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); +} + +void DenseFCToSparsePass::ApplyImpl(Graph *graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + + std::string name_scope = "dense_fc_to_sparse_pass"; + FusePassBase::Init(name_scope, graph); + GraphPatternDetector gpd; + + patterns::DenseFC dense_fc_pattern(gpd.mutable_pattern(), + "dense_fc_replace_pass"); + dense_fc_pattern(); + int found_dense_fc_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "Replace dense fc with sparse_fc."; + + /* if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + }*/ + + GET_IR_NODE_FROM_SUBGRAPH(fc_out, fc_out, dense_fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc, fc, dense_fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_input, fc_input, dense_fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_weights, fc_weights, dense_fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_bias, fc_bias, dense_fc_pattern); + + auto *fc_op = fc->Op(); + auto w_name = fc_op->Input("W")[0]; + // recognize sparse op by name + if (w_name.find("sparse_2_4") != w_name.npos) { + // fake op + OpDesc desc(fc_op->Block()); + desc.SetType("sparse_fc"); + desc.SetInput("Input", {fc_input->Name()}); + desc.SetInput("W", {fc_weights->Name()}); + desc.SetInput("Bias", {fc_bias->Name()}); + desc.SetOutput("Out", {fc_out->Name()}); + + // copy all attr + if (fc_op->HasAttr("x_num_col_dims")) { + desc.SetAttr("x_num_col_dims", fc_op->GetAttr("x_num_col_dims")); + } + if (fc_op->HasAttr("in_num_col_dims")) { + desc.SetAttr("in_num_col_dims", fc_op->GetAttr("in_num_col_dims")); + } + desc.SetAttr("activation_type", fc_op->GetAttr("activation_type")); + if (fc_op->HasAttr("enable_int8")) { + desc.SetAttr("enable_int8", fc_op->GetAttr("enable_int8")); + } + if 
(fc_op->HasAttr("Input_scale")) { + desc.SetAttr("Input_scale", fc_op->GetAttr("Input_scale")); + } + if (fc_op->HasAttr("support_int8")) { + desc.SetAttr("support_int8", fc_op->GetAttr("support_int8")); + } + if (fc_op->HasAttr("out_threshold")) { + desc.SetAttr("out_threshold", fc_op->GetAttr("out_threshold")); + } + desc.Flush(); + GraphSafeRemoveNodes(g, {fc}); + auto sparse_fc_node = g->CreateOpNode(&desc); + + IR_NODE_LINK_TO(fc_input, sparse_fc_node); + IR_NODE_LINK_TO(fc_weights, sparse_fc_node); + IR_NODE_LINK_TO(fc_bias, sparse_fc_node); + IR_NODE_LINK_TO(sparse_fc_node, fc_out); + found_dense_fc_count++; + } + }; + + gpd(graph, handler); + AddStatis(found_dense_fc_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(dense_fc_to_sparse_pass, + paddle::framework::ir::DenseFCToSparsePass); diff --git a/paddle/fluid/framework/ir/dense_fc_to_sparse_pass.h b/paddle/fluid/framework/ir/dense_fc_to_sparse_pass.h new file mode 100644 index 0000000000000..18c91bf49c732 --- /dev/null +++ b/paddle/fluid/framework/ir/dense_fc_to_sparse_pass.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +struct DenseFC : public PatternBase { + DenseFC(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "dense_fc") {} + + PDNode* operator()(); + + // declare operator node's name + PATTERN_DECL_NODE(fc); + PATTERN_DECL_NODE(fc_out); + PATTERN_DECL_NODE(fc_input); + PATTERN_DECL_NODE(fc_weights); + PATTERN_DECL_NODE(fc_bias); +}; +} // namespace patterns + +/** + * Replace dense op with sparse op + */ +class Graph; + +class DenseFCToSparsePass : public FusePassBase { + public: + DenseFCToSparsePass(); + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + const std::string name_scope_{"dense_fc_to_sparse_pass"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/dense_fc_to_sparse_pass_tester.cc b/paddle/fluid/framework/ir/dense_fc_to_sparse_pass_tester.cc new file mode 100644 index 0000000000000..cb10c84b1d770 --- /dev/null +++ b/paddle/fluid/framework/ir/dense_fc_to_sparse_pass_tester.cc @@ -0,0 +1,107 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/framework/ir/dense_fc_to_sparse_pass.h" +#include "paddle/fluid/framework/ir/fc_fuse_pass.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void AddVarToScope(Scope* param_scope, const std::string& name, + const DDim& dims) { + auto* tensor = param_scope->Var(name)->GetMutable(); + tensor->Resize(dims); + tensor->mutable_data(platform::CPUPlace()); +} + +Scope* CreateParamScope() { + auto param_scope = new Scope(); + AddVarToScope(param_scope, "conv2d_filters_0", {}); + AddVarToScope(param_scope, "conv2d_bias_0", {}); + AddVarToScope(param_scope, "weights_0_sparse_2_4", {}); + AddVarToScope(param_scope, "weights_1", {}); + AddVarToScope(param_scope, "bias_1", {}); + AddVarToScope(param_scope, "bias_2", {}); + return param_scope; +} + +TEST(FCFusePass, basic) { + // inputs operator output + // -------------------------------------------------------- + // (a, filters_0 bias_0) conv2d -> conv2d_out + // conv2d_out relu -> relu_out_0 + // (relu_out_0, weights_0_sparse_2_4) mul -> mul_out_0 + // (mul_out_0, bias_1) elementwise_add -> add_out_0 + // add_out_0 relu -> relu_out_1 + // (relu_out_1, weights_1) mul -> mul_out_1 + // (mul_out_1, bias_2) elementwise_add -> add_out_1 + Layers layers; + auto* a = layers.data("a"); + auto* filters_0 = layers.data("conv2d_filters_0", {}, true); + auto* bias_0 = layers.data("conv2d_bias_0", {}, true); + auto* conv2d_out = layers.conv2d(a, filters_0, bias_0, false); + auto* relu_out_0 = layers.relu(conv2d_out); + auto* weights_0 = layers.data("weights_0_sparse_2_4", {5, 4}, true); + auto* mul_out_0 = layers.mul(relu_out_0, weights_0); + auto* bias_1 = layers.data("bias_1", {4}, true); + auto* add_out_0 = layers.elementwise_add(mul_out_0, bias_1, nullptr, 1); + auto* relu_out_1 = layers.relu(add_out_0); + auto* weights_1 = layers.data("weights_1", {8, 9}, true); + auto* mul_out_1 = layers.mul(relu_out_1, weights_1); + auto* bias_2 = layers.data("bias_2", {1, 9}, true); + auto* add_out_1 = layers.elementwise_add(mul_out_1, bias_2, nullptr, 1); + VLOG(4) << add_out_1; + + std::unique_ptr graph(new ir::Graph(layers.main_program())); + auto fuse_pass = PassRegistry::Instance().Get("fc_fuse_pass"); + auto sparse_pass = PassRegistry::Instance().Get("dense_fc_to_sparse_pass"); + fuse_pass->Set("use_gpu", new bool(true)); + sparse_pass->Set("use_gpu", new bool(true)); + graph->Set("__param_scope__", CreateParamScope()); + int num_nodes_before = graph->Nodes().size(); + int num_mul_nodes_before = GetNumOpNodes(graph, "mul"); + VLOG(3) << DebugString(graph); + + graph.reset(fuse_pass->Apply(graph.release())); + graph.reset(sparse_pass->Apply(graph.release())); + int num_nodes_after = graph->Nodes().size(); + int num_fc_nodes_after = GetNumOpNodes(graph, "fc"); + int num_sparse_fc_nodes_after = GetNumOpNodes(graph, "sparse_fc"); + VLOG(3) << DebugString(graph); + + PADDLE_ENFORCE_EQ(num_nodes_before, num_nodes_after + 6, + platform::errors::InvalidArgument( + "num_nodes_before=%d, num_nodes_after=%d.", + num_nodes_before, num_nodes_after)); 
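The pass exercised by this test recognizes sparse candidates purely by name: a fused fc whose weight variable name carries the "sparse_2_4" tag (presumably marking a 2:4 structured-sparse weight) is rewritten to sparse_fc, while every other fc stays dense. A sketch of that convention, with the helper name being illustrative rather than Paddle API:

#include <string>

// A weight is routed to the sparse kernel only when its variable name
// contains the "sparse_2_4" marker, matching w_name.find("sparse_2_4")
// in the pass handler above.
bool IsTaggedSparse24(const std::string& w_name) {
  return w_name.find("sparse_2_4") != std::string::npos;
}

In the graph built above, this is why only the mul fed by "weights_0_sparse_2_4" ends up as a sparse_fc node while the one fed by "weights_1" remains a plain fc, leaving one node of each kind after both passes run.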
+ PADDLE_ENFORCE_EQ(num_fc_nodes_after, 1, + platform::errors::InvalidArgument("num_fc_nodes_after=%d.", + num_fc_nodes_after)); + PADDLE_ENFORCE_EQ( + num_mul_nodes_before, num_fc_nodes_after + num_sparse_fc_nodes_after, + platform::errors::InvalidArgument( + "num_mul_nodes_before=%d, num_fc_nodes_after=%d + " + "num_sparse_fc_nodes_after=%d.", + num_mul_nodes_before, num_fc_nodes_after, num_sparse_fc_nodes_after)); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(fc_fuse_pass); +USE_PASS(dense_fc_to_sparse_pass); diff --git a/paddle/fluid/framework/ir/dense_multihead_matmul_to_sparse_pass.cc b/paddle/fluid/framework/ir/dense_multihead_matmul_to_sparse_pass.cc new file mode 100644 index 0000000000000..2aae5030b5d63 --- /dev/null +++ b/paddle/fluid/framework/ir/dense_multihead_matmul_to_sparse_pass.cc @@ -0,0 +1,174 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/dense_multihead_matmul_to_sparse_pass.h" + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { +PDNode *patterns::DenseMultiheadMatmul::operator()() { + auto *multihead_matmul = pattern->NewNode(multihead_matmul_repr()) + ->assert_is_op("multihead_matmul"); + // Input + auto *multihead_matmul_input = + pattern->NewNode(multihead_matmul_input_repr()) + ->AsInput() + ->assert_is_op_input("multihead_matmul", "Input"); + // Filter + auto *multihead_matmul_weights = + pattern->NewNode(multihead_matmul_weights_repr()) + ->AsInput() + ->assert_is_op_input("multihead_matmul", "W"); + // Bias + auto *multihead_matmul_bias = + pattern->NewNode(multihead_matmul_bias_repr()) + ->AsInput() + ->assert_is_op_input("multihead_matmul", "Bias"); + // BiasQK + auto *multihead_matmul_biasqk = + pattern->NewNode(multihead_matmul_biasqk_repr()) + ->AsInput() + ->assert_is_op_input("multihead_matmul", "BiasQK"); + // Output + auto *multihead_matmul_out = + pattern->NewNode(multihead_matmul_out_repr()) + ->AsOutput() + ->assert_is_op_output("multihead_matmul", "Out") + ->assert_is_only_output_of_op("multihead_matmul"); + + multihead_matmul + ->LinksFrom({multihead_matmul_input, multihead_matmul_weights, + multihead_matmul_bias, multihead_matmul_biasqk}) + .LinksTo({multihead_matmul_out}); + + return multihead_matmul_out; +} +} // namespace patterns +DenseMultiheadMatmulToSparsePass::DenseMultiheadMatmulToSparsePass() { + AddOpCompat(OpCompat("multihead_matmul")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("W") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddInput("BiasQK") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); +} + +void DenseMultiheadMatmulToSparsePass::ApplyImpl(Graph *graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, 
platform::errors::InvalidArgument("Graph cannot be nullptr.")); + + std::string name_scope = "dense_multihead_matmul_to_sparse_pass"; + FusePassBase::Init(name_scope, graph); + GraphPatternDetector gpd; + + patterns::DenseMultiheadMatmul multihead_matmul_pattern( + gpd.mutable_pattern(), "dense_multihead_matmul_replace_pass"); + multihead_matmul_pattern(); + int found_multihead_matmul_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "Replace dense multihead matmul with sparse multihead matmul."; + + /* if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + }*/ + + GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_out, multihead_matmul_out, + multihead_matmul_pattern); + GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul, multihead_matmul, + multihead_matmul_pattern); + GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_input, multihead_matmul_input, + multihead_matmul_pattern); + GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_weights, + multihead_matmul_weights, + multihead_matmul_pattern); + GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_bias, multihead_matmul_bias, + multihead_matmul_pattern); + GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_biasqk, multihead_matmul_biasqk, + multihead_matmul_pattern); + + auto *multihead_matmul_op = multihead_matmul->Op(); + auto w_name = multihead_matmul_op->Input("W")[0]; + // recognize sparse op by name + if (w_name.find("sparse_2_4") != w_name.npos) { + // fake op + OpDesc desc(multihead_matmul_op->Block()); + desc.SetType("sparse_multihead_matmul"); + desc.SetInput("Input", {multihead_matmul_input->Name()}); + desc.SetInput("W", {multihead_matmul_weights->Name()}); + desc.SetInput("Bias", {multihead_matmul_bias->Name()}); + desc.SetInput("BiasQK", {multihead_matmul_biasqk->Name()}); + desc.SetOutput("Out", {multihead_matmul_out->Name()}); + + // copy all attr + desc.SetAttr("alpha", multihead_matmul_op->GetAttr("alpha")); + desc.SetAttr("head_number", multihead_matmul_op->GetAttr("head_number")); + if (multihead_matmul_op->HasAttr("Input_scale")) { + desc.SetAttr("Input_scale", + multihead_matmul_op->GetAttr("Input_scale")); + } + if (multihead_matmul_op->HasAttr("fc_out_threshold")) { + desc.SetAttr("fc_out_threshold", + multihead_matmul_op->GetAttr("fc_out_threshold")); + } + if (multihead_matmul_op->HasAttr("qkv2context_plugin_int8")) { + desc.SetAttr("qkv2context_plugin_int8", + multihead_matmul_op->GetAttr("qkv2context_plugin_int8")); + } + if (multihead_matmul_op->HasAttr("dp_probs")) { + desc.SetAttr("dp_probs", multihead_matmul_op->GetAttr("dp_probs")); + } + if (multihead_matmul_op->HasAttr("out_threshold")) { + desc.SetAttr("out_threshold", + multihead_matmul_op->GetAttr("out_threshold")); + } + desc.Flush(); + GraphSafeRemoveNodes(g, {multihead_matmul}); + auto sparse_multihead_matmul_node = g->CreateOpNode(&desc); + + IR_NODE_LINK_TO(multihead_matmul_input, sparse_multihead_matmul_node); + IR_NODE_LINK_TO(multihead_matmul_weights, sparse_multihead_matmul_node); + IR_NODE_LINK_TO(multihead_matmul_bias, sparse_multihead_matmul_node); + IR_NODE_LINK_TO(multihead_matmul_biasqk, sparse_multihead_matmul_node); + IR_NODE_LINK_TO(sparse_multihead_matmul_node, multihead_matmul_out); + found_multihead_matmul_count++; + } + }; + + gpd(graph, handler); + AddStatis(found_multihead_matmul_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(dense_multihead_matmul_to_sparse_pass, + paddle::framework::ir::DenseMultiheadMatmulToSparsePass); diff --git 
a/paddle/fluid/framework/ir/dense_multihead_matmul_to_sparse_pass.h b/paddle/fluid/framework/ir/dense_multihead_matmul_to_sparse_pass.h new file mode 100644 index 0000000000000..fa0716255b59e --- /dev/null +++ b/paddle/fluid/framework/ir/dense_multihead_matmul_to_sparse_pass.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +struct DenseMultiheadMatmul : public PatternBase { + DenseMultiheadMatmul(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "dense_multihead_matmul") {} + + PDNode* operator()(); + + // declare operator node's name + PATTERN_DECL_NODE(multihead_matmul); + PATTERN_DECL_NODE(multihead_matmul_out); + PATTERN_DECL_NODE(multihead_matmul_input); + PATTERN_DECL_NODE(multihead_matmul_weights); + PATTERN_DECL_NODE(multihead_matmul_bias); + PATTERN_DECL_NODE(multihead_matmul_biasqk); +}; +} // namespace patterns +/** + * Replace dense multihead_matmul op with sparse multihead_matmul op + */ +class Graph; + +class DenseMultiheadMatmulToSparsePass : public FusePassBase { + public: + DenseMultiheadMatmulToSparsePass(); + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + const std::string name_scope_{"dense_multihead_matmul_to_sparse_pass"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/dense_multihead_matmul_to_sparse_pass_tester.cc b/paddle/fluid/framework/ir/dense_multihead_matmul_to_sparse_pass_tester.cc new file mode 100644 index 0000000000000..3989d3d11db3f --- /dev/null +++ b/paddle/fluid/framework/ir/dense_multihead_matmul_to_sparse_pass_tester.cc @@ -0,0 +1,152 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/fluid/framework/ir/dense_multihead_matmul_to_sparse_pass.h" // NOLINT +#include "paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h" // NOLINT +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { + +void AddVarToScope(Scope* param_scope, const std::string& name, + const DDim& dims) { + auto* tensor = param_scope->Var(name)->GetMutable(); + tensor->Resize(dims); + tensor->mutable_data(platform::CPUPlace()); +} + +Scope* CreateParamScope() { + auto param_scope = new Scope(); + AddVarToScope(param_scope, "weights0_sparse_2_4", {768, 768}); + AddVarToScope(param_scope, "weights1_sparse_2_4", {768, 768}); + AddVarToScope(param_scope, "weights2_sparse_2_4", {768, 768}); + + AddVarToScope(param_scope, "bias_0", {768}); + AddVarToScope(param_scope, "bias_1", {768}); + AddVarToScope(param_scope, "bias_2", {768}); + AddVarToScope(param_scope, "biasqk", {768}); + AddVarToScope(param_scope, "weightsl", {768, 768}); + return param_scope; +} + +TEST(DenseMultiHeadMatmulToSparsePass, basic) { + // inputs operator output + // -------------------------------------------------------------------- + // (x) layer_norm -> layer_norm_out + // (layer_norm_out, weights_0_sparse_2_4) mul -> mul_out0 + // (layer_norm_out, weights_1_sparse_2_4) mul -> mul_out1 + // (layer_norm_out, weights_2_sparse_2_4) mul -> mul_out2 + // (mul_out0, bias_0) elementweise_add -> eltadd_0 + // (mul_out1, bias_1) elementweise_add -> eltadd_1 + // (mul_out2, bias_2) elementweise_add -> eltadd_2 + // (eltadd_0) reshape2 -> reshape_0 + // (eltadd_1) reshape2 -> reshape_1 + // (eltadd_2) reshape2 -> reshape_2 + // (reshape_0) transpose2 -> transpose_0 + // (reshape_1) transpose2 -> transpose_1 + // (reshape_2) transpose2 -> transpose_2 + // (transpose_0) scale -> scale_0 + // (scale_0, transpose_1) matmul -> matmul_qk + // (matmul_qk, bias_qk) elementwise_add -> eltadd_qk + // (eltadd_qk) softmax -> softmax_qk + // (softmax_qk, transpose_2) matmul -> matmul_qkv + // (matmul_qkv) transpose -> transpose_qkv + // (transpose_qkv) reshape -> reshape_qkv + // (reshape_qkv) mul -> mul_qkv + Layers layers; + auto* x = layers.data("x", {1, 128, 768}); + auto out = layers.layer_norm(x); + auto* layer_out = out[0]; + + auto* weights_0 = layers.data("weights0_sparse_2_4", {768, 768}, true); + auto* weights_1 = layers.data("weights1_sparse_2_4", {768, 768}, true); + auto* weights_2 = layers.data("weights2_sparse_2_4", {768, 768}, true); + + auto* mul_out_0 = layers.mul(layer_out, weights_0, nullptr, 2); + auto* mul_out_1 = layers.mul(layer_out, weights_1, nullptr, 2); + auto* mul_out_2 = layers.mul(layer_out, weights_2, nullptr, 2); + + auto* b0 = layers.data("bias_0", {768}, true); + auto* b1 = layers.data("bias_1", {768}, true); + auto* b2 = layers.data("bias_2", {768}, true); + + auto* elementwise_out_0 = layers.elementwise_add(mul_out_0, b0, nullptr, 2); + auto* elementwise_out_1 = layers.elementwise_add(mul_out_1, b1, nullptr, 2); + auto* elementwise_out_2 = layers.elementwise_add(mul_out_2, b2, nullptr, 2); + + std::vector shape = {1, 128, 12, 64}; + auto* reshape_0 = layers.reshape2(elementwise_out_0, shape, true); + auto* reshape_1 = layers.reshape2(elementwise_out_1, shape, true); + auto* reshape_2 = layers.reshape2(elementwise_out_2, shape, true); + + std::vector axis = {0, 2, 1, 3}; + auto* transpose_0 = layers.transpose2(reshape_0, axis, true); + auto* transpose_1 = 
layers.transpose2(reshape_1, axis, true); + auto* transpose_2 = layers.transpose2(reshape_2, axis, true); + + auto* scale_0 = layers.scale(transpose_0, 0.125, 0, false); + auto* matmul_qk = layers.matmul(scale_0, transpose_1, nullptr, false, true); + + auto* bqk = layers.data("biasqk", {1, 12, 128, 128}, true); + auto* elementwise_qk = layers.elementwise_add(matmul_qk, bqk); + auto* softmax_qk = layers.softmax(elementwise_qk, -1); + + auto* matmul_qkv = layers.matmul(softmax_qk, transpose_2); + + auto* transpose_qkv = layers.transpose2(matmul_qkv, {0, 2, 1, 3}, true); + auto* reshape_qkv_out = layers.reshape2(transpose_qkv, {1, 128, 768}, true); + auto* weights_l = layers.data("weightsl", {768, 768}, true); + layers.mul(reshape_qkv_out, weights_l, nullptr, 2); + + std::unique_ptr graph(new ir::Graph(layers.main_program())); + graph->Set("__param_scope__", CreateParamScope()); + + auto fuse_pass = + PassRegistry::Instance().Get("multihead_matmul_fuse_pass_v2"); + auto sparse_pass = + PassRegistry::Instance().Get("dense_multihead_matmul_to_sparse_pass"); + + if (fuse_pass.get() == nullptr || sparse_pass.get() == nullptr) + LOG(INFO) << "asdfasdf"; + int num_nodes_before = graph->Nodes().size(); + VLOG(3) << DebugString(graph); + + graph.reset(fuse_pass->Apply(graph.release())); + graph.reset(sparse_pass->Apply(graph.release())); + int num_nodes_after = graph->Nodes().size(); + int num_fused_nodes_after = GetNumOpNodes(graph, "sparse_multihead_matmul"); + VLOG(3) << DebugString(graph); + + PADDLE_ENFORCE_EQ(num_nodes_before, num_nodes_after + 39, + platform::errors::InvalidArgument( + "After the multihead_matmul pass and sparse pass, The " + "node num in graph " + "should be %d, but the result is %d", + num_nodes_before - 39, num_nodes_after)); + PADDLE_ENFORCE_EQ(num_fused_nodes_after, 1, + platform::errors::InvalidArgument( + "After the multihead_matmul pass and sparse pass, " + "there should be one " + "sparse_multihead_matmul op, but the result is %d", + num_fused_nodes_after)); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(multihead_matmul_fuse_pass); +USE_PASS(multihead_matmul_fuse_pass_v2); +USE_PASS(dense_multihead_matmul_to_sparse_pass); diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc index 3d60148c170f9..96f115b282250 100644 --- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc @@ -34,7 +34,16 @@ void IdentityScaleOpCleanPass::ApplyImpl(ir::Graph* graph) const { detector.mutable_pattern() ->NewNode("scale_in") ->assert_is_op_input("scale") - ->assert_more([](Node* x) { return x->outputs.size() == 1UL; }); + ->assert_has_n_outputs(1) + ->assert_more([](Node* x) { + for (auto* op : x->inputs) { + auto op_type = op->Op()->Type(); + if (op_type == "conditional_block" || op_type == "while") { + return false; + } + } + return true; + }); auto scale_op = detector.mutable_pattern() ->NewNode("scale_fuse") ->assert_is_op("scale") diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc index 5f92a4bb7f15b..eb7ec82e242ea 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -353,10 +353,9 @@ bool QuantDequantMkldnnPass::IsInt8Weight( auto* op_desc = op_node->Op(); auto var_name = op_desc->Input(weight_name)[0]; auto* var = 
scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL( - var, platform::errors::NotFound( - "The input persistable [%s] var of [%s] op is not found.", - var_name, op_desc->Type())); + if (var == nullptr) { + return false; + } auto* weight_tensor = var->GetMutable(); auto* weight_data = weight_tensor->data(); bool is_int8 = true; diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 0b75964b94e91..72b7477f2b870 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -629,46 +629,75 @@ void update_var_min_rw_op(const std::map>& op2dependences, var2min_rw_op->at(rw_var).push_back(cur_op); } -std::map> get_downstream_map( - const std::map>& op2dependences, - std::vector>* op_happens_before) { - // step1: convert op2dependences to downstream_map directly - // op2dependences is op -> it's dependences. - // we want to get op -> [next ops] map, - // where ops is the next instruction of op. - std::map> downstream; +void AddDownstreamOp(int prior_op_idx, int posterior_op_idx, + std::map>* op_downstream_map) { + if (op_downstream_map->find(prior_op_idx) == op_downstream_map->end()) { + op_downstream_map->emplace(std::make_pair(prior_op_idx, std::list())); + } + op_downstream_map->at(prior_op_idx).push_back(posterior_op_idx); +} + +void AddDownstreamOp(int prior_op_idx, int posterior_op_idx, + std::map>* op_downstream_map, + const std::vector>& op_happens_before) { + if (op_downstream_map->find(prior_op_idx) != op_downstream_map->end()) { + for (int op_idx : op_downstream_map->at(prior_op_idx)) { + if (op_happens_before[op_idx][posterior_op_idx]) { + VLOG(7) << "Find dependencies " << prior_op_idx << "->" << op_idx + << "->" << posterior_op_idx << ", skip adding " << prior_op_idx + << "->" << posterior_op_idx; + return; + } + } + } + + AddDownstreamOp(prior_op_idx, posterior_op_idx, op_downstream_map); +} + +size_t CountDownstreamMap(const std::map>& downstream_map) { + size_t count = 0; + for (auto pair : downstream_map) { + count += pair.second.size(); + } + return count; +} + +const std::string StringizeDownstreamMap( + const std::map>& downstream_map) { + std::ostringstream oss; + for (auto pair : downstream_map) { + oss << pair.first << " -> "; + std::copy(pair.second.begin(), pair.second.end(), + std::ostream_iterator(oss, " ")); + oss << std::endl; + } + return oss.str(); +} + +// convert op2dependences to downstream_map directly. op2dependences is op -> +// it's dependences, we want to get op -> [next ops] map, where ops is the next +// instruction of op. 
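GetDownstreamMap below is a plain inversion of the dependence relation: op2dependences maps an op to the ops it must wait for, and the downstream map answers the opposite question, which ops have to run after a given op. A standalone sketch of the inversion with a worked example (types chosen to match the surrounding code, values illustrative):

#include <list>
#include <map>
#include <set>

// Invert "op -> ops it depends on" into "op -> ops that run after it".
std::map<int, std::list<int>> InvertToDownstream(
    const std::map<int, std::set<int>>& op2dependences) {
  std::map<int, std::list<int>> downstream;
  for (const auto& item : op2dependences) {
    for (int dep : item.second) {
      downstream[dep].push_back(item.first);  // dep finishes before item.first
    }
  }
  return downstream;
}

// Example: {1: {0}, 2: {0, 1}} (op1 waits on op0, op2 waits on op0 and op1)
// inverts to {0: [1, 2], 1: [2]}.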
+std::map> GetDownstreamMap( + const std::map>& op2dependences) { + std::map> downstream_map; for (auto& item : op2dependences) { int op = item.first; for (auto dep_op : item.second) { - if (downstream.find(dep_op) == downstream.end()) - downstream[dep_op] = std::list(); - downstream[dep_op].push_back(op); + AddDownstreamOp(dep_op, op, &downstream_map); } } - auto downstream_map_to_str = [&]() -> std::string { - std::ostringstream oss; - for (auto pair : downstream) { - oss << pair.first << " -> "; - std::copy(pair.second.begin(), pair.second.end(), - std::ostream_iterator(oss, " ")); - oss << std::endl; - } - return oss.str(); - }; - - auto downstream_map_count = [&]() -> size_t { - size_t count = 0; - for (auto pair : downstream) { - count += pair.second.size(); - } - return count; - }; + VLOG(6) << "downstream count: " << CountDownstreamMap(downstream_map); + VLOG(6) << "downstream_map: " << std::endl + << StringizeDownstreamMap(downstream_map); - VLOG(6) << "downstream count: " << downstream_map_count(); - VLOG(6) << "downstream_map: " << std::endl << downstream_map_to_str(); + return downstream_map; +} - // step2: remove unnecessary downstream ops +void ShrinkDownstreamMap(std::map>* downstream_map, + std::vector>* op_happens_before, + size_t op_num) { + // remove unnecessary downstream ops // for example, a->b->c // a: b, c // b: c @@ -676,9 +705,6 @@ std::map> get_downstream_map( // a: b // b: c - // NOTE(zhiqiu): the size of downstream != size of op2dependences - // since there are some ops that have no downstream-op. - auto op_num = op2dependences.size(); // happens_before[i][j] means i should be executed before j op_happens_before->resize(op_num); for (size_t i = 0; i < op_num; ++i) { @@ -696,10 +722,10 @@ std::map> get_downstream_map( size_t op = q.front(); q.pop(); visited[op] = true; - if (!downstream.count(op)) { + if (!downstream_map->count(op)) { continue; } - for (auto next : downstream[op]) { + for (auto next : downstream_map->at(op)) { if (!visited[next]) { PADDLE_ENFORCE_EQ((*op_happens_before)[next][op_idx], false, paddle::platform::errors::AlreadyExists( @@ -721,11 +747,15 @@ std::map> get_downstream_map( // shrink, find the downstream op that has no other op in the // downstream list happens before it for (size_t i = 0; i < op_num; ++i) { + if (downstream_map->find(i) == downstream_map->end()) { + continue; + } + std::list minumum_nexts; - for (size_t item : downstream[i]) { + for (size_t item : downstream_map->at(i)) { bool not_after_any = true; // find the op that is not executed after any - for (size_t other_item : downstream[i]) { + for (size_t other_item : downstream_map->at(i)) { if ((*op_happens_before)[other_item][item]) { VLOG(8) << "happens_before: " << other_item << "->" << item << ", so skip " << item; @@ -738,12 +768,11 @@ std::map> get_downstream_map( minumum_nexts.push_back(item); } } - downstream[i] = minumum_nexts; + downstream_map->at(i) = minumum_nexts; } - VLOG(6) << "downstream count: " << downstream_map_count(); - VLOG(6) << "downstream_map: " << std::endl << downstream_map_to_str(); - - return downstream; + VLOG(6) << "downstream count: " << CountDownstreamMap(*downstream_map); + VLOG(6) << "downstream_map: " << std::endl + << StringizeDownstreamMap(*downstream_map); } std::map> build_op_downstream_map( @@ -825,6 +854,14 @@ std::map> build_op_downstream_map( } } + // NOTE(zhiqiu): the size of downstream != size of op2dependences since there + // are some ops that have no downstream-op. 
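// (Editorial sketch, not part of the patch.) ShrinkDownstreamMap above keeps,
// for every op, only the successors that no other successor of the same op is
// already known to happen before; edges implied transitively are dropped
// (a->b->c makes a->c redundant). A minimal stand-alone version of that
// pruning rule, with hypothetical names and the same happens-before
// convention (happens_before[i][j] == true means op i runs before op j):
#include <list>
#include <map>
#include <vector>

void PruneTransitiveEdges(
    std::map<int, std::list<int>>* downstream,
    const std::vector<std::vector<bool>>& happens_before) {
  for (auto& item : *downstream) {
    std::list<int> minimum_nexts;
    for (int next : item.second) {
      bool dominated = false;  // some other successor already runs before it?
      for (int other : item.second) {
        if (happens_before[other][next]) {
          dominated = true;
          break;
        }
      }
      if (!dominated) minimum_nexts.push_back(next);
    }
    item.second = minimum_nexts;
  }
}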
+ std::map> op_downstream_map = + GetDownstreamMap(op2dependences); + + ShrinkDownstreamMap(&op_downstream_map, op_happens_before, + vec_instruction.size()); + // add dependences for random op, make sure that the random op is scheduled // sequentially const std::set random_op_set = { @@ -846,7 +883,8 @@ std::map> build_op_downstream_map( for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) { if (random_op_set.count(vec_instruction[op_idx].OpBase()->Type())) { if (dependence_op_idx != -1) { - op2dependences[op_idx].insert(dependence_op_idx); + AddDownstreamOp(dependence_op_idx, op_idx, &op_downstream_map, + *op_happens_before); } dependence_op_idx = op_idx; } @@ -872,7 +910,8 @@ std::map> build_op_downstream_map( for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) { if (is_comm_op(vec_instruction[op_idx].OpBase()->Type())) { if (dependence_op_idx != -1) { - op2dependences[op_idx].insert(dependence_op_idx); + AddDownstreamOp(dependence_op_idx, op_idx, &op_downstream_map, + *op_happens_before); VLOG(4) << "Add depend from " << vec_instruction[dependence_op_idx].OpBase()->Type() << " to " << vec_instruction[op_idx].OpBase()->Type(); @@ -900,7 +939,8 @@ std::map> build_op_downstream_map( VLOG(4) << "Add depend from " << vec_instruction[dependence_op_idx].OpBase()->Type() << " to " << vec_instruction[op_idx].OpBase()->Type(); - op2dependences[op_idx].insert(dependence_op_idx); + AddDownstreamOp(dependence_op_idx, op_idx, &op_downstream_map, + *op_happens_before); } } } @@ -956,7 +996,8 @@ std::map> build_op_downstream_map( j < static_cast(first_read_fused_out_op); ++j) { for (auto var_id : outputs) { if (is_write(vec_instruction[j], var_id)) { - op2dependences[first_read_fused_out_op].insert(j); + AddDownstreamOp(j, first_read_fused_out_op, &op_downstream_map, + *op_happens_before); VLOG(4) << j << " -> " << first_read_fused_out_op; VLOG(4) << "Add depend from " << vec_instruction[j].OpBase()->Type() @@ -990,6 +1031,7 @@ std::map> build_op_downstream_map( for (auto var_id : outputs) { if (is_read(vec_instruction[j], var_id)) { + AddDownstreamOp(target, j, &op_downstream_map, *op_happens_before); op2dependences[j].insert(target); VLOG(4) << target << " -> " << j; VLOG(4) << "Add depend from " @@ -1000,14 +1042,12 @@ std::map> build_op_downstream_map( } } } - for (auto pair : op2dependences) { - std::ostringstream oss; - oss << pair.first << " Depends on " << pair.second.size() << " ops: "; - std::copy(pair.second.begin(), pair.second.end(), - std::ostream_iterator(oss, " ")); - VLOG(10) << oss.str(); - } - return get_downstream_map(op2dependences, op_happens_before); + + VLOG(8) << "downstream count: " << CountDownstreamMap(op_downstream_map); + VLOG(8) << "downstream_map: " << std::endl + << StringizeDownstreamMap(op_downstream_map); + + return op_downstream_map; } } // namespace interpreter diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.cc b/paddle/fluid/framework/new_executor/stream_analyzer.cc index 6c689c8548b90..469876b01f654 100644 --- a/paddle/fluid/framework/new_executor/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/stream_analyzer.cc @@ -14,11 +14,31 @@ #include "paddle/fluid/framework/new_executor/stream_analyzer.h" +#include #include +#include "paddle/fluid/platform/device_context.h" + namespace paddle { namespace framework { +StreamAnalyzer::StreamAnalyzer(const platform::Place& place) : place_(place) { + if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + 
platform::EmplaceDeviceContexts( + &d2h_ctxs_, {place}, + /*disable_setting_default_stream_for_allocator=*/true); + platform::EmplaceDeviceContexts( + &h2d_ctxs_, {place}, + /*disable_setting_default_stream_for_allocator=*/true); +#else + PADDLE_THROW( + platform::errors::Unimplemented("CUDAPlace is not supported. Please " + "re-compile with WITH_GPU option.")); +#endif + } +} + /* * Parse the var_ids that need to be associated with an event. * The caller should guarantee front_op and back_op satisfy the @@ -137,10 +157,10 @@ platform::DeviceContext* StreamAnalyzer::ParseDeviceContext( auto* dev_ctx = op_func_node.dev_ctx_; if (op_type == interpreter::kMemcpyD2H) { VLOG(3) << "Get dev_ctx from d2h_context_pool_"; - dev_ctx = d2h_ctx_pool_.Get(place_); + dev_ctx = d2h_ctxs_[place_].get().get(); } else if (op_type == interpreter::kMemcpyH2D) { VLOG(3) << "Get dev_ctx from h2d_context_pool_"; - dev_ctx = h2d_ctx_pool_.Get(place_); + dev_ctx = h2d_ctxs_[place_].get().get(); } return dev_ctx; diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.h b/paddle/fluid/framework/new_executor/stream_analyzer.h index 8a6552c6883c5..c57bab9c9c2d0 100644 --- a/paddle/fluid/framework/new_executor/stream_analyzer.h +++ b/paddle/fluid/framework/new_executor/stream_analyzer.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include #include #include @@ -25,15 +26,17 @@ namespace framework { class StreamAnalyzer { public: - explicit StreamAnalyzer(const platform::Place& place) - : place_(place), d2h_ctx_pool_({place}), h2d_ctx_pool_({place}) {} + using Place = platform::Place; + using DeviceContext = platform::DeviceContext; + + explicit StreamAnalyzer(const Place& place); ~StreamAnalyzer() {} void Schedule(const std::vector& downstream_ops, std::vector* instructions, size_t op_index); - platform::DeviceContext* ParseDeviceContext(const OpFuncNode& op_func_node); + DeviceContext* ParseDeviceContext(const OpFuncNode& op_func_node); private: std::vector GetNeedEventVarIds(const Instruction& cur_instr, @@ -42,16 +45,16 @@ class StreamAnalyzer { void ConstructEventForVar(const std::vector& new_event_var_id, Instruction* next_instr, platform::DeviceType waiter_type, - const platform::Place& place); + const Place& place); bool IsDirectRun(Instruction& cur_instr, // NOLINT const Instruction& next_instr); platform::DeviceType GetWaiterType(const Instruction& instr); - platform::Place place_; - platform::DeviceContextPool d2h_ctx_pool_; - platform::DeviceContextPool h2d_ctx_pool_; + Place place_; + std::map>> d2h_ctxs_; + std::map>> h2d_ctxs_; std::map> var_id2event_; }; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 00d48098a13f6..6f8621d30ec05 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -793,8 +793,8 @@ void ParallelExecutor::BCastParamsToDevices( std::vector buffers; buffers.reserve(member_->places_.size()); size_t numel = main_tensor.numel(); - ncclDataType_t data_type = platform::ToNCCLDataType( - framework::TransToProtoVarType(main_tensor.dtype())); + auto dtype = framework::TransToProtoVarType(main_tensor.dtype()); + ncclDataType_t data_type = platform::ToNCCLDataType(dtype); for (size_t i = 0; i < member_->places_.size(); ++i) { auto place = member_->places_[i]; void *buffer; @@ -815,7 +815,7 @@ void ParallelExecutor::BCastParamsToDevices( "variables' buffer size to bcast is %d, which is " "NOT equal to places size %d", buffers.size(), 
member_->places_.size())); - { + if (member_->nccl_ctxs_ != nullptr) { auto *nccl_ctxs = member_->nccl_ctxs_->DefaultFlatCtx(); platform::NCCLGroupGuard guard; for (size_t i = 0; i < member_->places_.size(); ++i) { @@ -824,6 +824,22 @@ void ParallelExecutor::BCastParamsToDevices( nccl_ctx.comm_, nccl_ctx.stream()); } nccl_ctxs->WaitAll(); + } else { + auto src_place = member_->places_[0]; + auto src_dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(src_place)); + auto sizeof_dtype = framework::SizeOfType(dtype) * numel; + for (size_t i = 1; i < member_->places_.size(); ++i) { + auto dst_place = member_->places_[i]; + auto dst_dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(dst_place)); + src_dev_ctx->Wait(); + dst_dev_ctx->Wait(); + memory::Copy(dst_place, buffers[i], src_place, buffers[0], + sizeof_dtype, src_dev_ctx->stream()); + src_dev_ctx->Wait(); + dst_dev_ctx->Wait(); + } } #endif } else if (paddle::platform::is_xpu_place(main_tensor.place())) { @@ -1348,6 +1364,11 @@ std::vector ParallelExecutor::CloneGraphToMultiDevices( } void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { + if (member_->build_strategy_.reduce_ == + BuildStrategy::ReduceStrategy::kNoReduce) { + return; + } + if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) member_->InitOrGetNCCLCommunicator(global_scope, &member_->build_strategy_); diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index eaf0a09541d77..1c2874494052b 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -156,7 +156,7 @@ if(NOT WIN32) data_loader SRCS data_loader.cc DEPS enforce) -endif(NOT WIN32) +endif() if(WITH_GLOO) cc_library( imperative_gloo_context diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index 5084363b9c135..5bb32674df78b 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -34,7 +34,7 @@ else() SRCS cncl_context_test.cc DEPS cncl_context) endif() -endif(WIN32) +endif() cc_test( test_gradient_accmulator diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 109cb5d8fe07d..a29e530b2b80c 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -37,6 +37,7 @@ endif() # fluid_modules exclude API-interface of inference/api and inference/capi_exp get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(phi_modules GLOBAL PROPERTY PHI_MODULES) +get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) set(utils_modules stringpiece pretty_log string_helper benchmark) add_subdirectory(api) @@ -50,7 +51,7 @@ set(STATIC_INFERENCE_API reset_tensor_array analysis_config paddle_pass_builder - activation_functions + phi ${mkldnn_quantizer_cfg}) #windows GPU static library over the limit, so not create_static_lib, and cc_library is dummy @@ -59,7 +60,7 @@ if(WIN32 AND WITH_GPU) ${utils_modules}) else() create_static_lib(paddle_inference ${fluid_modules} ${phi_modules} - ${STATIC_INFERENCE_API} ${utils_modules}) + ${phi_kernels} ${STATIC_INFERENCE_API} ${utils_modules}) endif() if(NOT APPLE) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 7f30b80224e0d..0645af611b9d2 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ 
b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1960,6 +1960,10 @@ USE_TRT_CONVERTER(strided_slice) USE_TRT_CONVERTER(transformer_input_convert) USE_TRT_CONVERTER(recover_padding) USE_TRT_CONVERTER(remove_padding) +#if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000) +USE_TRT_CONVERTER(sparse_fc) +USE_TRT_CONVERTER(sparse_multihead_matmul) +#endif #endif namespace paddle_infer { diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 9e5b76db4ac16..96129018d0159 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -115,8 +115,10 @@ const std::vector kTRTSubgraphPasses({ "remove_padding_recover_padding_pass", // "delete_remove_padding_recover_padding_pass", // // "yolo_box_fuse_pass", // - "tensorrt_subgraph_pass", // - "conv_bn_fuse_pass", // + "dense_fc_to_sparse_pass", // + "dense_multihead_matmul_to_sparse_pass", // + "tensorrt_subgraph_pass", // + "conv_bn_fuse_pass", // #if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be // guaranteed at least v7 // cudnn8.0 has memory leak problem in conv + eltwise + act, so we diff --git a/paddle/fluid/inference/paddle_inference.map b/paddle/fluid/inference/paddle_inference.map index 05935701635d9..f8d7fb582b826 100644 --- a/paddle/fluid/inference/paddle_inference.map +++ b/paddle/fluid/inference/paddle_inference.map @@ -3,6 +3,7 @@ *paddle*; *Pass*; *profile*; + *phi*; local: *; }; diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index abd00ef9de67e..0f1350459ef22 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,5 @@ -# Compiling with WITH_PYTHON=ON and WITH_TENSORRT=ON failed on windows. Temporarily add paddle_inference_api dependency to solve the problem +# Compiling with WITH_PYTHON=ON and WITH_TENSORRT=ON failed on windows. 
+# Temporarily add paddle_inference_api dependency to solve the problem if(WIN32) nv_library( tensorrt_engine @@ -21,7 +22,7 @@ nv_test( DEPS dynload_cuda device_context dynamic_loader) nv_test( test_tensorrt_engine - SRCS test_engine.cc - DEPS dynload_cuda tensorrt_engine) + SRCS test_engine.cc test_dynamic_engine.cc + DEPS dynload_cuda tensorrt_engine tensorrt_plugin) add_subdirectory(plugin) add_subdirectory(convert) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index b27a584de2bfa..2c9ba42821535 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,65 +1,74 @@ # Add TRT tests +list( + APPEND + CONVERT_FILES + matmul_op.cc + conv2d_op.cc + fc_op.cc + pool2d_op.cc + elementwise_op.cc + batch_norm_op.cc + activation_op.cc + unary_op.cc + softmax_op.cc + concat_op.cc + dropout_op.cc + group_norm_op.cc + pad_op.cc + split_op.cc + prelu_op.cc + leaky_relu_op.cc + gelu_op.cc + layer_norm_op.cc + multihead_matmul_op.cc + shuffle_channel_op.cc + swish_op.cc + instance_norm_op.cc + stack_op.cc + transpose_op.cc + flatten_op.cc + flatten_contiguous_range_op.cc + emb_eltwise_layernorm.cc + skip_layernorm.cc + scale_op.cc + slice_op.cc + hard_sigmoid_op.cc + hard_swish_op.cc + clip_op.cc + gather_op.cc + anchor_generator_op.cc + yolo_box_op.cc + yolo_box_head_op.cc + arg_max_op.cc + roi_align_op.cc + affine_channel_op.cc + multiclass_nms_op.cc + multiclass_nms3_op.cc + nearest_interp_op.cc + reshape_op.cc + reduce_op.cc + gather_nd_op.cc + tile_op.cc + conv3d_op.cc + mish_op.cc + nearest_interp_v2_op.cc + pool3d_op.cc + deformable_conv_op.cc + preln_emb_eltwise_layernorm.cc + strided_slice_op.cc + preln_skip_layernorm.cc + roll_op.cc + transformer_input_convert_op.cc + remove_padding_op.cc + recover_padding_op.cc) + +if(CUSPARSELT_FOUND AND ${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 8) + list(APPEND CONVERT_FILES sparse_fc_op.cc sparse_multihead_matmul_op.cc) +endif() + nv_library( tensorrt_converter - SRCS matmul_op.cc - conv2d_op.cc - fc_op.cc - pool2d_op.cc - elementwise_op.cc - batch_norm_op.cc - activation_op.cc - unary_op.cc - softmax_op.cc - concat_op.cc - dropout_op.cc - group_norm_op.cc - pad_op.cc - split_op.cc - prelu_op.cc - leaky_relu_op.cc - gelu_op.cc - layer_norm_op.cc - multihead_matmul_op.cc - shuffle_channel_op.cc - swish_op.cc - instance_norm_op.cc - stack_op.cc - transpose_op.cc - flatten_op.cc - flatten_contiguous_range_op.cc - emb_eltwise_layernorm.cc - skip_layernorm.cc - scale_op.cc - slice_op.cc - hard_sigmoid_op.cc - hard_swish_op.cc - clip_op.cc - gather_op.cc - anchor_generator_op.cc - yolo_box_op.cc - yolo_box_head_op.cc - arg_max_op.cc - roi_align_op.cc - affine_channel_op.cc - multiclass_nms_op.cc - multiclass_nms3_op.cc - nearest_interp_op.cc - reshape_op.cc - reduce_op.cc - gather_nd_op.cc - tile_op.cc - conv3d_op.cc - mish_op.cc - nearest_interp_v2_op.cc - pool3d_op.cc - deformable_conv_op.cc - preln_emb_eltwise_layernorm.cc - strided_slice_op.cc - preln_skip_layernorm.cc - roll_op.cc - transformer_input_convert_op.cc - remove_padding_op.cc - recover_padding_op.cc + SRCS ${CONVERT_FILES} DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) diff --git a/paddle/fluid/inference/tensorrt/convert/sparse_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/sparse_fc_op.cc new file mode 100644 index 0000000000000..de9fd62300ff8 --- /dev/null +++ 
b/paddle/fluid/inference/tensorrt/convert/sparse_fc_op.cc @@ -0,0 +1,371 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/spmm_plugin.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * FC converter convert a sparse_fc op to a sparse_fc plugin in TRT. + */ +class SparseFcOpConverter : public OpConverter { + public: + nvinfer1::ILayer* reshape_before_fc(nvinfer1::ITensor* before_fc, + nvinfer1::Dims x_dim, int x_num_col_dims, + std::string output_name) { + // add shuffle before fc + nvinfer1::Dims reshape_before_fc_dim; + reshape_before_fc_dim.nbDims = x_num_col_dims + 3; + // padding shape "* x q x 1 x 1" + for (int i = 0; i < reshape_before_fc_dim.nbDims; i++) { + reshape_before_fc_dim.d[i] = 1; + } + for (int i = 0; i < x_dim.nbDims; i++) { + if (i < x_num_col_dims) { + reshape_before_fc_dim.d[i] = 0; + } else { + if (x_dim.d[i] < 0) { + reshape_before_fc_dim.d[x_num_col_dims] = -1; + break; + } + reshape_before_fc_dim.d[x_num_col_dims] *= x_dim.d[i]; + } + } + auto* reshape_before_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *before_fc); + reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); + reshape_before_fc_layer->setName( + ("sparse_fc_op_reshape_before_fc: Shuffle (Output: " + output_name + + ")") + .c_str()); + return reshape_before_fc_layer; + } + + nvinfer1::ILayer* reshape_after_fc(nvinfer1::ITensor* after_fc, + nvinfer1::Dims x_dim, int x_num_col_dims) { + // add shuffle after fc + nvinfer1::Dims reshape_after_fc_dim; + reshape_after_fc_dim.nbDims = x_num_col_dims + 1; + for (int i = 0; i < reshape_after_fc_dim.nbDims; i++) { + reshape_after_fc_dim.d[i] = 0; + } + auto* reshape_after_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *after_fc); + reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim); + return reshape_after_fc_layer; + } + + plugin::SpmmPluginDynamic* new_spmm_plugin(TensorRTEngine::Weight* weight, + TensorRTEngine::Weight* bias, + const std::string& activation_type, + nvinfer1::DataType type, + int outdim) { + plugin::SpmmPluginDynamic::Activation act = + plugin::SpmmPluginDynamic::Activation::kNone; + if (activation_type == "relu") { + act = plugin::SpmmPluginDynamic::Activation::kRelu; + } else if (activation_type == "gelu") { + act = plugin::SpmmPluginDynamic::Activation::kGelu; + } else if (activation_type != "") { + PADDLE_THROW(paddle::platform::errors::Fatal("unknown activation_type %s", + activation_type.c_str())); + } + return new plugin::SpmmPluginDynamic("CustomSpmmPluginDynamic", type, + outdim, weight->get(), bias->get(), + act); + } + + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + 
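    // (Editorial note.) Overall flow of this converter: read the input tensor
    // and the persistable weight from the scope, optionally set int8 dynamic
    // ranges, and then either fall back to a dense Convolution/FullyConnected
    // layer (when the input is already 4-D with x_num_col_dims == 1) or
    // reshape the input and register the SpmmPluginDynamic sparse kernel.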
VLOG(3) << "convert a sparse_fc op to tensorrt sparse_fc plugin"; + framework::OpDesc op_desc(op, nullptr); + auto output_name = op_desc.Output("Out").front(); + auto input_names = op_desc.InputNames(); + bool with_bias = input_names.size() >= 3; + std::string w_name = "Y"; + std::string i_name = "X"; + if (with_bias) { + w_name = "W"; + i_name = "Input"; + } + // Declare inputs + auto* X = engine_->GetITensor(op_desc.Input(i_name).front()); + auto x_dim = X->getDimensions(); + // Declare weights + auto* Y_v = scope.FindVar(op_desc.Input(w_name).front()); + PADDLE_ENFORCE_NOT_NULL( + Y_v, + platform::errors::NotFound( + "Can not find %s presistale var of sparse_fc in scope.", w_name)); + auto* Y_t = Y_v->GetMutable(); + int x_num_col_dims = + op_desc.HasAttr("x_num_col_dims") + ? BOOST_GET_CONST(int, op_desc.GetAttr("x_num_col_dims")) + : (op_desc.HasAttr("in_num_col_dims") + ? BOOST_GET_CONST(int, op_desc.GetAttr("in_num_col_dims")) + : 1); + const std::string activation_type = + op_desc.HasAttr("activation_type") + ? BOOST_GET_CONST(std::string, op_desc.GetAttr("activation_type")) + : ""; + float* weight_data = nullptr; + bool enable_int8 = op_desc.HasAttr("enable_int8"); + bool support_int8 = false; + if (op_desc.HasAttr("support_int8")) { + support_int8 = BOOST_GET_CONST(bool, op_desc.GetAttr("support_int8")); + } + float in_scale = 0; + if (enable_int8 || support_int8) { + if (enable_int8) { + in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")); + } else { + // attr X is generated by add_support_int8_pass + in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("X")); + } + engine_->SetTensorDynamicRange(X, in_scale); + } + weight_data = engine_->GetWeightCPUData(op_desc.Input(w_name).front(), Y_t); + + PADDLE_ENFORCE_EQ( + Y_t->dims().size(), 2UL, + platform::errors::InvalidArgument( + "The sparse_fc's weight should be a matrix with 2 dims, but " + "it's %d-dimensional.", + Y_t->dims().size())); // a matrix + int m = Y_t->dims()[0]; + int n = Y_t->dims()[1]; + auto tranpose_weight = [](const float* src, float* dst, int m, int n) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + dst[j * m + i] = src[i * n + j]; + } + } + }; + bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + auto regist_fc = [&](nvinfer1::ITensor* inputs, int n_output, + TensorRTEngine::Weight& weight, + TensorRTEngine::Weight& bias) { + if (enable_int8 || support_int8) { + // add conv1x1 layer + nvinfer1::DimsHW nv_ksize(1, 1); + auto* fc_layer_int8 = + TRT_ENGINE_ADD_LAYER(engine_, Convolution, *X, n_output, nv_ksize, + weight.get(), bias.get()); + if (activation_type == "relu") { + fc_layer_int8->setName( + ("ernie_fc_op_int8: Convolution (Output: " + output_name + ")") + .c_str()); + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("out_threshold"), true, + platform::errors::InvalidArgument( + "must have out threshold in fc layers in int8 mode")); + float out_scale = 0; + if (enable_int8) { + out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + } else { + out_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Out")); + } + engine_->SetTensorDynamicRange(fc_layer_int8->getOutput(0), + out_scale); + nvinfer1::IActivationLayer* relu_layer_int8 = TRT_ENGINE_ADD_LAYER( + engine_, Activation, *(fc_layer_int8->getOutput(0)), + nvinfer1::ActivationType::kRELU); + RreplenishLayerAndOutput(relu_layer_int8, "relu_after_ernie_fc_int8", + {output_name}, test_mode); + } else { + RreplenishLayerAndOutput(fc_layer_int8, + "ernie_fc_op_int8: Convolution", + 
{output_name}, test_mode); + } + } else { + // add fc layer + auto* fc_layer_float = TRT_ENGINE_ADD_LAYER( + engine_, FullyConnected, *X, n_output, weight.get(), bias.get()); + if (activation_type == "relu") { + fc_layer_float->setName( + ("ernie_fc_op_float: (Output: " + output_name + ")").c_str()); + nvinfer1::IActivationLayer* relu_layer_float = TRT_ENGINE_ADD_LAYER( + engine_, Activation, *(fc_layer_float->getOutput(0)), + nvinfer1::ActivationType::kRELU); + RreplenishLayerAndOutput(relu_layer_float, + "relu_after_ernie_fc_float", {output_name}, + test_mode); + } else { + RreplenishLayerAndOutput(fc_layer_float, "ernie_fc_op_float", + {output_name}, test_mode); + } + } + }; + auto regist_sparse_fc = [&](nvinfer1::ITensor* inputs, int n_output, + TensorRTEngine::Weight* weight, + TensorRTEngine::Weight* bias) { + if (enable_int8 || support_int8) { + // add conv layer + float out_scale = 0; + if (enable_int8) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("out_threshold"), true, + platform::errors::InvalidArgument( + "must have out threshold in sparse_fc layers in int8 mode")); + out_scale = BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + } else { + out_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Out")); + } + plugin::SpmmPluginDynamic* plugin = new_spmm_plugin( + weight, bias, activation_type, nvinfer1::DataType::kINT8, n); + std::vector plugin_inputs; + plugin_inputs.emplace_back(inputs); + auto fc_layer_int8 = engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin); + fc_layer_int8->setName( + ("sparse_fc_op_int8: (Output: " + output_name + ")").c_str()); + engine_->SetTensorDynamicRange(fc_layer_int8->getOutput(0), out_scale); + auto* fc_after_reshape_int8 = reshape_after_fc( + fc_layer_int8->getOutput(0), x_dim, x_num_col_dims); + + RreplenishLayerAndOutput(fc_after_reshape_int8, + "sparse_fc_op_int8_reshape_after_fc: Shuffle", + {output_name}, test_mode); + } else { + plugin::SpmmPluginDynamic* plugin = new_spmm_plugin( + weight, bias, activation_type, + with_fp16 ? 
nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, + n); + std::vector plugin_inputs; + plugin_inputs.emplace_back(inputs); + auto fc_layer_float = engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin); + fc_layer_float->setName( + ("sparse_fc_op_float: FullyConnected (Output: " + output_name + ")") + .c_str()); + auto* fc_after_reshape_float = reshape_after_fc( + fc_layer_float->getOutput(0), x_dim, x_num_col_dims); + + RreplenishLayerAndOutput(fc_after_reshape_float, + "shuffle_after_sparse_fc", {output_name}, + test_mode); + } + }; + + bool transpose_y = false; + if (op_desc.HasAttr("transpose_Y")) { + transpose_y = BOOST_GET_CONST(bool, op_desc.GetAttr("transpose_Y")); + } + int weight_w, weight_h; + if (!transpose_y) { + std::vector weight_data_tmp; + weight_data_tmp.reserve(Y_t->numel()); + memcpy(weight_data_tmp.data(), weight_data, Y_t->numel() * sizeof(float)); + tranpose_weight(weight_data_tmp.data(), weight_data, m, n); + weight_w = n; + weight_h = m; + } else { + weight_w = m; + weight_h = n; + } + size_t n_output = weight_w; + float* bias_data = nullptr; + int bias_num = 0; + if (with_bias) { + auto* b_v = scope.GetVar(op_desc.Input("Bias").front()); + auto* b_t = b_v->GetMutable(); + bias_data = engine_->GetWeightCPUData(op_desc.Input("Bias").front(), b_t); + bias_num = b_t->numel(); + } + // Running the TRT Static Shape mode: x_num_col_dims-1 + if (!engine_->with_dynamic_shape()) { + x_num_col_dims--; + } + // If use tensorrt'oss, the x_dim and x_num_col_dims need change, and can + // not add Shuffle layer in ernie's multihead. + // Sparse inference doesn't support variable length for now. + if (x_dim.nbDims == 4 && x_num_col_dims == 1) { + TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, + static_cast(weight_data), + static_cast(Y_t->numel())}; + weight.dims.assign({weight_w, weight_h}); + TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, + static_cast(bias_data), + static_cast(bias_num)}; + regist_fc(X, n_output, weight, bias); + } else { // need reshape input before and after fc + PADDLE_ENFORCE_GT( + x_dim.nbDims, x_num_col_dims, + platform::errors::InvalidArgument( + "Params and input dims mismatch. Paddle-TRT FC " + "converter expects x_dim.nbDims > x_num_col_dims, but " + "x_dim.nbDims : %d, x_num_col_dims : %d.", + x_dim.nbDims, x_num_col_dims)); + half* half_data = nullptr; + void* w_data = nullptr; + if (with_fp16) { + half_data = new half[Y_t->numel()]; + for (int i = 0; i < Y_t->numel(); i++) { + half_data[i] = static_cast(weight_data[i]); + } + w_data = static_cast(half_data); + } else { + w_data = static_cast(weight_data); + } + TensorRTEngine::Weight weight{ + with_fp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, + w_data, static_cast(Y_t->numel())}; + weight.dims.assign({weight_w, weight_h}); + void* b_data = nullptr; + if (with_bias) { + half* half_bias_data = nullptr; + if (with_fp16) { + half_bias_data = new half[bias_num]; + for (int i = 0; i < bias_num; i++) { + half_bias_data[i] = static_cast(bias_data[i]); + } + b_data = static_cast(half_bias_data); + } else { + b_data = static_cast(bias_data); + } + } + TensorRTEngine::Weight bias{ + with_fp16 ? 
nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, + b_data, static_cast(bias_num)}; + + auto* reshape_before_fc_layer = + reshape_before_fc(X, x_dim, x_num_col_dims, output_name); + auto* reshape_itensor = reshape_before_fc_layer->getOutput(0); + if (enable_int8 || support_int8) { + engine_->SetTensorDynamicRange(reshape_itensor, in_scale); + } + regist_sparse_fc(reshape_itensor, n_output, &weight, &bias); + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(sparse_fc, SparseFcOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/sparse_multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/sparse_multihead_matmul_op.cc new file mode 100644 index 0000000000000..3de8fad0206d7 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/sparse_multihead_matmul_op.cc @@ -0,0 +1,441 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See +the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/spmm_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class SparseMultiheadMatMulOpConverter : public OpConverter { + public: + plugin::SpmmPluginDynamic* new_spmm_plugin(TensorRTEngine::Weight* weight, + TensorRTEngine::Weight* bias, + nvinfer1::DataType type, + int outdim) { + plugin::SpmmPluginDynamic::Activation act = + plugin::SpmmPluginDynamic::Activation::kNone; + return new plugin::SpmmPluginDynamic("CustomSpmmPluginDynamic", type, + outdim, weight->get(), bias->get(), + act); + } + + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid sparse_multihead_matmul op to a corresponding " + "tensorrt " + "network structure"; + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input = engine_->GetITensor(op_desc.Input("Input").front()); + + // fc weights and fc bias + auto weight_name = op_desc.Input("W").front(); + auto bias_name = op_desc.Input("Bias").front(); + + auto* weight_v = scope.FindVar(weight_name); + auto* weight_t = weight_v->GetMutable(); + + auto* bias_v = scope.FindVar(bias_name); + auto* bias_t = bias_v->GetMutable(); + + float* weight_data = nullptr; + bool qkv2context_plugin_int8 = op_desc.HasAttr("qkv2context_plugin_int8"); + float in_scale = 0.; + + if (op_desc.HasAttr("Input_scale")) { + in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")); + engine_->SetTensorDynamicRange(input, in_scale); + } + weight_data = engine_->GetWeightCPUData(weight_name, weight_t); + + float* bias_data = engine_->GetWeightCPUData(bias_name, bias_t); + std::vector weight_data_tmp; + weight_data_tmp.reserve(weight_t->numel()); + memcpy(weight_data_tmp.data(), weight_data, + weight_t->numel() * sizeof(float)); + + // (hidden_in, 3, hidden_out) + const auto& weight_dims = weight_t->dims(); + + 
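    // (Editorial note.) The Q, K and V projection weights are fused along the
    // middle dimension of size three, so the single GEMM built below has an
    // output width of n = three * hidden_out.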
int hidden_in = weight_dims[0]; // channels_in + int three = weight_dims[1]; // channels_out + int hidden_out = weight_dims[2]; // channels_out + int m = hidden_in; + int n = three * hidden_out; + auto tranpose_weight = [](const float* src, float* dst, int m, int n) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + dst[j * m + i] = src[i * n + j]; + } + } + }; + tranpose_weight(weight_data_tmp.data(), weight_data, m, n); + + int head_number = BOOST_GET_CONST(int, op_desc.GetAttr("head_number")); + bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + + nvinfer1::ILayer* layer = nullptr; + auto output_name = op_desc.Output("Out")[0]; + bool flag_varseqlen = engine_->use_varseqlen() && + engine_->tensorrt_transformer_posid() != "" && + engine_->tensorrt_transformer_maskid() != ""; + if (engine_->with_dynamic_shape()) { + if (flag_varseqlen) { + if (engine_->precision() == AnalysisConfig::Precision::kFloat32) { + PADDLE_THROW(platform::errors::Fatal( + "use use_varseqlen must be int8 or half, not float32.")); + } + nvinfer1::Weights weight{nvinfer1::DataType::kFLOAT, + static_cast(weight_data), + static_cast(weight_t->numel())}; + nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, + static_cast(bias_data), + static_cast(bias_t->numel())}; + if (engine_->with_interleaved()) { + VLOG(4) << "fused multihead_matmul op: use_varseqlen and " + "with_interleaved"; + if (!op_desc.HasAttr("Input_scale")) { + PADDLE_THROW( + platform::errors::Fatal("use with_interleaved must be int8.")); + } + nvinfer1::ILayer* fc_layer = nullptr; + float dp_probs = 1.0 / 127.0; + nvinfer1::DimsHW nv_ksize(1, 1); + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *input, n, + nv_ksize, weight, bias); + fc_layer->setName( + ("Multihead: Convolution/FullyConnected: (Output: " + + output_name + ")") + .c_str()); + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("fc_out_threshold"), true, + platform::errors::InvalidArgument( + "must have out_threshold in multihead layers in int8 mode")); + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("fc_out_threshold")); + engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); + if (qkv2context_plugin_int8) { + dp_probs = + BOOST_GET_CONST(float, op_desc.GetAttr("dp_probs")) / 127.0; + } + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomQKVToContextPluginDynamic", "3"); + assert(creator != nullptr); + std::vector fields{ + {"hidden_size", &hidden_out, nvinfer1::PluginFieldType::kINT32, + 1}, + {"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, + 1}}; + if (qkv2context_plugin_int8) { + fields.push_back({"dq_probs", &dp_probs, + nvinfer1::PluginFieldType::kFLOAT32, 1}); + } + nvinfer1::PluginFieldCollection* plugin_collection = + static_cast(malloc( + sizeof(*plugin_collection) + + fields.size() * + sizeof(nvinfer1::PluginField))); // remember to free + plugin_collection->nbFields = static_cast(fields.size()); + plugin_collection->fields = fields.data(); + + auto plugin = creator->createPlugin("CustomQKVToContextPluginDynamic", + plugin_collection); + free(plugin_collection); + + std::vector plugin_inputs; + plugin_inputs.emplace_back(fc_layer->getOutput(0)); + if (engine_->Has("ernie_pos_name")) { + plugin_inputs.emplace_back(engine_->GetITensor( + engine_->Get("ernie_pos_name"))); + } else { + plugin_inputs.emplace_back(engine_->GetITensor( + engine_->network() + ->getInput(2) + ->getName())); // cu_seqlens, eval_placeholder_2 + } + auto max_seqlen_tensor = + 
engine_->GetITensor(engine_->network()->getInput(3)->getName()); + engine_->SetTensorDynamicRange(max_seqlen_tensor, 1.0f); + auto* shuffle_layer = TRT_ENGINE_ADD_LAYER( + engine_, Shuffle, + *const_cast(max_seqlen_tensor)); + nvinfer1::Dims shape_dim; + shape_dim.nbDims = 1; + shape_dim.d[0] = -1; + shuffle_layer->setReshapeDimensions(shape_dim); + engine_->SetTensorDynamicRange(shuffle_layer->getOutput(0), 1.0f); + plugin_inputs.emplace_back( + shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3 + shuffle_layer->setName( + ("Multihead: Shuffle: (Output: " + output_name + ")").c_str()); + auto plugin_layer = engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin); + layer = plugin_layer; + } else { + int head_size = hidden_out / head_number; + // [3, head_number, head_size, hidden_in] -> [head_number, 3, + // head_size, + // hidden_in] + auto transpose_weight_v2 = [](const float* src, float* dst, int three, + int head_number, int head_size, + int hidden_in) { + const int HH = head_size * hidden_in; + for (int i = 0; i < three; ++i) { + for (int n = 0; n < head_number; ++n) { + for (int hh = 0; hh < HH; ++hh) { + dst[n * three * HH + i * HH + hh] = + src[i * head_number * HH + n * HH + hh]; + } + } + } + }; + // [3, head_number, head_size] -> [head_number, 3, head_size] + auto transpose_bias_v2 = [](const float* src, float* dst, int N, + int H) { + for (int i = 0; i < 3; ++i) { + for (int n = 0; n < N; ++n) { + for (int h = 0; h < H; ++h) { + dst[n * 3 * H + i * H + h] = src[i * N * H + n * H + h]; + } + } + } + }; + memcpy(weight_data_tmp.data(), weight_data, + weight_t->numel() * sizeof(float)); + transpose_weight_v2(weight_data_tmp.data(), weight_data, three, + head_number, head_size, hidden_in); + + std::vector bias_data_tmp; + bias_data_tmp.reserve(bias_t->numel()); + memcpy(bias_data_tmp.data(), bias_data, + bias_t->numel() * sizeof(float)); + transpose_bias_v2(bias_data_tmp.data(), bias_data, head_number, + head_size); + + nvinfer1::ILayer* fc_layer = nullptr; + float dp_probs = 1.0 / 127.0; + if (op_desc.HasAttr("Input_scale")) { + nvinfer1::DimsHW nv_ksize(1, 1); + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *input, n, + nv_ksize, weight, bias); + } else { + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, n, + weight, bias); + } + + if (op_desc.HasAttr("fc_out_threshold")) { + PADDLE_ENFORCE_EQ(op_desc.HasAttr("fc_out_threshold"), true, + platform::errors::InvalidArgument( + "must have out threshold in multihead layers " + "in int8 mode")); + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("fc_out_threshold")); + engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); + if (qkv2context_plugin_int8) { + dp_probs = + BOOST_GET_CONST(float, op_desc.GetAttr("dp_probs")) / 127.0; + } + } + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomQKVToContextPluginDynamic", "2"); + assert(creator != nullptr); + int type = static_cast(nvinfer1::DataType::kHALF); + if (qkv2context_plugin_int8 && + (engine_->precision() == AnalysisConfig::Precision::kInt8)) { + type = static_cast(nvinfer1::DataType::kINT8); + } + bool has_mask = true; + int var_seqlen = 1; + std::vector fields{ + {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, + {"hidden_size", &hidden_out, nvinfer1::PluginFieldType::kINT32, + 1}, + {"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1}, + {"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1}, + {"var_seqlen", &var_seqlen, 
nvinfer1::PluginFieldType::kINT32, + 1}}; + if (qkv2context_plugin_int8) { + fields.push_back({"dq_probs", &dp_probs, + nvinfer1::PluginFieldType::kFLOAT32, 1}); + } + nvinfer1::PluginFieldCollection* plugin_collection = + static_cast(malloc( + sizeof(*plugin_collection) + + fields.size() * + sizeof(nvinfer1::PluginField))); // remember to free + plugin_collection->nbFields = static_cast(fields.size()); + plugin_collection->fields = fields.data(); + + auto plugin = creator->createPlugin("CustomQKVToContextPluginDynamic", + plugin_collection); + free(plugin_collection); + + std::vector plugin_inputs; + plugin_inputs.emplace_back(fc_layer->getOutput(0)); + plugin_inputs.emplace_back(engine_->GetITensor("qkv_plugin_mask")); + plugin_inputs.emplace_back(engine_->GetITensor("pos_id")); + + auto max_seqlen_tensor = engine_->GetITensor("mask_id"); + auto* shuffle_layer = TRT_ENGINE_ADD_LAYER( + engine_, Shuffle, + *const_cast(max_seqlen_tensor)); + nvinfer1::Dims shape_dim; + shape_dim.nbDims = 1; + shape_dim.d[0] = -1; + shuffle_layer->setReshapeDimensions(shape_dim); + engine_->SetTensorDynamicRange(shuffle_layer->getOutput(0), 1.0f); + plugin_inputs.emplace_back( + shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3 + + auto plugin_layer = engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin); + layer = plugin_layer; + } + } else { + PADDLE_ENFORCE_EQ( + input->getDimensions().nbDims, 3, + platform::errors::InvalidArgument( + "The Input dim of the SparseMultiheadMatMul should be 3, " + "but it's (%d) now.", + input->getDimensions().nbDims)); + // transpose weight_data from m * n to n * m + auto* input_bias_qk = + engine_->GetITensor(op_desc.Input("BiasQK").front()); + + half* half_data = nullptr; + void* w_data = nullptr; + if (with_fp16) { + half_data = new half[weight_t->numel()]; + for (int i = 0; i < weight_t->numel(); i++) { + half_data[i] = static_cast(weight_data[i]); + } + w_data = static_cast(half_data); + } else { + w_data = static_cast(weight_data); + } + + TensorRTEngine::Weight weight{ + with_fp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, + static_cast(w_data), static_cast(weight_t->numel())}; + weight.dims.assign({n, m}); + + half* half_bias_data = nullptr; + void* b_data = nullptr; + if (with_fp16) { + half_bias_data = new half[bias_t->numel()]; + for (int i = 0; i < bias_t->numel(); i++) { + half_bias_data[i] = static_cast(bias_data[i]); + } + b_data = static_cast(half_bias_data); + } else { + b_data = static_cast(bias_data); + } + + TensorRTEngine::Weight bias{ + with_fp16 ? 
nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, + b_data, static_cast(bias_t->numel())}; + + // add shuffle before fc + nvinfer1::Dims reshape_before_fc_dim; + reshape_before_fc_dim.nbDims = 5; + reshape_before_fc_dim.d[0] = 0; + reshape_before_fc_dim.d[1] = 0; + reshape_before_fc_dim.d[2] = 0; + reshape_before_fc_dim.d[3] = 1; + reshape_before_fc_dim.d[4] = 1; + auto* reshape_before_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + if (op_desc.HasAttr("Input_scale")) { + engine_->SetTensorDynamicRange(reshape_before_fc_layer->getOutput(0), + in_scale); + } + reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); + reshape_before_fc_layer->setName( + ("shuffle_before_sparse_multihead_mamul(Output: " + output_name + + ")") + .c_str()); + + // add layer fc + nvinfer1::ILayer* fc_layer = nullptr; + if (op_desc.HasAttr("Input_scale")) { + plugin::SpmmPluginDynamic* plugin = + new_spmm_plugin(&weight, &bias, nvinfer1::DataType::kINT8, n); + std::vector plugin_inputs; + plugin_inputs.emplace_back(reshape_before_fc_layer->getOutput(0)); + fc_layer = engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin); + } else { + plugin::SpmmPluginDynamic* plugin = + new_spmm_plugin(&weight, &bias, + with_fp16 ? nvinfer1::DataType::kHALF + : nvinfer1::DataType::kFLOAT, + n); + std::vector plugin_inputs; + plugin_inputs.emplace_back(reshape_before_fc_layer->getOutput(0)); + fc_layer = engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin); + } + + if (op_desc.HasAttr("fc_out_threshold")) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("fc_out_threshold"), true, + platform::errors::InvalidArgument( + "must have out threshold in multihead layers in int8 mode")); + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("fc_out_threshold")); + engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); + } + fc_layer->setName( + ("sparse_multihead_mamul_fc(Output: " + output_name + ")").c_str()); + + // no need to add shuffle after fc, just change it in + // QkvToContextPluginDynamic + + // add qkv to context + int head_size = hidden_out / head_number; + float scale = BOOST_GET_CONST(float, op_desc.GetAttr("alpha")); + + std::vector plugin_inputs; + plugin_inputs.push_back(fc_layer->getOutput(0)); + plugin_inputs.push_back(input_bias_qk); + + if (engine_->precision() == AnalysisConfig::Precision::kInt8) { + with_fp16 = true; + } + plugin::DynamicPluginTensorRT* plugin = + new plugin::QkvToContextPluginDynamic(hidden_in, head_number, + head_size, scale, with_fp16); + layer = engine_->AddDynamicPlugin(plugin_inputs.data(), 2, plugin); + } + } else { + PADDLE_THROW(platform::errors::Fatal( + "You are running the Ernie(Bert) model in static shape mode, which " + "is not supported for the time being.\n" + "You can use the config.SetTRTDynamicShapeInfo(...) 
interface to set " + "the shape information to run the dynamic shape mode.")); + } + RreplenishLayerAndOutput(layer, "multihead_matmul", {output_name}, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(sparse_multihead_matmul, + SparseMultiheadMatMulOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index dc7c77bc66acf..57ac400dadab3 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -46,6 +46,12 @@ struct SimpleOpTypeSetTeller : public Teller { teller_set.insert("reshape2"); int8_teller_set.insert("reshape"); int8_teller_set.insert("reshape2"); +#endif +#if IS_TRT_VERSION_GE(8000) + teller_set.insert("sparse_fc"); + int8_teller_set.insert("sparse_fc"); + teller_set.insert("sparse_multihead_matmul"); + int8_teller_set.insert("sparse_multihead_matmul"); #endif } @@ -1753,6 +1759,16 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } +#if IS_TRT_VERSION_GE(8000) + if (op_type == "sparse_fc" || op_type == "sparse_multihead_matmul") { + if (!with_dynamic_shape) { + VLOG(3) << "the sparse_fc and sparse_multihead_matmul does not support " + "static shape yet"; + return false; + } + } +#endif + if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 0377c82838bdd..5ee70ee824101 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1,32 +1,41 @@ +list( + APPEND + TRT_FILES + trt_plugin.cc + split_op_plugin.cu + elementwise_op_plugin.cu + prelu_op_plugin.cu + gelu_op_plugin.cu + pool_op_plugin.cu + swish_op_plugin.cu + layer_norm_op_plugin.cu + instance_norm_op_plugin.cu + emb_eltwise_layernorm_plugin.cu + qkv_to_context_plugin.cu + skip_layernorm_op_plugin.cu + slice_op_plugin.cu + hard_swish_op_plugin.cu + stack_op_plugin.cu + anchor_generator_op_plugin.cu + yolo_box_op_plugin.cu + yolo_box_head_op_plugin.cu + roi_align_op_plugin.cu + gather_nd_op_plugin.cu + mish_op_plugin.cu + pool3d_op_plugin.cu + deformable_conv_op_plugin.cu + matmul_op_int8_plugin.cu + transformer_input_convert_plugin.cu + remove_padding_plugin.cu + recover_padding_plugin.cu) + +if(CUSPARSELT_FOUND AND ${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 8) + list(APPEND TRT_FILES spmm_plugin.cu) +endif() + nv_library( tensorrt_plugin - SRCS trt_plugin.cc - split_op_plugin.cu - elementwise_op_plugin.cu - prelu_op_plugin.cu - gelu_op_plugin.cu - pool_op_plugin.cu - swish_op_plugin.cu - layer_norm_op_plugin.cu - instance_norm_op_plugin.cu - emb_eltwise_layernorm_plugin.cu - qkv_to_context_plugin.cu - skip_layernorm_op_plugin.cu - slice_op_plugin.cu - hard_swish_op_plugin.cu - stack_op_plugin.cu - anchor_generator_op_plugin.cu - yolo_box_op_plugin.cu - yolo_box_head_op_plugin.cu - roi_align_op_plugin.cu - gather_nd_op_plugin.cu - mish_op_plugin.cu - pool3d_op_plugin.cu - deformable_conv_op_plugin.cu - matmul_op_int8_plugin.cu - transformer_input_convert_plugin.cu - remove_padding_plugin.cu - recover_padding_plugin.cu + SRCS ${TRT_FILES} DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) nv_test( diff --git a/paddle/fluid/inference/tensorrt/plugin/spmm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/spmm_plugin.cu new file mode 100644 index 0000000000000..4058d6564fc8d --- /dev/null 
+++ b/paddle/fluid/inference/tensorrt/plugin/spmm_plugin.cu @@ -0,0 +1,923 @@ +/* +Copyright (c) 2022, PaddlePaddle Authors, NVIDIA CORPORATION. All Rights +Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See +the License for the specific language governing permissions and +limitations under the License. +*/ +#include "paddle/fluid/inference/tensorrt/plugin/spmm_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +nvinfer1::PluginFieldCollection SpmmPluginDynamicCreator::field_collection_{}; +std::vector SpmmPluginDynamicCreator::plugin_attr_; + +inline int getElementSize(nvinfer1::DataType type) { + switch (type) { + case nvinfer1::DataType::kFLOAT: + return 4; + case nvinfer1::DataType::kHALF: + return 2; + case nvinfer1::DataType::kINT8: + return 1; + default: + PADDLE_THROW(paddle::platform::errors::Fatal( + "getElementSize only supports [FLOAT|HALF|INT8]")); + } +} + +inline cudaDataType_t convertTrtType(nvinfer1::DataType type) { + switch (type) { + case nvinfer1::DataType::kFLOAT: + return CUDA_R_32F; + case nvinfer1::DataType::kHALF: + return CUDA_R_16F; + case nvinfer1::DataType::kINT8: + return CUDA_R_8I; + default: + PADDLE_THROW(paddle::platform::errors::Fatal( + "getElementSize only supports [FLOAT|HALF|INT8]")); + } +} + +inline void deserialize_value_size(void const** buffer, size_t* buffer_size, + void* value, size_t value_size) { + PADDLE_ENFORCE_GE( + *buffer_size, value_size, + platform::errors::InvalidArgument("buffer_size must >= value_size")); + memcpy(value, *buffer, value_size); + reinterpret_cast(*buffer) += value_size; + *buffer_size -= value_size; +} + +inline float round_scale(float x) { return std::floor(x + 0.5f); } + +inline void cudaFreeFunc(void* p) { + if (p) { + cudaFree(p); + } +} + +inline void convertAndCopy(const nvinfer1::Weights& src, + nvinfer1::DataType type, void* dest) { + PADDLE_ENFORCE_EQ(src.type == nvinfer1::DataType::kFLOAT || + src.type == nvinfer1::DataType::kHALF, + true, + platform::errors::InvalidArgument( + "convertAndCopy only supports src type [FLOAT|HALF]")); + PADDLE_ENFORCE_EQ( + type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF, + true, + platform::errors::InvalidArgument( + "convertAndCopy only supports src type [FLOAT|HALF]")); + + if (type == nvinfer1::DataType::kFLOAT) { + if (src.type == nvinfer1::DataType::kFLOAT) { + std::copy_n(static_cast(src.values), src.count, + static_cast(dest)); + } else { + for (int i = 0; i < src.count; ++i) { + static_cast(dest)[i] = + static_cast(static_cast(src.values)[i]); + } + } + } else { + if (src.type == nvinfer1::DataType::kHALF) { + std::copy_n(static_cast(src.values), src.count, + static_cast<__half*>(dest)); + } else { + for (int i = 0; i < src.count; ++i) { + static_cast<__half*>(dest)[i] = + static_cast<__half>(static_cast(src.values)[i]); + } + } + } +} + +SpmmPluginDynamic::cusparseLtContext::cusparseLtContext() { + paddle::platform::dynload::cusparseLtInit(&handle); +} + +SpmmPluginDynamic::cusparseLtContext::~cusparseLtContext() { + paddle::platform::dynload::cusparseLtDestroy(&handle); +} + +void 
SpmmPluginDynamic::cusparseLtContext::init( + int m, int n, int k, cudaDataType_t type, void* bias_ptr, + SpmmPluginDynamic::Activation activation) { + /* + 1. Init matrix descriptors (matA, matB, matC) + 2. Init matrix multiplication descriptor (matmul) + 3. Set activation and bias attribute of matmul + 4. Init algorithm selection descriptor (alg_sel) + 5. Init plan descriptor (plan) + */ + PADDLE_ENFORCE_EQ( + is_initialized, false, + platform::errors::InvalidArgument( + "Descriptor should be destroyed before calling create")); + constexpr int alignment = 16; + cusparseComputeType compute_type; + switch (type) { + case CUDA_R_32F: + compute_type = CUSPARSE_COMPUTE_TF32; + break; + case CUDA_R_16F: + compute_type = CUSPARSE_COMPUTE_16F; + break; + case CUDA_R_8I: + compute_type = CUSPARSE_COMPUTE_32I; + break; + default: + PADDLE_THROW(paddle::platform::errors::Fatal( + "cusparLtContext only supports data type" + "[CUDA_R_32F|CUDA_R_16F|CUDA_R_8I]")); + } + paddle::platform::dynload::cusparseLtDenseDescriptorInit( + &handle, &matA, m, k, k, alignment, type, CUSPARSE_ORDER_ROW); + paddle::platform::dynload::cusparseLtStructuredDescriptorInit( + &handle, &matB, n, k, k, alignment, type, CUSPARSE_ORDER_ROW, + CUSPARSELT_SPARSITY_50_PERCENT); + paddle::platform::dynload::cusparseLtDenseDescriptorInit( + &handle, &matC, m, n, n, alignment, type, CUSPARSE_ORDER_ROW); + paddle::platform::dynload::cusparseLtMatmulDescriptorInit( + &handle, &matmul, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_TRANSPOSE, &matA, &matB, &matC, &matC, compute_type); + if (activation == SpmmPluginDynamic::Activation::kRelu) { + int true_value = 1; + float relu_upper_bound = std::numeric_limits::max(); + float relu_threshold = 0.0f; + paddle::platform::dynload::cusparseLtMatmulDescSetAttribute( + &handle, &matmul, CUSPARSELT_MATMUL_ACTIVATION_RELU, &true_value, + sizeof(true_value)); + paddle::platform::dynload::cusparseLtMatmulDescSetAttribute( + &handle, &matmul, CUSPARSELT_MATMUL_ACTIVATION_RELU_UPPERBOUND, + &relu_upper_bound, sizeof(relu_upper_bound)); + paddle::platform::dynload::cusparseLtMatmulDescSetAttribute( + &handle, &matmul, CUSPARSELT_MATMUL_ACTIVATION_RELU_THRESHOLD, + &relu_threshold, sizeof(relu_threshold)); + } else if (activation == SpmmPluginDynamic::Activation::kGelu) { + int true_value = 1; + paddle::platform::dynload::cusparseLtMatmulDescSetAttribute( + &handle, &matmul, CUSPARSELT_MATMUL_ACTIVATION_GELU, &true_value, + sizeof(true_value)); + } else { + PADDLE_ENFORCE_EQ( + activation, SpmmPluginDynamic::Activation::kNone, + platform::errors::InvalidArgument("Received unknown activation")); + } + if (bias_ptr != nullptr) { + paddle::platform::dynload::cusparseLtMatmulDescSetAttribute( + &handle, &matmul, CUSPARSELT_MATMUL_BIAS_POINTER, &bias_ptr, + sizeof(bias_ptr)); + } + paddle::platform::dynload::cusparseLtMatmulAlgSelectionInit( + &handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT); + int alg = 0; + paddle::platform::dynload::cusparseLtMatmulAlgSetAttribute( + &handle, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg)); + paddle::platform::dynload::cusparseLtMatmulGetWorkspace(&handle, &alg_sel, + &workspace_size); + paddle::platform::dynload::cusparseLtMatmulPlanInit(&handle, &plan, &matmul, + &alg_sel, workspace_size); + is_initialized = true; +} + +void SpmmPluginDynamic::cusparseLtContext::setAlgo(int alg) { + PADDLE_ENFORCE_EQ( + is_initialized, true, + platform::errors::InvalidArgument( + "Descriptor should be initialized before setting algorithm")); + 
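  // (Editorial note.) Changing the algorithm below also re-queries the
  // workspace size, destroys the old matmul plan and re-initializes it,
  // because the existing plan is tied to the previous algorithm choice.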
paddle::platform::dynload::cusparseLtMatmulAlgSetAttribute( + &handle, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg)); + paddle::platform::dynload::cusparseLtMatmulGetWorkspace(&handle, &alg_sel, + &workspace_size); + paddle::platform::dynload::cusparseLtMatmulPlanDestroy(&plan); + paddle::platform::dynload::cusparseLtMatmulPlanInit(&handle, &plan, &matmul, + &alg_sel, workspace_size); +} + +void SpmmPluginDynamic::cusparseLtContext::destroy() { + PADDLE_ENFORCE_EQ(is_initialized, true, + platform::errors::InvalidArgument( + "cusparseLtContext is destroy before init")); + paddle::platform::dynload::cusparseLtMatmulPlanDestroy(&plan); + paddle::platform::dynload::cusparseLtMatDescriptorDestroy(&matC); + paddle::platform::dynload::cusparseLtMatDescriptorDestroy(&matB); + paddle::platform::dynload::cusparseLtMatDescriptorDestroy(&matA); + is_initialized = false; +} + +void SpmmPluginDynamic::cusparseLtContext::compressMatB( + int n, int k, cudaDataType_t type, void* src, void** dest, + size_t* compressed_size) { + PADDLE_ENFORCE_EQ( + is_initialized, false, + platform::errors::InvalidArgument( + "cusparseLtContext should not initialized before compressMatB")); + PADDLE_ENFORCE_EQ(*dest, nullptr, + platform::errors::InvalidArgument( + "before compressMatB *dest must be nullptr")); + constexpr int alignment = 16; + paddle::platform::dynload::cusparseLtStructuredDescriptorInit( + &handle, &matB, n, k, k, alignment, type, CUSPARSE_ORDER_ROW, + CUSPARSELT_SPARSITY_50_PERCENT); + + paddle::platform::dynload::cusparseLtSpMMACompressedSize2(&handle, &matB, + compressed_size); + cudaMalloc(dest, *compressed_size); + paddle::platform::dynload::cusparseLtSpMMACompress2( + &handle, &matB, 0, CUSPARSE_OPERATION_TRANSPOSE, src, *dest, nullptr); + paddle::platform::dynload::cusparseLtMatDescriptorDestroy(&matB); +} + +// Constructor for new plugin +SpmmPluginDynamic::SpmmPluginDynamic(const std::string& layer_name, + const nvinfer1::DataType precision, + const int out_dim, + const nvinfer1::Weights& weight, + const nvinfer1::Weights& bias, + Activation activation) + : layer_name_(layer_name), + precision_(precision), + out_dim_(out_dim), + k_(0), + m_max_(0), + is_configured_(false), + optim_alg_(0), + weight_scale_(1.0f), + weight_compressed_(nullptr), + weight_compressed_dev_(nullptr), + weight_compressed_dev_global_(nullptr), + compressed_size_(0), + has_bias_(false), + bias_(nullptr), + bias_dev_(nullptr), + activation_(activation) { + /* + 1. Convert weight precision (on host) + 2. (Int8) Calculate scale and scale the weight (on host) + 3. Copy weight to device + 4. Compress the weight (on device) + 5. Reset the shared_ptr "weight_compressed_dev_global_" to the compressed + weight + 6. Copy the compressed weight to host + 7. Convert bias precision and copy (on host) + */ + precision_size_ = getElementSize(precision); + element_size_ = + (precision_ == nvinfer1::DataType::kINT8 ? 
4 : precision_size_); + + PADDLE_ENFORCE_EQ( + weight.count % out_dim, 0, + platform::errors::InvalidArgument( + "The size of weight should be divided by output dimension.")); + k_ = weight.count / out_dim; + PADDLE_ENFORCE_EQ( + weight.type == nvinfer1::DataType::kFLOAT || + weight.type == nvinfer1::DataType::kHALF, + true, + platform::errors::InvalidArgument( + "SpmmPluginDynamic only supports weight of type [FLOAT|HALF]")); + nvinfer1::DataType weight_type; + if (precision_ == nvinfer1::DataType::kINT8) { + weight_type = nvinfer1::DataType::kFLOAT; + } else { + weight_type = precision_; + } + std::vector weight_host(element_size_ * out_dim_ * k_); + convertAndCopy(weight, weight_type, weight_host.data()); + void* weight_dev{nullptr}; + cudaMalloc(reinterpret_cast(&weight_dev), + precision_size_ * out_dim_ * k_); + if (precision == nvinfer1::DataType::kINT8) { + float max_weight{0.0f}; + for (int i = 0; i < weight.count; ++i) { + float local_abs = + std::abs(reinterpret_cast(weight_host.data())[i]); + max_weight = std::max(max_weight, local_abs); + } + weight_scale_ = max_weight / 127.0f; + std::vector scale_buffer(weight.count); + for (int i = 0; i < weight.count; ++i) { + scale_buffer[i] = static_cast( + round_scale(reinterpret_cast(weight_host.data())[i] / + weight_scale_)); + } + cudaMemcpy(weight_dev, scale_buffer.data(), precision_size_ * weight.count, + cudaMemcpyHostToDevice); + } else { + cudaMemcpy(weight_dev, weight_host.data(), precision_size_ * weight.count, + cudaMemcpyHostToDevice); + } + spmm_context_.compressMatB(out_dim_, k_, convertTrtType(precision_), + weight_dev, &weight_compressed_dev_, + &compressed_size_); + weight_compressed_ = new char[compressed_size_]; + weight_compressed_dev_global_.reset(weight_compressed_dev_, cudaFreeFunc); + cudaMemcpy(weight_compressed_, weight_compressed_dev_global_.get(), + compressed_size_, cudaMemcpyDeviceToHost); + has_bias_ = (bias.count != 0); + if (has_bias_) { + if (bias.count != out_dim) { + PADDLE_THROW(paddle::platform::errors::Fatal( + "The dimension of bias should be equal to output dimension")); + } + if (precision_ == nvinfer1::DataType::kHALF) { + bias_ = new half[out_dim_]; + convertAndCopy(bias, nvinfer1::DataType::kHALF, bias_); + } else { + bias_ = new float[out_dim_]; + convertAndCopy(bias, nvinfer1::DataType::kFLOAT, bias_); + } + } + + cudaFree(weight_dev); +} + +// Constructor for clone +SpmmPluginDynamic::SpmmPluginDynamic(const std::string& layer_name, + const nvinfer1::DataType precision, + const int out_dim, const int k, + const void* weight_compressed, + size_t compressed_size, const void* bias, + bool is_configured, const int m_max, + const int optim_alg, Activation activation) + : layer_name_(layer_name), + precision_(precision), + out_dim_(out_dim), + k_(k), + m_max_(m_max), + is_configured_(is_configured), + optim_alg_(optim_alg), + weight_scale_(1.0f), + weight_compressed_(nullptr), + weight_compressed_dev_global_(nullptr), + compressed_size_(compressed_size), + has_bias_(false), + bias_(nullptr), + bias_dev_(nullptr), + activation_(activation) { + /* + 1. Copy the compressed weight (on host) + 2. Copy the bias (on host) + 3. (Configured) Copy the bias to device + 4. (Configured) Init cuSPARSELt descriptors + */ + precision_size_ = getElementSize(precision); + element_size_ = + (precision_ == nvinfer1::DataType::kINT8 ? 
4 : precision_size_); + // Each plugin has a copy of compressed weight on host, while sharing the + // compressed weights on device using std::shared_ptr + weight_compressed_ = new char[compressed_size]; + std::copy_n(static_cast(weight_compressed), compressed_size, + static_cast(weight_compressed_)); + + has_bias_ = (bias != nullptr); + if (has_bias_) { + // Each plugin has a copy of bias + bias_ = new float[out_dim_]; + std::copy_n(static_cast(bias), sizeof(float) * out_dim_, + static_cast(bias_)); + if (is_configured_) { + cudaMalloc(reinterpret_cast(&bias_dev_), + sizeof(float) * out_dim_); + cudaMemcpy(bias_dev_, bias_, sizeof(float) * out_dim_, + cudaMemcpyHostToDevice); + } + } + + if (is_configured_) { + cudaDataType_t dataType = convertTrtType(precision_); + spmm_context_.init(m_max_, out_dim_, k_, dataType, bias_dev_, activation_); + spmm_context_.setAlgo(optim_alg_); + } +} + +SpmmPluginDynamic::SpmmPluginDynamic(const std::string name, const void* data, + size_t length) + : layer_name_(name), + weight_compressed_(nullptr), + weight_compressed_dev_(nullptr), + weight_compressed_dev_global_(nullptr), + bias_(nullptr), + bias_dev_(nullptr) { + DeserializeValue(&data, &length, &precision_); + DeserializeValue(&data, &length, &precision_size_); + DeserializeValue(&data, &length, &element_size_); + DeserializeValue(&data, &length, &out_dim_); + DeserializeValue(&data, &length, &k_); + DeserializeValue(&data, &length, &m_max_); + DeserializeValue(&data, &length, &is_configured_); + DeserializeValue(&data, &length, &optim_alg_); + DeserializeValue(&data, &length, &weight_scale_); + DeserializeValue(&data, &length, &compressed_size_); + DeserializeValue(&data, &length, &has_bias_); + DeserializeValue(&data, &length, &activation_); + + PADDLE_ENFORCE_EQ(is_configured_, true, + platform::errors::InvalidArgument( + "Deserialize data should be configured")); + weight_compressed_ = new char[compressed_size_]; + deserialize_value_size(&data, &length, weight_compressed_, compressed_size_); + cudaMalloc(reinterpret_cast(&weight_compressed_dev_), + compressed_size_); + cudaMemcpy(weight_compressed_dev_, weight_compressed_, compressed_size_, + cudaMemcpyHostToDevice); + weight_compressed_dev_global_.reset(weight_compressed_dev_, cudaFreeFunc); + + if (has_bias_) { + bias_ = new float[out_dim_]; + deserialize_value_size(&data, &length, bias_, sizeof(float) * out_dim_); + cudaMalloc(reinterpret_cast(&bias_dev_), sizeof(float) * out_dim_); + cudaMemcpy(bias_dev_, bias_, sizeof(float) * out_dim_, + cudaMemcpyHostToDevice); + } + + if (is_configured_) { + cudaDataType_t dataType = convertTrtType(precision_); + spmm_context_.init(m_max_, out_dim_, k_, dataType, bias_dev_, activation_); + spmm_context_.setAlgo(optim_alg_); + } +} + +nvinfer1::IPluginV2DynamicExt* SpmmPluginDynamic::clone() const noexcept { + try { + auto* p = + new SpmmPluginDynamic(layer_name_, precision_, out_dim_, k_, + weight_compressed_, compressed_size_, bias_, + is_configured_, m_max_, optim_alg_, activation_); + p->weight_scale_ = weight_scale_; + p->weight_compressed_dev_global_ = weight_compressed_dev_global_; + p->setPluginNamespace(namespace_.c_str()); + return p; + } catch (const std::exception& e) { + std::cerr << e.what() << std::endl; + } + return nullptr; +} + +nvinfer1::DimsExprs SpmmPluginDynamic::getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) noexcept { + int nbDims = inputs[0].nbDims; + try { + PADDLE_ENFORCE_EQ(nbInputs, 1, + 
platform::errors::InvalidArgument( + "SpmmPluginDynamic's nbInputs is invalid")); + PADDLE_ENFORCE_EQ(outputIndex, 0, + platform::errors::InvalidArgument( + "SpmmPluginDynamic's outputIndex is invalid")); + if (nbDims == 5) { + int nbDims = inputs[0].nbDims; + PADDLE_ENFORCE_EQ( + inputs[0].d[3]->getConstantValue(), 1, + platform::errors::InvalidArgument("now the input d[3] should be 1")); + PADDLE_ENFORCE_EQ( + inputs[0].d[4]->getConstantValue(), 1, + platform::errors::InvalidArgument("now the input d[4] should be 1")); + nvinfer1::DimsExprs ret; + ret.nbDims = nbDims; + ret.d[0] = inputs[0].d[0]; + ret.d[1] = inputs[0].d[1]; + ret.d[2] = exprBuilder.constant(out_dim_); + ret.d[3] = exprBuilder.constant(1); + ret.d[4] = exprBuilder.constant(1); + return ret; + } else if (nbDims == 4) { + int nbDims = inputs[0].nbDims; + PADDLE_ENFORCE_EQ( + inputs[0].d[2]->getConstantValue(), 1, + platform::errors::InvalidArgument("now the input d[2] should be 1")); + PADDLE_ENFORCE_EQ( + inputs[0].d[3]->getConstantValue(), 1, + platform::errors::InvalidArgument("now the input d[3] should be 1")); + nvinfer1::DimsExprs ret; + ret.nbDims = nbDims; + ret.d[0] = inputs[0].d[0]; + ret.d[1] = exprBuilder.constant(out_dim_); + ret.d[2] = exprBuilder.constant(1); + ret.d[3] = exprBuilder.constant(1); + + return ret; + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("nbDims should be 4 or 5")); + } + } catch (const std::exception& e) { + std::cerr << e.what() << std::endl; + } + return nvinfer1::DimsExprs{}; +} + +bool SpmmPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, + int nbOutputs) noexcept { + PADDLE_ENFORCE_EQ(nbInputs, 1, + platform::errors::InvalidArgument( + "SpmmPluginDynamic's nbInputs should be 1")); + PADDLE_ENFORCE_EQ(nbOutputs, 1, + platform::errors::InvalidArgument( + "SpmmPluginDynamic's nbOutputs should be 1")); + + const nvinfer1::PluginTensorDesc& in = inOut[pos]; + if (pos == 0) { + return (in.type == precision_) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } + const nvinfer1::PluginTensorDesc& prev = inOut[pos - 1]; + + return in.type == prev.type && in.format == prev.format; +} + +void SpmmPluginDynamic::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* inputs, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* outputs, int nbOutputs) noexcept { + /* + The following steps are executed if not configured. + 1. (INT8) Scale the bias (on host) + 2. Copy the bias to device + 3. 
Search the optimal algorithm + */ + try { + PADDLE_ENFORCE_EQ(nbInputs, 1, + platform::errors::InvalidArgument( + "SpmmPluginDynamic's nbInputs should be 1")); + PADDLE_ENFORCE_EQ(nbOutputs, 1, + platform::errors::InvalidArgument( + "SpmmPluginDynamic's nbOutputs should be 1")); + PADDLE_ENFORCE_EQ(precision_, inputs[0].desc.type, + platform::errors::InvalidArgument( + "precision_ should be equal to inputs[0].desc.type")); + const auto& inDims0 = inputs[0].desc.dims; + if (inDims0.nbDims == 5) { + PADDLE_ENFORCE_EQ( + inDims0.nbDims, 5, + platform::errors::InvalidArgument("inDims0.nbDims should be 5")); + PADDLE_ENFORCE_EQ(k_, inDims0.d[2], + platform::errors::InvalidArgument( + "inDims0.d[2] should be equals to k")); + PADDLE_ENFORCE_EQ( + inDims0.d[3], 1, + platform::errors::InvalidArgument("inDims0.d[3] should be 1")); + PADDLE_ENFORCE_EQ( + inDims0.d[4], 1, + platform::errors::InvalidArgument("inDims0.d[4] should be 1")); + const int BS = inputs->max.d[0]; + const int Seq = inputs->max.d[1]; + m_max_ = BS * Seq; + } else if (inDims0.nbDims == 4) { + PADDLE_ENFORCE_EQ( + inDims0.nbDims, 4, + platform::errors::InvalidArgument("inDims0.nbDims should be 4")); + PADDLE_ENFORCE_EQ(k_, inDims0.d[1], + platform::errors::InvalidArgument( + "inDims0.d[1] should be equals to k")); + PADDLE_ENFORCE_EQ( + inDims0.d[2], 1, + platform::errors::InvalidArgument("inDims0.d[2] should be 1")); + PADDLE_ENFORCE_EQ( + inDims0.d[3], 1, + platform::errors::InvalidArgument("inDims0.d[3] should be 1")); + const int BS_Seq = inputs->max.d[0]; + m_max_ = BS_Seq; + } + if (is_configured_) { + return; + } + + if (has_bias_) { + if (inputs->desc.type == nvinfer1::DataType::kINT8) { + for (int i = 0; i < out_dim_; ++i) { + static_cast(bias_)[i] = + static_cast(bias_)[i] / outputs->desc.scale; + } + } + cudaMalloc(reinterpret_cast(&bias_dev_), + sizeof(float) * out_dim_); + cudaMemcpy(bias_dev_, bias_, sizeof(float) * out_dim_, + cudaMemcpyHostToDevice); + } + cudaDataType_t dataType = convertTrtType(precision_); + spmm_context_.init(m_max_, out_dim_, k_, dataType, bias_dev_, activation_); + + void* dA; + void* dC; + void* d_workspace; + float alpha{1.0f}; + float beta{0.0f}; + if (precision_ == nvinfer1::DataType::kINT8) { + alpha = inputs->desc.scale * weight_scale_ / outputs->desc.scale; + } + cudaMalloc(reinterpret_cast(&dA), m_max_ * k_ * sizeof(dataType)); + cudaMalloc(reinterpret_cast(&dC), + m_max_ * out_dim_ * sizeof(dataType)); + cudaMalloc(reinterpret_cast(&d_workspace), + spmm_context_.workspace_size); + paddle::platform::dynload::cusparseLtMatmulSearch( + &spmm_context_.handle, &spmm_context_.plan, &alpha, dA, + weight_compressed_dev_global_.get(), &beta, dC, dC, d_workspace, + nullptr, 0); + paddle::platform::dynload::cusparseLtMatmulAlgGetAttribute( + &spmm_context_.handle, &spmm_context_.alg_sel, + CUSPARSELT_MATMUL_ALG_CONFIG_ID, &optim_alg_, sizeof(optim_alg_)); + cudaFree(dA); + cudaFree(dC); + cudaFree(d_workspace); + + is_configured_ = true; + } catch (const std::exception& e) { + std::cerr << e.what() << std::endl; + } +} + +size_t SpmmPluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc* inputs, int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const noexcept { + return spmm_context_.workspace_size; +} + +int SpmmPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, + void* workSpace, cudaStream_t stream) noexcept { + try { + 
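+    // enqueue dispatches on the input data type: the FP32 and FP16 paths use
+    // alpha = 1.0f, while the INT8 path folds the input, weight, and output
+    // quantization scales into alpha; every path runs cusparseLtMatmul on the
+    // shared compressed weight and returns 0 only on CUSPARSE_STATUS_SUCCESS.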
PADDLE_ENFORCE_EQ(is_configured_, true, + platform::errors::InvalidArgument( + "The plugin is not configured before enqueue")); + if (inputDesc->dims.nbDims == 5) { + PADDLE_ENFORCE_EQ( + k_, inputDesc->dims.d[2], + platform::errors::InvalidArgument("k_ == inputDesc->dims.d[2]")); + } else if (inputDesc->dims.nbDims == 4) { + PADDLE_ENFORCE_EQ( + k_, inputDesc->dims.d[1], + platform::errors::InvalidArgument("k_ == inputDesc->dims.d[1]")); + } + float alpha = 1.0f; + float beta = 0.0f; + if (inputDesc->type == nvinfer1::DataType::kFLOAT) { + const auto* const input = static_cast(inputs[0]); + auto* output = static_cast(outputs[0]); + auto* weight_compressed_dev_p_ = weight_compressed_dev_global_.get(); + cusparseStatus_t status = paddle::platform::dynload::cusparseLtMatmul( + &spmm_context_.handle, &spmm_context_.plan, &alpha, input, + weight_compressed_dev_p_, &beta, output, output, workSpace, &stream, + 1); + return status != CUSPARSE_STATUS_SUCCESS; + } else if (inputDesc->type == nvinfer1::DataType::kHALF) { + const auto* const input = static_cast(inputs[0]); + auto* output = static_cast(outputs[0]); + auto* weight_compressed_dev_p_ = weight_compressed_dev_global_.get(); + cusparseStatus_t status = paddle::platform::dynload::cusparseLtMatmul( + &spmm_context_.handle, &spmm_context_.plan, &alpha, input, + weight_compressed_dev_p_, &beta, output, output, workSpace, &stream, + 1); + return status != CUSPARSE_STATUS_SUCCESS; + } else if (inputDesc->type == nvinfer1::DataType::kINT8) { + alpha = inputDesc->scale * weight_scale_ / outputDesc->scale; + const auto* const input = static_cast(inputs[0]); + auto* output = static_cast(outputs[0]); + auto* weight_compressed_dev_p_ = weight_compressed_dev_global_.get(); + cusparseStatus_t status = paddle::platform::dynload::cusparseLtMatmul( + &spmm_context_.handle, &spmm_context_.plan, &alpha, input, + weight_compressed_dev_p_, &beta, output, output, workSpace, &stream, + 1); + return status != CUSPARSE_STATUS_SUCCESS; + } else { + PADDLE_THROW(paddle::platform::errors::Fatal( + "Unsupported type error, expected [kHALF,kFLOAT], but received %d", + static_cast(precision_))); + } + } catch (const std::exception& e) { + std::cerr << e.what() << std::endl; + } + return -1; +} + +nvinfer1::DataType SpmmPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const noexcept { + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "SpmmPluginDynamic's index should be 0")); + PADDLE_ENFORCE_EQ(nbInputs, 1, + platform::errors::InvalidArgument( + "SpmmPluginDynamic's nbInputs should be 1")); + PADDLE_ENFORCE_EQ(inputTypes[0] == nvinfer1::DataType::kFLOAT || + inputTypes[0] == nvinfer1::DataType::kHALF || + inputTypes[0] == nvinfer1::DataType::kINT8, + true, + platform::errors::InvalidArgument( + "SpmmPluginDynamic is not support this format now")); + + return inputTypes[0]; +} + +const char* SpmmPluginDynamic::getPluginType() const noexcept { + return "SpmmPluginDynamic"; +} + +const char* SpmmPluginDynamic::getPluginVersion() const noexcept { return "1"; } + +int SpmmPluginDynamic::getNbOutputs() const noexcept { return 1; } + +int SpmmPluginDynamic::initialize() noexcept { return 0; } + +void SpmmPluginDynamic::terminate() noexcept {} + +size_t SpmmPluginDynamic::getSerializationSize() const noexcept { + return compressed_size_ + (has_bias_ ? 
sizeof(float) * out_dim_ : 0) + + sizeof(precision_) + sizeof(precision_size_) + sizeof(element_size_) + + sizeof(out_dim_) + sizeof(k_) + sizeof(m_max_) + + sizeof(is_configured_) + sizeof(optim_alg_) + sizeof(weight_scale_) + + sizeof(compressed_size_) + sizeof(has_bias_) + sizeof(activation_); +} + +void SpmmPluginDynamic::serialize(void* buffer) const noexcept { + SerializeValue(&buffer, precision_); + SerializeValue(&buffer, precision_size_); + SerializeValue(&buffer, element_size_); + SerializeValue(&buffer, out_dim_); + SerializeValue(&buffer, k_); + SerializeValue(&buffer, m_max_); + SerializeValue(&buffer, is_configured_); + SerializeValue(&buffer, optim_alg_); + SerializeValue(&buffer, weight_scale_); + SerializeValue(&buffer, compressed_size_); + SerializeValue(&buffer, has_bias_); + SerializeValue(&buffer, activation_); + char* d = static_cast(buffer); + std::copy_n(static_cast(weight_compressed_), compressed_size_, + d); + if (has_bias_) { + d += compressed_size_; + std::copy_n(static_cast(bias_), out_dim_ * sizeof(float), d); + } +} + +void SpmmPluginDynamic::destroy() noexcept { + delete[] reinterpret_cast(weight_compressed_); + if (has_bias_) { + cudaFree(bias_dev_); + } + if (is_configured_) { + spmm_context_.destroy(); + } + delete this; +} + +void SpmmPluginDynamic::setPluginNamespace(const char* libNamespace) noexcept { + try { + namespace_ = libNamespace; + } catch (const std::exception& e) { + std::cerr << e.what() << std::endl; + } +} + +const char* SpmmPluginDynamic::getPluginNamespace() const noexcept { + return namespace_.c_str(); +} + +inline nvinfer1::DataType fieldTypeToDataType( + const nvinfer1::PluginFieldType ftype) { + switch (ftype) { + case nvinfer1::PluginFieldType::kFLOAT32: + return nvinfer1::DataType::kFLOAT; + case nvinfer1::PluginFieldType::kFLOAT16: + return nvinfer1::DataType::kHALF; + case nvinfer1::PluginFieldType::kINT32: + return nvinfer1::DataType::kINT32; + case nvinfer1::PluginFieldType::kINT8: + return nvinfer1::DataType::kINT8; + default: + PADDLE_THROW(paddle::platform::errors::Fatal( + "No corresponding datatype for plugin field type")); + } +} + +SpmmPluginDynamicCreator::SpmmPluginDynamicCreator() { + plugin_attr_.emplace_back(nvinfer1::PluginField( + "type_id", nullptr, nvinfer1::PluginFieldType::kINT32, 1)); + plugin_attr_.emplace_back(nvinfer1::PluginField( + "out_dim", nullptr, nvinfer1::PluginFieldType::kINT32, 1)); + plugin_attr_.emplace_back(nvinfer1::PluginField( + "weight", nullptr, nvinfer1::PluginFieldType::kFLOAT32, 1)); + plugin_attr_.emplace_back(nvinfer1::PluginField( + "bias", nullptr, nvinfer1::PluginFieldType::kFLOAT32, 1)); + plugin_attr_.emplace_back(nvinfer1::PluginField( + "activation_id", nullptr, nvinfer1::PluginFieldType::kINT8, 1)); + + field_collection_.nbFields = plugin_attr_.size(); + field_collection_.fields = plugin_attr_.data(); +} + +const char* SpmmPluginDynamicCreator::getPluginName() const noexcept { + return "SpmmPluginDynamic"; +} + +const char* SpmmPluginDynamicCreator::getPluginVersion() const noexcept { + return "1"; +} + +const nvinfer1::PluginFieldCollection* +SpmmPluginDynamicCreator::getFieldNames() noexcept { + return &field_collection_; +} + +nvinfer1::IPluginV2* SpmmPluginDynamicCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) noexcept { + try { + int type_id = -1; + int out_dim = 0; + nvinfer1::Weights weight{nvinfer1::DataType::kFLOAT, nullptr, 0ll}; + nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0ll}; + int activation_id = -1; + + 
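+    // The loop below expects the converter to supply the fields declared in
+    // SpmmPluginDynamicCreator(). A hypothetical caller (for illustration
+    // only; w_ptr, b_ptr, and the lengths are placeholders) would populate
+    // them roughly as:
+    //   std::vector<nvinfer1::PluginField> fields{
+    //       {"type_id", &type_id, nvinfer1::PluginFieldType::kINT32, 1},
+    //       {"out_dim", &out_dim, nvinfer1::PluginFieldType::kINT32, 1},
+    //       {"weight", w_ptr, nvinfer1::PluginFieldType::kFLOAT32, w_len},
+    //       {"bias", b_ptr, nvinfer1::PluginFieldType::kFLOAT32, b_len},
+    //       {"activation_id", &act_id, nvinfer1::PluginFieldType::kINT8, 1}};
+    //   nvinfer1::PluginFieldCollection fc{static_cast<int>(fields.size()),
+    //                                      fields.data()};
+    //   creator.createPlugin("spmm", &fc);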
for (int i = 0; i < fc->nbFields; i++) { + std::string field_name(fc->fields[i].name); + if (field_name.compare("type_id") == 0) { + type_id = static_cast(fc->fields[i].data)[0]; + } else if (field_name.compare("out_dim") == 0) { + out_dim = static_cast(fc->fields[i].data)[0]; + } else if (field_name.compare("weight") == 0) { + weight.type = fieldTypeToDataType(fc->fields[i].type); + weight.values = fc->fields[i].data; + weight.count = fc->fields[i].length; + } else if (field_name.compare("bias") == 0) { + bias.type = fieldTypeToDataType(fc->fields[i].type); + bias.values = fc->fields[i].data; + bias.count = fc->fields[i].length; + } else if (field_name.compare("activation_id") == 0) { + activation_id = static_cast(fc->fields[i].data)[0]; + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unsupport plugin field")); + } + } + + PADDLE_ENFORCE_NE( + type_id, -1, + platform::errors::InvalidArgument( + "SpmmPluginDynamicCreator's type_id should not be -1")); + PADDLE_ENFORCE_NE( + out_dim, 0, + platform::errors::InvalidArgument( + "SpmmPluginDynamicCreator's out_dim should not be 0")); + PADDLE_ENFORCE_NE( + weight.count, 0, + platform::errors::InvalidArgument( + "SpmmPluginDynamicCreator's weight size should not be 0")); + PADDLE_ENFORCE_NE( + activation_id, -1, + platform::errors::InvalidArgument( + "SpmmPluginDynamicCreator's activation_id should not be -1")); + nvinfer1::DataType type = static_cast(type_id); + SpmmPluginDynamic::Activation activation = + static_cast(activation_id); + return new SpmmPluginDynamic(name, type, out_dim, weight, bias, activation); + } catch (const std::exception& e) { + std::cerr << e.what() << std::endl; + } + return nullptr; +} + +nvinfer1::IPluginV2* SpmmPluginDynamicCreator::deserializePlugin( + const char* name, const void* serialData, size_t serialLength) noexcept { + // This object will be deleted when the network is destroyed, which will + // call SpmmPluginDynamic::destroy() + try { + return new SpmmPluginDynamic(name, serialData, serialLength); + } catch (const std::exception& e) { + std::cerr << e.what() << std::endl; + } + return nullptr; +} + +void SpmmPluginDynamicCreator::setPluginNamespace( + const char* libNamespace) noexcept { + try { + namespace_ = libNamespace; + } catch (const std::exception& e) { + std::cerr << e.what() << std::endl; + } +} + +const char* SpmmPluginDynamicCreator::getPluginNamespace() const noexcept { + return namespace_.c_str(); +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/spmm_plugin.h b/paddle/fluid/inference/tensorrt/plugin/spmm_plugin.h new file mode 100644 index 0000000000000..60c3773f93042 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/spmm_plugin.h @@ -0,0 +1,158 @@ +/* Copyright (c) 2022, PaddlePaddle Authors, NVIDIA CORPORATION. All rights +reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#pragma once + +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "NvInferPlugin.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/platform/dynload/cusparseLt.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class SpmmPluginDynamic : public nvinfer1::IPluginV2DynamicExt { + public: + enum class Activation { kNone, kRelu, kGelu }; + SpmmPluginDynamic(const std::string& name, const nvinfer1::DataType precision, + const int out_dim, const nvinfer1::Weights& weight, + const nvinfer1::Weights& bias, Activation activation); + // The second constructor is for clone member function + SpmmPluginDynamic(const std::string& name, const nvinfer1::DataType precision, + const int out_dim, const int k, const void* weight, + size_t compressed_size, const void* bias, + bool is_configured, const int m_max, const int optim_alg, + Activation activation); + SpmmPluginDynamic(const std::string name, const void* data, size_t length); + SpmmPluginDynamic() = delete; + nvinfer1::IPluginV2DynamicExt* clone() const noexcept override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) noexcept override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) noexcept override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) noexcept override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const noexcept override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) noexcept override; + + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const noexcept override; + const char* getPluginType() const noexcept override; + const char* getPluginVersion() const noexcept override; + int getNbOutputs() const noexcept override; + int initialize() noexcept override; + void terminate() noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void* buffer) const noexcept override; + void destroy() noexcept override; + void setPluginNamespace(const char* pluginNamespace) noexcept override; + const char* getPluginNamespace() const noexcept override; + + private: + struct cusparseLtContext { + cusparseLtHandle_t handle; + cusparseLtMatDescriptor_t matA; + cusparseLtMatDescriptor_t matB; + cusparseLtMatDescriptor_t matC; + cusparseLtMatmulDescriptor_t matmul; + cusparseLtMatmulAlgSelection_t alg_sel; + cusparseLtMatmulPlan_t plan; + cusparseLtContext(); + ~cusparseLtContext(); + size_t workspace_size{0}; + bool is_initialized{false}; + int activation{0}; + float relu_upper_bound{0}; + float relu_threshold{0}; + void init(int m, int n, int k, cudaDataType_t type, void* bias_ptr, + SpmmPluginDynamic::Activation activation); + void setAlgo(int id); + void destroy(); + void compressMatB(int n, int k, cudaDataType_t type, void* src, void** dest, + size_t* compressed_size); + }; // struct SpmmPluginDynamic::cusparseLtContext + const std::string layer_name_; + std::string namespace_; + 
nvinfer1::DataType precision_; + size_t precision_size_; + size_t + element_size_; // size of weight (float if INT8 or FLOAT; half if HALF) + int out_dim_; + int k_; + int m_max_; + bool is_configured_; // already get m, scale bias, and search the optim alg + // or not + int optim_alg_; // the index of optimal algorithm + float weight_scale_; // record the weight scale from constructor + void* weight_compressed_; // host compressed weight + void* weight_compressed_dev_; // device compressed weight + std::shared_ptr + weight_compressed_dev_global_; // shared pointer to the + // device compressed weight + size_t compressed_size_; // size of compressed weight + bool has_bias_; // there is bias or not + void* bias_; // host bias + void* bias_dev_; // device bias + Activation activation_; // record the activation type + cusparseLtContext spmm_context_; +}; // class SpmmPluginDynamic + +class SpmmPluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + SpmmPluginDynamicCreator(); + const char* getPluginName() const noexcept override; + const char* getPluginVersion() const noexcept override; + const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override; + nvinfer1::IPluginV2* createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) noexcept override; + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) noexcept override; + void setPluginNamespace(const char* pluginNamespace) noexcept override; + const char* getPluginNamespace() const noexcept override; + + private: + static nvinfer1::PluginFieldCollection field_collection_; + static std::vector plugin_attr_; + std::string namespace_; +}; // class SpmmPluginDynamicCreator + +REGISTER_TRT_PLUGIN_V2(SpmmPluginDynamicCreator); +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc new file mode 100644 index 0000000000000..4f0d7fb1e9e51 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc @@ -0,0 +1,175 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000) +#include "paddle/fluid/inference/tensorrt/plugin/spmm_plugin.h" +#endif +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/common/float16.h" + +using float16 = phi::dtype::float16; +namespace paddle { +namespace inference { +namespace tensorrt { + +class TensorRTDynamicEngineTest : public ::testing::Test { + protected: + void SetUp() override { + ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); + ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) + .get()); + ctx_->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + ctx_->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(platform::CUDAPlace(0)) + .get()); + ctx_->SetPinnedAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CUDAPinnedPlace()) + .get()); + ctx_->PartialInitWithAllocator(); + + std::map> min_input_shape = { + {"input", {16, 32, 1, 1}}}; + std::map> max_input_shape = { + {"input", {16, 32, 1, 1}}}; + std::map> optim_input_shape = { + {"input", {16, 32, 1, 1}}}; + + engine_ = + new TensorRTEngine(16, 1 << 10, AnalysisConfig::Precision::kHalf, + nullptr, 0, min_input_shape, max_input_shape, + optim_input_shape, false, NaiveLogger::Global()); + engine_->InitNetwork(); + } + + void TearDown() override { + if (engine_) { + delete engine_; + engine_ = nullptr; + } + } + + void PrepareInputOutput(const std::vector &input, + std::vector output_shape) { + paddle::framework::TensorFromVector(input, *ctx_, &input_); + output_.Resize(phi::make_ddim(output_shape)); + } + + void GetOutput(std::vector *output) { + paddle::framework::TensorToVector(output_, *ctx_, output); + } + + protected: + framework::Tensor input_; + framework::Tensor output_; + TensorRTEngine *engine_; + platform::CUDADeviceContext *ctx_; +}; + +TEST_F(TensorRTDynamicEngineTest, test_spmm) { + // Weight in CPU memory. 
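+  // The weight initialized below follows the 2:4 structured-sparsity pattern
+  // required by cuSPARSELt's CUSPARSELT_SPARSITY_50_PERCENT compression:
+  // every group of four consecutive values along the reduction dimension
+  // holds exactly two non-zeros ([1, 0, 0, 4] or [0, 2, 3, 0]).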
+#if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000) + float16 raw_weight[512]; + for (int i = 0; i < 128; i++) { + if (i % 16 <= 7) { + raw_weight[4 * i] = float16(1.0); + raw_weight[4 * i + 1] = float16(0.0); + raw_weight[4 * i + 2] = float16(0.0); + raw_weight[4 * i + 3] = float16(4.0); + } else { + raw_weight[4 * i] = float16(0.0); + raw_weight[4 * i + 1] = float16(2.0); + raw_weight[4 * i + 2] = float16(3.0); + raw_weight[4 * i + 3] = float16(0.0); + } + } + float16 raw_bias[16] = {float16(0), float16(1), float16(0), float16(2), + float16(0), float16(3), float16(0), float16(4), + float16(0), float16(5), float16(0), float16(6), + float16(0), float16(7), float16(0), float16(8)}; + std::vector buffers(2); // TRT binded inputs + TensorRTEngine::Weight weight(nvinfer1::DataType::kHALF, raw_weight, 512); + TensorRTEngine::Weight bias(nvinfer1::DataType::kHALF, raw_bias, 16); + std::cout << "with_dynamic_shape: " << engine_->with_dynamic_shape() + << std::endl; + auto *x = engine_->DeclareInput("input", nvinfer1::DataType::kHALF, + nvinfer1::Dims4{-1, 32, 1, 1}); + + plugin::SpmmPluginDynamic::Activation act = + plugin::SpmmPluginDynamic::Activation::kNone; + + plugin::SpmmPluginDynamic *plugin = new plugin::SpmmPluginDynamic( + "CustomSpmmPluginDynamic", nvinfer1::DataType::kHALF, 16, weight.get(), + bias.get(), act); + std::vector plugin_inputs; + plugin_inputs.emplace_back(x); + auto fc_layer = engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin); + + LOG(INFO) << "create weights"; + PADDLE_ENFORCE_NOT_NULL(fc_layer, platform::errors::InvalidArgument( + "TRT SPMM layer building failed.")); + + engine_->DeclareOutput(fc_layer, 0, "y"); + engine_->FreezeNetwork(); + ASSERT_EQ(engine_->engine()->getNbBindings(), 2); + + std::vector x_v(512); + for (int i = 0; i < 128; i++) { + x_v[4 * i] = float16(1.0); + x_v[4 * i + 1] = float16(2.0); + x_v[4 * i + 2] = float16(3.0); + x_v[4 * i + 3] = float16(4.0); + } + + std::vector y_cpu; + PrepareInputOutput(x_v, {16, 16}); + + auto *x_v_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_v_gpu_data); + buffers[1] = reinterpret_cast(y_gpu_data); + + engine_->Execute(16, &buffers, ctx_->stream()); + LOG(INFO) << "to get output"; + GetOutput(&y_cpu); + + auto dims = engine_->GetITensor("y")->getDimensions(); + ASSERT_EQ(dims.nbDims, 4); + ASSERT_EQ(dims.d[1], 16); + ASSERT_EQ(y_cpu[0], 136); + + ASSERT_EQ(y_cpu[1], 105); + ASSERT_EQ(y_cpu[32], 136); + ASSERT_EQ(y_cpu[64], 136); + ASSERT_EQ(y_cpu[96], 136); +#endif + return; +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc index c9298692334c0..87b443278a6d8 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc @@ -174,67 +174,67 @@ TEST(tensorrt_tester_LeViT, multi_thread4_trt_fp32_bz2) { } #ifdef PADDLE_WITH_GPU -TEST(tensorrt_tester_LeViT, multi_stream_thread4_trt_fp32_bz2) { - int thread_num = 4; - - // init stream - std::vector streams(thread_num); - for (size_t i = 0; i < thread_num; ++i) { - cudaStreamCreate(&streams[i]); - } - - // init input data - std::map my_input_data_map; - my_input_data_map["x"] = PrepareInput(2); - // init output data - std::map infer_output_data, - truth_output_data; - // prepare groudtruth config - paddle_infer::Config config, 
config_no_ir; - config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel", - FLAGS_modeldir + "/inference.pdiparams"); - config_no_ir.SwitchIrOptim(false); - // prepare inference config - config.SetModel(FLAGS_modeldir + "/inference.pdmodel", - FLAGS_modeldir + "/inference.pdiparams"); - config.EnableUseGpu(100, 0); - config.EnableTensorRtEngine( - 1 << 20, 2, 50, paddle_infer::PrecisionType::kFloat32, false, false); - // get groudtruth by disbale ir - - paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); - SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, - &truth_output_data, 1); - - // get infer results from multi threads - std::vector threads; - config.SetExecStream(streams[0]); - config.pass_builder()->DeletePass("add_support_int8_pass"); - auto main_predictor = CreatePredictor(config); - std::vector predictors; - for (size_t i = 0; i < thread_num - 1; ++i) { - predictors.push_back(std::move(main_predictor->Clone(streams[i + 1]))); - LOG(INFO) << "predictors[" << i << "] stream is " - << predictors[i]->GetExecStream(); - } - predictors.push_back(std::move(main_predictor)); - LOG(INFO) << "predictors[" << thread_num - 1 << "] stream is " - << predictors[thread_num - 1]->GetExecStream(); - for (int i = 0; i < thread_num; ++i) { - threads.emplace_back(paddle::test::SingleThreadPrediction, - predictors[i].get(), &my_input_data_map, - &infer_output_data, 10); - } - - // thread join & check outputs - for (int i = 0; i < thread_num; ++i) { - LOG(INFO) << "join tid : " << i; - threads[i].join(); - // CompareRecord(&truth_output_data, &infer_output_data); - } - - std::cout << "finish multi-thread test" << std::endl; -} +// TEST(tensorrt_tester_LeViT, multi_stream_thread4_trt_fp32_bz2) { +// int thread_num = 4; + +// // init stream +// std::vector streams(thread_num); +// for (size_t i = 0; i < thread_num; ++i) { +// cudaStreamCreate(&streams[i]); +// } + +// // init input data +// std::map my_input_data_map; +// my_input_data_map["x"] = PrepareInput(2); +// // init output data +// std::map infer_output_data, +// truth_output_data; +// // prepare groudtruth config +// paddle_infer::Config config, config_no_ir; +// config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel", +// FLAGS_modeldir + "/inference.pdiparams"); +// config_no_ir.SwitchIrOptim(false); +// // prepare inference config +// config.SetModel(FLAGS_modeldir + "/inference.pdmodel", +// FLAGS_modeldir + "/inference.pdiparams"); +// config.EnableUseGpu(100, 0); +// config.EnableTensorRtEngine( +// 1 << 20, 2, 50, paddle_infer::PrecisionType::kFloat32, false, false); +// // get groudtruth by disbale ir + +// paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); +// SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, +// &truth_output_data, 1); + +// // get infer results from multi threads +// std::vector threads; +// config.SetExecStream(streams[0]); +// config.pass_builder()->DeletePass("add_support_int8_pass"); +// auto main_predictor = CreatePredictor(config); +// std::vector predictors; +// for (size_t i = 0; i < thread_num - 1; ++i) { +// predictors.push_back(std::move(main_predictor->Clone(streams[i + 1]))); +// LOG(INFO) << "predictors[" << i << "] stream is " +// << predictors[i]->GetExecStream(); +// } +// predictors.push_back(std::move(main_predictor)); +// LOG(INFO) << "predictors[" << thread_num - 1 << "] stream is " +// << predictors[thread_num - 1]->GetExecStream(); +// for (int i = 0; i < thread_num; ++i) { +// 
threads.emplace_back(paddle::test::SingleThreadPrediction, +// predictors[i].get(), &my_input_data_map, +// &infer_output_data, 10); +// } + +// // thread join & check outputs +// for (int i = 0; i < thread_num; ++i) { +// LOG(INFO) << "join tid : " << i; +// threads[i].join(); +// CompareRecord(&truth_output_data, &infer_output_data); +// } + +// std::cout << "finish multi-thread test" << std::endl; +// } #endif } // namespace paddle_infer diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index d72af70657a29..13536be5b40fe 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -419,21 +419,12 @@ class AllocatorFacadePrivate { const std::shared_ptr& allocator = GetDefaultStreamSafeCUDAAllocator(place); - // NOTE(Ruibiao): The default stream will be set when the CUDADeviceContext - // created. Normally, the DeviceContextPool is a global singleton and one - // Place only correspond to one DeviceContext. However, to support - // multi-stream scheduling, standalone executor creates two extra - // DeviceContextPools for H2D and D2H stream in StreamAnalyzer, which make - // one Place correspond to multiple DeviceContext and unexpectedly reset the - // default stream in runtime. To avoid this behavior, we do not allow - // changing default stream after initially setting. - if (allocator->GetDefaultStream() != nullptr) { - VLOG(5) << "The default stream for StreamSafeCUDAAllocator(" - << allocator.get() << ") in " << place << " has been set to " - << allocator->GetDefaultStream() - << " before, not allow to change now."; - return; - } + PADDLE_ENFORCE_EQ( + allocator->GetDefaultStream(), nullptr, + platform::errors::Unavailable( + "The default stream for StreamSafeCUDAAllocator(%p) in %s has been " + "set to %p, not allow to change it to %p.", + allocator.get(), place, allocator->GetDefaultStream(), stream)); allocator->SetDefaultStream(stream); VLOG(8) << "Set default stream to " << stream diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index b2fd59b47454e..eb0664eb17d35 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -108,7 +108,7 @@ register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combin op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS}) target_link_libraries(run_program_op cuda_graph_with_memory_pool) -op_library(quantize_linear_op DEPS cast_kernel) +op_library(quantize_linear_op DEPS phi) op_library(save_combine_op DEPS string_array) op_library(load_combine_op DEPS string_array) diff --git a/paddle/fluid/operators/activation_op_mlu.cc b/paddle/fluid/operators/activation_op_mlu.cc index f66b75fd1f319..90d0a72074b81 100644 --- a/paddle/fluid/operators/activation_op_mlu.cc +++ b/paddle/fluid/operators/activation_op_mlu.cc @@ -108,6 +108,43 @@ class ActivationGradMLUKernelV3 : public framework::OpKernel { } }; +// For sqrt +template +class SqrtMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + MLUCnnlTensorDesc input_desc(*x); + MLUCnnlTensorDesc output_desc(*out); + + cnnlComputationPreference_t prefer = CNNL_COMPUTATION_FAST; + MLUCnnl::Sqrt(ctx, prefer, input_desc.get(), GetBasePtr(x), + 
output_desc.get(), GetBasePtr(out)); + } +}; + +template +class SqrtGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + MLUCnnlTensorDesc data_desc(*out); + MLUCnnl::SqrtGrad(ctx, data_desc.get(), GetBasePtr(out), GetBasePtr(dout), + GetBasePtr(dx)); + } +}; + } // namespace operators } // namespace paddle @@ -170,3 +207,9 @@ REGISTER_OP_MLU_KERNEL( ops::ActivationGradMLUKernelV1, ops::ActivationGradMLUKernelV1); + +// sqrt +REGISTER_OP_MLU_KERNEL(sqrt, ops::SqrtMLUKernel, + ops::SqrtMLUKernel); +REGISTER_OP_MLU_KERNEL(sqrt_grad, ops::SqrtGradMLUKernel, + ops::SqrtGradMLUKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc index 1d3e5e5162ca9..8bf1398f607c8 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc @@ -15,9 +15,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/float16.h" + namespace paddle { namespace operators { + template class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel { using MPDType = typename details::MPTypeTrait::Type; @@ -38,6 +41,8 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel { // cpy to cpu bool cpu_found_inf_data = false; + // number of inf and nans + int nums_inf_nans = 0; MPDType cpu_scale_data; if (platform::is_xpu_place(scale->place())) { memory::Copy(platform::CPUPlace(), static_cast(&cpu_scale_data), @@ -52,48 +57,21 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel { const auto* x = xs[i]; auto* out = outs[i]; out->mutable_data(dev_ctx.GetPlace()); - framework::Tensor is_finite = - ctx.AllocateTmpTensor(x->dims(), - dev_ctx); - framework::Tensor is_nan = - ctx.AllocateTmpTensor(x->dims(), - dev_ctx); - framework::Tensor is_finite_and_nan = - ctx.AllocateTmpTensor(x->dims(), - dev_ctx); - if (cpu_found_inf_data == false) { - int r = xpu::isfinite(dev_ctx.x_context(), - reinterpret_cast(x->data()), - is_finite.data(), x->numel()); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU API(isfinite) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); - r = xpu::logical_not( - dev_ctx.x_context(), - reinterpret_cast(is_finite.data()), - is_finite.data(), x->numel()); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU API(logical_not) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); - r = xpu::any(dev_ctx.x_context(), is_finite.data(), - found_inf_data, x->numel()); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU API(any) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); - if (dev_ctx.x_context()->xpu_stream) { - dev_ctx.Wait(); - } - memory::Copy(platform::CPUPlace(), &cpu_found_inf_data, - dev_ctx.GetPlace(), found_inf_data, sizeof(bool)); + framework::Tensor inf_nan_count = + ctx.AllocateTmpTensor( + found_inf->dims(), dev_ctx); + + if (nums_inf_nans == 0) { + int r = xpu::count_nan_or_inf( + dev_ctx.x_context(), 
reinterpret_cast(x->data()), + inf_nan_count.data(), x->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "count_nan_or_inf"); + memory::Copy(platform::CPUPlace(), &nums_inf_nans, dev_ctx.GetPlace(), + inf_nan_count.data(), sizeof(int)); } - if (cpu_found_inf_data) { + if (nums_inf_nans > 0) { + cpu_found_inf_data = true; inverse_scale = 0.0; } @@ -109,45 +87,25 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel { int r = xpu::cast_v2(dev_ctx.x_context(), reinterpret_cast(x->data()), float_x.data(), x->numel()); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU API(cast_v2) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); r = xpu::scale(dev_ctx.x_context(), float_x.data(), float_out.data(), x->numel(), false, inverse_scale, 0.0); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU API(scale) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale"); r = xpu::cast_v2(dev_ctx.x_context(), float_out.data(), reinterpret_cast(out->data()), out->numel()); - - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU API(cast_v2) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); } else { int r = xpu::scale(dev_ctx.x_context(), reinterpret_cast(x->data()), reinterpret_cast(out->data()), x->numel(), false, inverse_scale, 0.0); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU API(scale) return wrong " - "value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale"); } } - if (dev_ctx.x_context()->xpu_stream) { - dev_ctx.Wait(); - } memory::Copy(dev_ctx.GetPlace(), found_inf_data, platform::CPUPlace(), &cpu_found_inf_data, sizeof(bool)); } diff --git a/paddle/fluid/operators/clip_op_mlu.cc b/paddle/fluid/operators/clip_op_mlu.cc new file mode 100644 index 0000000000000..88e8fe778dadc --- /dev/null +++ b/paddle/fluid/operators/clip_op_mlu.cc @@ -0,0 +1,115 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class ClipMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + auto min = static_cast(ctx.Attr("min")); + auto max = static_cast(ctx.Attr("max")); + + if (ctx.HasInput("Min")) { + Tensor min_cpu; + auto* min_tensor = ctx.Input("Min"); + auto* min_data = min_tensor->data(); + if (platform::is_mlu_place(min_tensor->place())) { + paddle::framework::TensorCopySync(*min_tensor, platform::CPUPlace(), + &min_cpu); + min_data = min_cpu.data(); + } + min = min_data[0]; + } + + if (ctx.HasInput("Max")) { + Tensor max_cpu; + auto* max_tensor = ctx.Input("Max"); + auto* max_data = max_tensor->data(); + if (platform::is_mlu_place(max_tensor->place())) { + paddle::framework::TensorCopySync(*max_tensor, platform::CPUPlace(), + &max_cpu); + max_data = max_cpu.data(); + } + max = max_data[0]; + } + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnl::Clip(ctx, x_desc.get(), GetBasePtr(x), + static_cast(&min), + static_cast(&max), GetBasePtr(out)); + } +}; + +template +class ClipGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + + auto* min_tensor = ctx.HasInput("Min") ? ctx.Input("Min") : nullptr; + auto* max_tensor = ctx.HasInput("Max") ? ctx.Input("Max") : nullptr; + + auto min_val = ctx.Attr("min"); + if (min_tensor) { + Tensor min_data; + framework::TensorCopy( + *min_tensor, platform::CPUPlace(), + ctx.template device_context(), &min_data); + ctx.template device_context().Wait(); + min_val = static_cast(min_data.data()[0]); + } + auto max_val = ctx.Attr("max"); + if (max_tensor) { + Tensor max_data; + framework::TensorCopy( + *max_tensor, platform::CPUPlace(), + ctx.template device_context(), &max_data); + ctx.template device_context().Wait(); + max_val = static_cast(max_data.data()[0]); + } + + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc dx_desc(*dx); + MLUCnnlTensorDesc dout_desc(*dout); + + MLUCnnl::HardtanhBackward(ctx, x_desc.get(), GetBasePtr(x), dout_desc.get(), + GetBasePtr(dout), max_val, min_val, dx_desc.get(), + GetBasePtr(dx)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(clip, ops::ClipMLUKernel, + ops::ClipMLUKernel); + +REGISTER_OP_MLU_KERNEL(clip_grad, ops::ClipGradMLUKernel, + ops::ClipGradMLUKernel); diff --git a/paddle/fluid/operators/conv_transpose_op_mlu.cc b/paddle/fluid/operators/conv_transpose_op_mlu.cc new file mode 100644 index 0000000000000..160c16c3de995 --- /dev/null +++ b/paddle/fluid/operators/conv_transpose_op_mlu.cc @@ -0,0 +1,266 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/conv_transpose_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/phi/kernels/cpu/conv_util.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; + +template +class Conv2DTransposeMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* input = ctx.Input("Input"); + const Tensor* filter = ctx.Input("Filter"); + Tensor* output = ctx.Output("Output"); + output->mutable_data(ctx.GetPlace()); + std::vector output_padding = + ctx.Attr>("output_padding"); + const std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + const std::string data_format = ctx.Attr("data_format"); + int groups = ctx.Attr("groups"); + const std::string padding_algorithm = + ctx.Attr("padding_algorithm"); + + // check dimension + const bool channel_last = data_format == "NHWC"; + + auto in_dims = input->dims(); + auto filter_dims = filter->dims(); + auto in_dims_size = in_dims.size(); + framework::DDim in_data_dims; + framework::DDim filter_data_dims; + + if (channel_last) { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } else { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } + filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); + + std::vector ksize = phi::vectorize(filter_data_dims); + phi::UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + Tensor input_tensor(input->type()); + Tensor output_tensor(output->type()); + input_tensor.set_layout(DataLayout::kNHWC); + output_tensor.set_layout(DataLayout::kNHWC); + const std::vector perm_to_nhwc = {0, 2, 3, 1}; + + if (channel_last) { + input_tensor.ShareDataWith(*input); + output_tensor.ShareDataWith(*output); + } else { + // transpose input from NCHW to NHWC + TransposeFromMLUTensor(ctx, perm_to_nhwc, input, &input_tensor, + true /*need_reshape_or_alloc*/); + auto output_dims = output->dims(); + output_tensor.mutable_data( + {output_dims[0], output_dims[2], output_dims[3], output_dims[1]}, + ctx.GetPlace()); + } + + // transpose filter from MCHW to MHWC + Tensor trans_filter(filter->type()); + TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, &trans_filter, + true /*need_reshape_or_alloc*/); + + // construct MLU attr + cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC; + MLUCnnlTensorDesc input_desc(input_tensor, data_layout, + ToCnnlDataType(input_tensor.dtype())); + MLUCnnlTensorDesc filter_desc(trans_filter, data_layout, + ToCnnlDataType(trans_filter.type())); + MLUCnnlTensorDesc output_desc(output_tensor, data_layout, + ToCnnlDataType(output_tensor.dtype())); + MLUCnnlConvolutionDesc conv_desc(in_dims_size, paddings.data(), + strides.data(), dilations.data(), groups, + ToCnnlDataType()); + + MLUCnnl::ConvBackpropInput(ctx, conv_desc.get(), filter_desc.get(), + GetBasePtr(&trans_filter), input_desc.get(), + 
GetBasePtr(&input_tensor), output_desc.get(), + GetBasePtr(&output_tensor)); + + if (!channel_last) { + // transpose output from NHWC to NCHW + const std::vector perm_to_nchw = {0, 3, 1, 2}; + TransposeFromMLUTensor(ctx, perm_to_nchw, &output_tensor, output, + false /*need_reshape_or_alloc*/); + } + } +}; + +template +class Conv2DTransposeGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* input = ctx.Input("Input"); + const Tensor* filter = ctx.Input("Filter"); + const Tensor* output_grad = + ctx.Input(framework::GradVarName("Output")); + Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); + Tensor* filter_grad = ctx.Output(framework::GradVarName("Filter")); + + if ((!input_grad) && (!filter_grad)) return; + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + const int groups = ctx.Attr("groups"); + std::string padding_algorithm = ctx.Attr("padding_algorithm"); + const std::string data_format = ctx.Attr("data_format"); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_format); + + auto in_dims = input->dims(); + auto filter_dims = filter->dims(); + auto in_dims_size = in_dims.size(); + + const bool channel_last = (data_layout == framework::DataLayout::kNHWC); + + framework::DDim in_data_dims; + if (channel_last) { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } else { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } + framework::DDim filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = phi::vectorize(filter_data_dims); + phi::UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + Tensor input_tensor(input->type()); + Tensor output_grad_tensor(output_grad->type()); + output_grad_tensor.set_layout(DataLayout::kNHWC); + + const std::vector perm_to_nhwc = {0, 2, 3, 1}; + if (channel_last) { + input_tensor.ShareDataWith(*input); + output_grad_tensor.ShareDataWith(*output_grad); + } else { + // transpose input from NCHW to NHWC + TransposeFromMLUTensor(ctx, perm_to_nhwc, input, &input_tensor, + true /*need_reshape_or_alloc*/); + TransposeFromMLUTensor(ctx, perm_to_nhwc, output_grad, + &output_grad_tensor, + true /*need_reshape_or_alloc*/); + } + + // transpose filter from MCHW to MHWC + Tensor trans_filter(filter->type()); + TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, &trans_filter, + true /*need_reshape_or_alloc*/); + + // MLU descs + cnnlTensorLayout_t data_layout_mlu = CNNL_LAYOUT_NHWC; + MLUCnnlTensorDesc input_desc(input_tensor, data_layout_mlu, + ToCnnlDataType(input_tensor.dtype())); + MLUCnnlTensorDesc trans_filter_desc(trans_filter, data_layout_mlu, + ToCnnlDataType(trans_filter.type())); + MLUCnnlTensorDesc output_grad_desc( + output_grad_tensor, data_layout_mlu, + ToCnnlDataType(output_grad_tensor.dtype())); + MLUCnnlConvolutionDesc conv_desc(in_dims_size, paddings.data(), + strides.data(), dilations.data(), groups, + ToCnnlDataType()); + + if (filter_grad) { + filter_grad->mutable_data(ctx.GetPlace()); + Tensor filter_grad_tensor(filter_grad->type()); + // filter_grad always MCHW + // filter_grad_tensor always MHWC + auto filter_grad_dims = filter_grad->dims(); + filter_grad_tensor.mutable_data( + {filter_grad_dims[0], filter_grad_dims[2], filter_grad_dims[3], + filter_grad_dims[1]}, + ctx.GetPlace()); + //} + 
filter_grad_tensor.set_layout(DataLayout::kNHWC); + + MLUCnnlTensorDesc filter_grad_desc( + filter_grad_tensor, data_layout_mlu, + ToCnnlDataType(filter_grad_tensor.dtype())); + + MLUCnnl::ConvBackpropFilter( + ctx, conv_desc.get(), output_grad_desc.get(), GetBasePtr(output_grad), + input_desc.get(), GetBasePtr(&input_tensor), filter_grad_desc.get(), + GetBasePtr(&filter_grad_tensor)); + // transpose output from MHWC to MCHW + const std::vector perm_to_mchw = {0, 3, 1, 2}; + TransposeFromMLUTensor(ctx, perm_to_mchw, &filter_grad_tensor, + filter_grad, false /*need_reshape_or_alloc*/); + } + + if (input_grad) { + input_grad->mutable_data(ctx.GetPlace()); + Tensor input_grad_tensor(input_grad->type()); + input_tensor.set_layout(DataLayout::kNHWC); + + if (channel_last) { + input_grad_tensor.ShareDataWith(*input_grad); + } else { + auto input_grad_dims = input_grad->dims(); + input_grad_tensor.mutable_data( + {input_grad_dims[0], input_grad_dims[2], input_grad_dims[3], + input_grad_dims[1]}, + ctx.GetPlace()); + } + + MLUCnnlTensorDesc input_grad_desc( + input_grad_tensor, data_layout_mlu, + ToCnnlDataType(input_grad_tensor.dtype())); + + cnnlDataType_t tensor_dtype = ToCnnlDataType(); + cnnlDataType_t dt_onchip = ToCnnlDataType(); + MLUCnnl::Conv2D(ctx, conv_desc.get(), tensor_dtype, dt_onchip, + nullptr /* input_position */, nullptr /* input_scale */, + nullptr /* input_offset */, nullptr /* filter_position */, + nullptr /* filter_scale */, nullptr /* filter_offset */, + output_grad_desc.get(), GetBasePtr(&output_grad_tensor), + trans_filter_desc.get(), GetBasePtr(&trans_filter), + nullptr /* bias_desc*/, nullptr /* bias */, + input_grad_desc.get(), GetBasePtr(&input_grad_tensor)); + if (!channel_last) { + // transpose output from NHWC to NCHW + const std::vector perm_to_nchw = {0, 3, 1, 2}; + TransposeFromMLUTensor(ctx, perm_to_nchw, &input_grad_tensor, + input_grad, false /*need_reshape_or_alloc*/); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(conv2d_transpose, ops::Conv2DTransposeMLUKernel, + ops::Conv2DTransposeMLUKernel); + +REGISTER_OP_MLU_KERNEL(conv2d_transpose_grad, + ops::Conv2DTransposeGradMLUKernel, + ops::Conv2DTransposeGradMLUKernel); diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cum_op.cc similarity index 51% rename from paddle/fluid/operators/cumsum_op.cc rename to paddle/fluid/operators/cum_op.cc index dbb703e7e874d..be001c43086cf 100644 --- a/paddle/fluid/operators/cumsum_op.cc +++ b/paddle/fluid/operators/cum_op.cc @@ -49,7 +49,7 @@ class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( The cumulative sum of the elements along a given axis. By default, the first element of the result is the same of the first element of -the input. If exlusive is true, the first element of the result is 0. +the input. If exclusive is true, the first element of the result is 0. )DOC"); } }; @@ -74,17 +74,87 @@ class CumsumGradMaker : public framework::SingleGradOpMaker { } }; +class LogcumsumexpOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of logcumsumexp operator"); + AddOutput("Out", "Output of logcumsumexp operator"); + AddAttr("axis", + "The dimension to accumulate along. -1 means the last " + "dimension [default -1].") + .SetDefault(-1); + AddAttr("flatten", + "Whether to compute the logcumsumexp over the flattened array. 
" + "[default false].") + .SetDefault(false); + AddAttr("exclusive", + "Whether to perform exclusive logcumsumexp. [default false].") + .SetDefault(false); + AddAttr("reverse", + "If true, the logcumsumexp is performed in the reversed direction. " + "[default false].") + .SetDefault(false); + AddComment(R"DOC( +Returns the logarithm of the cumulative summation of the exponentiation of elements of input along the given axis. +By default, the first element of the result is the same of the first element of +the input. If exclusive is true, the first element of the result is the lowest finite value of the dtype of output tensor. +)DOC"); + } +}; + +class LogcumsumexpGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "logcumsumexp"); + OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "logcumsumexp"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "logcumsumexp"); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; + +template +class LogcumsumexpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("logcumsumexp_grad"); + grad_op->SetInput("X", this->Input("X")); + grad_op->SetInput("Out", this->Output("Out")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + grad_op->SetAttr("axis", BOOST_GET_CONST(int, this->GetAttr("axis"))); + grad_op->SetAttr("flatten", + BOOST_GET_CONST(bool, this->GetAttr("flatten"))); + grad_op->SetAttr("exclusive", + BOOST_GET_CONST(bool, this->GetAttr("exclusive"))); + grad_op->SetAttr("reverse", + BOOST_GET_CONST(bool, this->GetAttr("reverse"))); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; DECLARE_INFER_SHAPE_FUNCTOR(cumsum, CumsumInferShapeFunctor, - PD_INFER_META(phi::CumsumInferMeta)); + PD_INFER_META(phi::CumInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(logcumsumexp, LogcumsumexpInferShapeFunctor, + PD_INFER_META(phi::CumInferMeta)); REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker, ops::CumsumGradMaker, CumsumInferShapeFunctor); +REGISTER_OPERATOR(logcumsumexp, ops::CumOp, ops::LogcumsumexpOpMaker, + ops::LogcumsumexpGradMaker, + ops::LogcumsumexpGradMaker, + LogcumsumexpInferShapeFunctor); +REGISTER_OPERATOR(logcumsumexp_grad, ops::LogcumsumexpGradOp); REGISTER_OP_VERSION(cumsum).AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op_xpu.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op_xpu.cc index 28c94668ba7c5..48a592f2b54e0 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op_xpu.cc +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op_xpu.cc @@ -16,11 +16,13 @@ limitations under the License. 
*/ #include #include + #include #include -#include "paddle/fluid/framework/mixed_vector.h" + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -91,37 +93,25 @@ static std::pair ProposalForOneImage( index_sort.data(), scores_sel.data(), {static_cast(scores.numel()), 1}, index_sort.numel(), 0); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External("XPU API(gather) return " - "wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather"); r = xpu::gather(dev_ctx.x_context(), bbox_deltas.data(), index_sort.data(), bbox_sel.data(), {static_cast(bbox_deltas.numel()) / 4, 4}, index_sort.numel(), 0); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External("XPU API(gather) return " - "wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather"); r = xpu::gather(dev_ctx.x_context(), anchors.data(), index_sort.data(), anchor_sel.data(), {static_cast(anchors.numel()) / 4, 4}, index_sort.numel(), 0); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External("XPU API(gather) return " - "wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather"); r = xpu::gather(dev_ctx.x_context(), variances.data(), index_sort.data(), var_sel.data(), {static_cast(variances.numel()) / 4, 4}, index_sort.numel(), 0); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External("XPU API(gather) return " - "wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather"); int num = scores.numel(); int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel() @@ -137,10 +127,7 @@ static std::pair ProposalForOneImage( var_sel.data(), bbox_sel.data(), proposals.data(), pre_nms_num, !pixel_offset, true, im_shape.data()); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External("XPU API(box_decoder) return " - "wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "box_decoder"); // 3. 
filter Tensor keep_index, keep_num_t; @@ -151,10 +138,7 @@ static std::pair ProposalForOneImage( im_shape.data(), keep_index.data(), keep_num_t.data(), pre_nms_num, min_size, false, pixel_offset); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU API(remove_small_boxes) return " - "wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "remove_small_boxes"); int keep_num; const auto xpu_place = dev_ctx.GetPlace(); memory::Copy(platform::CPUPlace(), &keep_num, xpu_place, @@ -176,18 +160,12 @@ static std::pair ProposalForOneImage( r = xpu::gather(dev_ctx.x_context(), proposals.data(), keep_index.data(), proposals_filter.data(), {pre_nms_num, 4}, keep_num, 0); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External("XPU API(gather) return " - "wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather"); r = xpu::gather(dev_ctx.x_context(), scores_sel.data(), keep_index.data(), scores_filter.data(), {pre_nms_num, 1}, keep_num, 0); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External("XPU API(gather) return " - "wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather"); if (nms_thresh <= 0) { if (dev_ctx.x_context()->xpu_stream) { @@ -201,10 +179,7 @@ static std::pair ProposalForOneImage( r = xpu::nms(dev_ctx.x_context(), proposals_filter.data(), nullptr, keep_index.data(), 1, 1, keep_num, -1, nms_thresh, -1, 0, &nms_keep_num, pixel_offset); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External("XPU API(nms) return the" - "wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "nms"); if (post_nms_top_n > 0 && post_nms_top_n < nms_keep_num) { keep_index.Resize({post_nms_top_n}); } else { @@ -217,17 +192,11 @@ static std::pair ProposalForOneImage( r = xpu::gather(dev_ctx.x_context(), proposals_filter.data(), keep_index.data(), proposals_nms.data(), {keep_num, 4}, keep_index.numel(), 0); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External("XPU API(gather) return " - "wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather"); r = xpu::gather(dev_ctx.x_context(), scores_filter.data(), keep_index.data(), scores_nms.data(), {keep_num, 1}, keep_index.numel(), 0); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External("XPU API(gather) return " - "wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather"); if (dev_ctx.x_context()->xpu_stream) { dev_ctx.Wait(); } @@ -286,17 +255,11 @@ class XPUGenerateProposalsV2Kernel : public framework::OpKernel { int r = xpu::transpose(dev_ctx.x_context(), bbox_deltas->data(), bbox_deltas_swap.data(), {num, c_bbox, h_bbox, w_bbox}, axis); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External("XPU API(transpose) return " - "wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); r = xpu::transpose(dev_ctx.x_context(), scores->data(), scores_swap.data(), {num, c_score, h_score, w_score}, axis); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External("XPU API(transpose) return " - "wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); anchors.Resize({anchors.numel() / 4, 4}); variances.Resize({variances.numel() / 4, 4}); diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 6db3efa3cdd60..5eede02567b43 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ 
b/paddle/fluid/operators/dropout_impl.cu.h @@ -34,10 +34,9 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/dropout_impl_util.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/functors.h" @@ -142,15 +141,154 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, } } +template +struct MaskFunctor { + const float retain_prob_; + using MT = typename details::MPTypeTrait::Type; + MT factor; + HOSTDEVICE inline MaskFunctor(const float retain_prob) + : retain_prob_(retain_prob) { + factor = static_cast(1.0f / retain_prob_); + } + + HOSTDEVICE inline void operator()(OutT* dst, const T2* rand, int num) const { + static constexpr int kCount = + phi::funcs::uniform_distribution::kReturnsCount; +// 0 ~ kCount -1 is dist , kCount ~ 2 * kCount - 1 is mask +#pragma unroll + for (int i = 0; i < kCount; i++) { + if (rand[i] < retain_prob_) { + dst[i] = static_cast(1); + } else { + dst[i] = static_cast(0); + } + } + } +}; + +template +struct DstFunctor { + using MT = typename details::MPTypeTrait::Type; + MT factor; + HOSTDEVICE inline DstFunctor(const float retain_prob, + const bool is_upscale_in_train, + const int64_t num) + : retain_prob_(retain_prob), + is_upscale_in_train_(is_upscale_in_train), + num_(num) { + factor = static_cast(1.0f / retain_prob_); + } + + HOSTDEVICE inline T operator()(const T src_val, const MaskType mask) const { + for (int i = 0; i < num_; i++) { + if (mask == static_cast(1)) { + return is_upscale_in_train_ + ? 
static_cast(static_cast(src_val) * factor) + : static_cast(src_val); + } else { + return static_cast(0); + } + } + } + + private: + const float retain_prob_; + const bool is_upscale_in_train_; + const int64_t num_; +}; + +template +__global__ void VectorizedGeneratorMask(const size_t n, uint64_t seed, + const float dropout_prob, const T* src, + MaskType* mask, uint64_t increment, + size_t main_offset) { + constexpr int kCount = phi::funcs::uniform_distribution::kReturnsCount; + size_t idx = static_cast(BLOCK_ID_X * BLOCK_NUM_X); + size_t stride = BLOCK_NUM_X * GRID_NUM_X * kCount; +#ifdef PADDLE_WITH_HIP + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, idx + THREAD_ID_X, increment, &state); + using SType = hiprandStatePhilox4_32_10_t; +#else + curandStatePhilox4_32_10_t state; + curand_init(seed, idx + THREAD_ID_X, increment, &state); + using SType = curandStatePhilox4_32_10_t; +#endif + T dst_mask[kCount]; // 0 ~ kCount -1 : dst;kCount ~ 2 * kCount - 1: mask + float rands[kCount]; + MaskType mask_result[kCount]; + using Rand = phi::funcs::uniform_distribution; + using Cast = kps::IdentityFunctor; + int deal_size = BLOCK_NUM_X * kCount; + + size_t fix = idx * kCount; + + auto mask_functor = MaskFunctor(1.0f - dropout_prob); + for (; fix < main_offset; fix += stride) { + kps::ReadData(&dst_mask[0], src + fix, deal_size); + kps::ElementwiseRandom(&rands[0], Rand(), + &state); + // dst + kps::OperatorBinary>( + &dst_mask[0], &rands[0], mask_functor, kCount); + + // mask + kps::ElementwiseUnary( + &mask_result[0], &dst_mask[0], Cast()); + kps::WriteData(mask + fix, &mask_result[0], + deal_size); + if (fix > idx * kCount + 1) { + __syncthreads(); + } + } + int remainder = n - fix; + if (remainder > 0) { + kps::ReadData(&dst_mask[0], src + fix, remainder); + kps::ElementwiseRandom(&rands[0], Rand(), + &state); + // dst + kps::OperatorBinary>( + &dst_mask[0], &rands[0], mask_functor, kCount); + // mask + kps::ElementwiseUnary( + &mask_result[0], &dst_mask[0], Cast()); + kps::WriteData(mask + fix, &mask_result[0], + remainder); + __syncthreads(); + } +} + +inline void CalcBroadcastedMask(const phi::GPUContext& dev_ctx, + const framework::Tensor& mask, + framework::Tensor* broadcasted_mask) { + // The broadcast of mask can be combined to the following ElementwiseKernel + // when the BroadcastKernel supports different input types. 
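A note on the dropout_nd data flow here: VectorizedGeneratorMask samples a mask whose shape, by assumption consistent with the broadcast performed by CalcBroadcastedMask, keeps x's extent only on the sampled axes and is 1 elsewhere; the broadcast then expands it to x's full shape so DstFunctor can do a plain element-wise combine. A small sketch of that shape convention; DropoutNdMaskDims is an illustrative helper, not a framework API.

#include <cstdint>
#include <vector>

// E.g. x dims {N, C, H, W} with axis = {0, 1} gives a mask of {N, C, 1, 1},
// i.e. whole (H, W) feature maps are kept or dropped together and the mask
// broadcasts against x.
std::vector<int64_t> DropoutNdMaskDims(const std::vector<int64_t>& x_dims,
                                       const std::vector<int>& axis) {
  std::vector<int64_t> mask_dims(x_dims.size(), 1);
  for (int a : axis) {
    mask_dims[a] = x_dims[a];
  }
  return mask_dims;
}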
+ broadcasted_mask->mutable_data(dev_ctx.GetPlace()); + + std::vector ins = {&mask}; + std::vector outs = {broadcasted_mask}; + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, -1, kps::IdentityFunctor()); +} + +template +void ScaleByDropoutFactor(const phi::GPUContext& dev_ctx, + const framework::Tensor& x, framework::Tensor* y, + MT factor) { + std::vector ins = {&x}; + std::vector outs = {y}; + auto functor = phi::funcs::ScaleFunctor(factor); + phi::funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); +} + template void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, - const std::string dropout_implementation, float dropout_prob, bool upscale_in_train, bool is_fix_seed, int seed_val, const framework::Tensor& x, const framework::Tensor* seed, - framework::Tensor* mask, framework::Tensor* y) { - auto& place = *dev_ctx.eigen_device(); + framework::Tensor* mask, framework::Tensor* y, + bool is_dropout_nd = false) { int64_t x_numel = x.numel(); auto stream = dev_ctx.stream(); auto* x_data = x.data(); @@ -198,33 +336,38 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, size_t main_offset = size / (block_size * kVecSize) * (block_size * kVecSize); + if (is_dropout_nd) { + VectorizedGeneratorMask<<>>( + size, seed_data, dropout_prob, x_data, mask_data, increment, + main_offset); + + framework::Tensor broadcasted_mask; + broadcasted_mask.Resize(x.dims()); + CalcBroadcastedMask(dev_ctx, *mask, &broadcasted_mask); + + auto dst_functor = DstFunctor(1.0f - dropout_prob, + upscale_in_train, x_numel); + std::vector ins = {&x, &broadcasted_mask}; + std::vector outs = {y}; + phi::funcs::ElementwiseKernel(dev_ctx, ins, &outs, dst_functor); + } else { #define PD_DROPOUT_KERNEL_NAME VectorizedRandomGenerator - PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL( - !is_fix_seed, PD_DROPOUT_KERNEL_NAME, grid_size, block_size, 0, stream, - offset, KERNEL_PARAMS.As(1), KERNEL_PARAMS.As(7), - size, seed_data, dropout_prob, x_data, mask_data, y_data, - upscale_in_train, increment, main_offset); + PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL( + !is_fix_seed, PD_DROPOUT_KERNEL_NAME, grid_size, block_size, 0, + stream, offset, KERNEL_PARAMS.As(1), + KERNEL_PARAMS.As(7), size, seed_data, dropout_prob, x_data, + mask_data, y_data, upscale_in_train, increment, main_offset); #undef PD_DROPOUT_KERNEL_NAME + } } else { if (upscale_in_train) { -// todo: can y share with data with x directly? 
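On the is_test path of this forward driver, the behaviour reduces to a single scale factor: with upscale_in_train the train-time output was already divided by (1 - p), so at test time y equals x (hence the plain TensorCopy), otherwise y is x scaled by (1 - p). A one-line sketch of that factor; the helper name is illustrative.

// Scale applied to x when is_test == true.
inline float DropoutInferenceScale(bool upscale_in_train, float dropout_prob) {
  return upscale_in_train ? 1.0f : 1.0f - dropout_prob;
}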
-#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemcpyAsync(y_data, x_data, sizeof(T) * x_numel, - hipMemcpyDeviceToDevice, stream)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpyAsync(y_data, x_data, sizeof(T) * x_numel, - cudaMemcpyDeviceToDevice, stream)); -#endif + // y = x + framework::TensorCopy(x, dev_ctx.GetPlace(), dev_ctx, y); } else { using MT = typename details::MPTypeTrait::Type; MT factor = static_cast(1.0f - dropout_prob); - std::vector ins = {&x}; - std::vector outs = {y}; - auto functor = phi::funcs::ScaleFunctor(factor); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); + // y = factor * x + ScaleByDropoutFactor(dev_ctx, x, y, factor); } } } @@ -246,45 +389,44 @@ struct CudaDropoutGradFunctor { }; template -void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, - const std::string dropout_implementation, - float dropout_prob, +void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, + float dropout_prob, bool upscale_in_train, const framework::Tensor& grad_y, - const framework::Tensor& mask, int64_t size, + const framework::Tensor& mask, framework::Tensor* grad_x, - bool is_test = false) { + bool is_dropout_nd = false) { using MT = typename details::MPTypeTrait::Type; + auto stream = dev_ctx.stream(); - MT factor; if (is_test) { - if (dropout_implementation == "upscale_in_train") { - factor = static_cast(1.0f); - } else { - factor = static_cast(1.0f - dropout_prob); - } - std::vector ins = {&grad_y}; - std::vector outs = {grad_x}; - auto functor = phi::funcs::ScaleFunctor(factor); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); + MT factor = static_cast(upscale_in_train ? 1.0f : 1.0f - dropout_prob); + // y = factor * x + ScaleByDropoutFactor(dev_ctx, grad_y, grad_x, factor); } else { - std::vector ins = {&grad_y, &mask}; + framework::Tensor broadcasted_mask; + if (is_dropout_nd) { + broadcasted_mask.Resize(grad_y.dims()); + CalcBroadcastedMask(dev_ctx, mask, &broadcasted_mask); + } + + std::vector ins = { + &grad_y, is_dropout_nd ? &broadcasted_mask : &mask}; std::vector outs = {grad_x}; - if (dropout_implementation == "upscale_in_train") { + if (upscale_in_train) { if (dropout_prob == 1.0f) { #ifdef PADDLE_WITH_HIP - hipMemset(grad_x->data(), 0, size * sizeof(T)); + hipMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); #else - cudaMemset(grad_x->data(), 0, size * sizeof(T)); + cudaMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); #endif } else { - factor = static_cast(1.0f / (1.0f - dropout_prob)); - paddle::operators::LaunchSameDimsElementwiseCudaKernel( + MT factor = static_cast(1.0f / (1.0f - dropout_prob)); + phi::funcs::ElementwiseKernel( dev_ctx, ins, &outs, CudaDropoutGradFunctor(factor)); } } else { - factor = static_cast(1.0f); - paddle::operators::LaunchSameDimsElementwiseCudaKernel( + MT factor = static_cast(1.0f); + phi::funcs::ElementwiseKernel( dev_ctx, ins, &outs, CudaDropoutGradFunctor(factor)); } } diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 9426efa494208..3f65a6bfda97f 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -161,15 +161,49 @@ class DropoutGradOpMaker : public framework::SingleGradOpMaker { } }; +class DropoutNdOpMaker : public DropoutOpMaker { + public: + void Make() override { + DropoutOpMaker::Make(); + AddAttr>("axis", + "(std::vector). 
List of integers," + " indicating the dimensions to be dropout_nd.") + .SetDefault({}); + } +}; + +template +class DropoutNdGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("dropout_nd_grad"); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetInput("Mask", this->Output("Mask")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetAttrMap(this->Attrs()); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; + DECLARE_INFER_SHAPE_FUNCTOR(dropout, DropoutInferShapeFunctor, PD_INFER_META(phi::DropoutInferMeta)); - REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker, ops::DropoutGradOpMaker, ops::DropoutGradOpMaker, DropoutInferShapeFunctor); REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad); + +DECLARE_INFER_SHAPE_FUNCTOR(dropout_nd, DropoutNdInferShapeFunctor, + PD_INFER_META(phi::DropoutNdInferMeta)); +REGISTER_OPERATOR(dropout_nd, ops::DropoutOp, ops::DropoutNdOpMaker, + ops::DropoutNdGradOpMaker, + ops::DropoutNdGradOpMaker, + DropoutNdInferShapeFunctor); +REGISTER_OPERATOR(dropout_nd_grad, ops::DropoutOpGrad); diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index ce95b0a320c66..ef00a0203c7b8 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -131,8 +131,7 @@ class FMHARef { auto functor = phi::funcs::ScaleFunctor(alpha); std::vector ins = {&q_tensor}; std::vector outs = {&q_tensor}; - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx_, ins, - &outs, functor); + phi::funcs::ElementwiseKernel(dev_ctx_, ins, &outs, functor); } // q*k^t, batched_gemm @@ -186,13 +185,11 @@ class FMHARef { if (dropout_param_.dropout_prob_) { DropoutFwGPUKernelDriver( static_cast(dev_ctx_), - dropout_param_.is_test_, - static_cast( - dropout_param_.dropout_implementation_), - dropout_param_.dropout_prob_, dropout_param_.is_upscale_in_train_, - dropout_param_.is_fix_seed_, dropout_param_.seed_val_, + dropout_param_.is_test_, dropout_param_.dropout_prob_, + dropout_param_.is_upscale_in_train_, dropout_param_.is_fix_seed_, + dropout_param_.seed_val_, static_cast(*softmax_out_tensor), dropout_param_.seed_, - dropout_mask_out_tensor, dropout_out_tensor); + dropout_mask_out_tensor, dropout_out_tensor, false); blas.BatchedGEMM(transA, transB, gemm_m, gemm_n, gemm_k, alpha, dropout_out_data, v_ptr, beta, qktv_out_data, gemm_batch_size, stride_a, stride_b); @@ -288,13 +285,10 @@ class FMHARef { // dropout bw if (dropout_param_.dropout_prob_) { DropoutGradGPUKernelDriver( - static_cast(dev_ctx_), - static_cast( - dropout_param_.dropout_implementation_), - dropout_param_.dropout_prob_, + static_cast(dev_ctx_), false, + dropout_param_.dropout_prob_, dropout_param_.is_upscale_in_train_, static_cast(*dropout_out_grad_tensor), - dropout_mask_out_tensor, softmax_out_grad_tensor->numel(), - softmax_out_grad_tensor); + dropout_mask_out_tensor, softmax_out_grad_tensor, false); } if (src_mask_tensor != nullptr) { diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index fc044e0bafa31..8c551db1f8bca 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -541,7 +541,7 @@ void 
LaunchLayernormResidualDropoutGrad( if (!is_upscale_in_train) { factor = static_cast(1.0f); } - ln_bwd_1024_kernel_driver< + ln_bwd_fast_kernel_driver< T, U, LayerNormScaleBiasT, MaskType>( dev_ctx, rows, cols, epsilon, layernorm_src, scale, mean, var, d_out, d_residual, d_scale, d_layernorm_bias, mask_data, factor, d_dropout_src); diff --git a/paddle/fluid/operators/gather_nd_op_mlu.cc b/paddle/fluid/operators/gather_nd_op_mlu.cc new file mode 100644 index 0000000000000..c7d39b927f305 --- /dev/null +++ b/paddle/fluid/operators/gather_nd_op_mlu.cc @@ -0,0 +1,123 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class GatherNdMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto *out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + out->template mutable_data(place); + + if (x->numel() == 0) return; + if (index->numel() == 0) { + auto &dev_ctx = ctx.template device_context(); + framework::TensorCopy(*x, place, dev_ctx, out); + return; + } + + const auto &index_type = framework::TransToProtoVarType(index->dtype()); + bool index_type_match = index_type == framework::proto::VarType::INT32 || + index_type == framework::proto::VarType::INT64; + PADDLE_ENFORCE_EQ(index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s]", + paddle::framework::DataTypeToString(index_type), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT32), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT64))); + + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc index_desc(*index); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnl::GatherNd(ctx, x_desc.get(), GetBasePtr(x), index_desc.get(), + GetBasePtr(index), out_desc.get(), GetBasePtr(out)); + } +}; + +template +class GatherNdGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *index = ctx.Input("Index"); + auto *dout = ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); + auto *x = ctx.Input("X"); + + if (dx->numel() == 0) return; + if (index->numel() == 0) { + auto &dev_ctx = ctx.template device_context(); + framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx); + return; + } + + framework::Tensor tmp_tensor(index->type()); + framework::Tensor tmp_tensor2(dout->type()); + const auto index_dims = index->dims(); + if (index_dims.size() == 1) { + tmp_tensor.ShareDataWith(*index); + std::vector new_dim = 
{1, index_dims[0]}; + tmp_tensor.Resize(phi::make_ddim(new_dim)); + index = &tmp_tensor; + + tmp_tensor2.ShareDataWith(*dout); + std::vector new_dim2{1}; + for (int i = index->numel(); i < x->dims().size(); i++) { + new_dim2.push_back(x->dims()[i]); + } + tmp_tensor2.Resize(phi::make_ddim(new_dim2)); + dout = &tmp_tensor2; + } + + dx->mutable_data(ctx.GetPlace()); + MLUCnnlTensorDesc dx_desc(*dx); + auto value = static_cast(0); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value, dx_desc.get(), + GetBasePtr(dx)); + + MLUCnnlTensorDesc index_desc(*index); + MLUCnnlTensorDesc dout_desc(*dout); + + const cnnlScatterNdMode_t mode = CNNL_SCATTERND_ADD; + MLUCnnl::ScatterNd(ctx, mode, index_desc.get(), GetBasePtr(index), + dout_desc.get(), GetBasePtr(dout), dx_desc.get(), + GetBasePtr(dx), dx_desc.get(), GetBasePtr(dx)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_MLU_KERNEL(gather_nd, ops::GatherNdMLUKernel, + ops::GatherNdMLUKernel); + +REGISTER_OP_MLU_KERNEL(gather_nd_grad, + ops::GatherNdGradMLUKernel, + ops::GatherNdGradMLUKernel); diff --git a/paddle/fluid/operators/ipu/CMakeLists.txt b/paddle/fluid/operators/ipu/CMakeLists.txt index 66373d4b5f6b9..3e330ea10deef 100644 --- a/paddle/fluid/operators/ipu/CMakeLists.txt +++ b/paddle/fluid/operators/ipu/CMakeLists.txt @@ -1,3 +1,3 @@ if(WITH_IPU) op_library(ipu_runtime_op DEPS ipu_backend) -endif(WITH_IPU) +endif() diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index ac20a5962f394..3519a07539182 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -22,6 +22,8 @@ limitations under the License. */ namespace cub = hipcub; #endif +#include + #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/phi/core/ddim.h" @@ -428,7 +430,7 @@ template < int THREADS_PER_CTA = WARPS_M *THREADS_PER_ROW, int ROWS_PER_CTA = WARPS_M, int ELTS_PER_ROW_PER_CTA = THREADS_PER_ROW *VecSize, int LDGS = ELTS_PER_ROW / ELTS_PER_ROW_PER_CTA> -__global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( +__global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_fast_kernel( const int rows, float epsilon, const T *__restrict__ x_ptr, const ScaleT *__restrict__ gamma_ptr, const U *__restrict__ mean_ptr, const U *__restrict__ var_ptr, const T *__restrict__ dout_ptr, @@ -671,7 +673,7 @@ template < int ELTS_PER_ROW_PER_CTA = THREADS_PER_ROW *VecSize, int LDGS = ELTS_PER_ROW / ELTS_PER_ROW_PER_CTA, int VEC_COLS = ELTS_PER_ROW / VecSize> -__global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( +__global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_fast_final_kernel( const int rows, U *__restrict__ dg_part_, U *__restrict__ db_part_, ScaleT *__restrict__ dg_, ScaleT *__restrict__ db_) { using Vec = phi::AlignedVector; @@ -795,7 +797,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( */ template -void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows, +void ln_bwd_fast_kernel_driver(const phi::GPUContext &dev_ctx, const int rows, const int cols, float epsilon, const T *x_ptr, const ScaleT *scale_ptr, const U *mean_ptr, const U *var_ptr, const T *dout_ptr, T *dx_ptr, @@ -804,10 +806,10 @@ void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows, T factor = static_cast(0), T *d_dropout_src_ptr = 
nullptr) { auto stream = dev_ctx.stream(); - if (cols == 1024) { + if (cols == 1024 || cols == 384 || cols == 256) { // step-1: compute dx and reduced part results of dscale and dbias. - const int WARPS_M = 4; - const int WARPS_N = 1; + const int WARPS_M = 4; // how many rows delt in a cta. + const int WARPS_N = 1; // how many warps to deal with a row. const int BYTES_PER_LDG = 16; const int VecSize = BYTES_PER_LDG / sizeof(T); @@ -839,20 +841,52 @@ void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows, "To compute fused_dropout_residual_ln grad, d_dropout_src_ptr " "can't be null")); } - fused_ln_bwd_1024_kernel - <<>>( - rows, epsilon, x_ptr, scale_ptr, mean_ptr, var_ptr, dout_ptr, - dscale_temp_ptr, dbias_temp_ptr, dx_ptr, mask_ptr, factor, - d_dropout_src_ptr); +#define LAUNCH_MASK_FUSED_LN_BWD_FAST_KERNEL(vec_size, ele_per_row) \ + fused_ln_bwd_fast_kernel \ + <<>>( \ + rows, epsilon, x_ptr, scale_ptr, mean_ptr, var_ptr, dout_ptr, \ + dscale_temp_ptr, dbias_temp_ptr, dx_ptr, mask_ptr, factor, \ + d_dropout_src_ptr); + + if (cols == 1024) { + LAUNCH_MASK_FUSED_LN_BWD_FAST_KERNEL(VecSize, 1024); + } else { + switch (cols) { + case 384: + LAUNCH_MASK_FUSED_LN_BWD_FAST_KERNEL(1, 384); + break; + case 256: + LAUNCH_MASK_FUSED_LN_BWD_FAST_KERNEL(VecSize, 256); + break; + } + } +#undef LAUNCH_MASK_FUSED_LN_BWD_FAST_KERNEL } else { - fused_ln_bwd_1024_kernel - <<>>( - rows, epsilon, x_ptr, scale_ptr, mean_ptr, var_ptr, dout_ptr, - dscale_temp_ptr, dbias_temp_ptr, dx_ptr); +#define LAUNCH_FUSED_LN_BWD_FAST_KERNEL(vec_size, ele_per_row) \ + fused_ln_bwd_fast_kernel \ + <<>>( \ + rows, epsilon, x_ptr, scale_ptr, mean_ptr, var_ptr, dout_ptr, \ + dscale_temp_ptr, dbias_temp_ptr, dx_ptr); + + if (cols == 1024) { + LAUNCH_FUSED_LN_BWD_FAST_KERNEL(VecSize, 1024); + } else { + switch (cols) { + case 384: + LAUNCH_FUSED_LN_BWD_FAST_KERNEL(1, 384); + break; + case 256: + LAUNCH_FUSED_LN_BWD_FAST_KERNEL(VecSize, 256); + break; + } + } + +#undef LAUNCH_FUSED_LN_BWD_FAST_KERNEL } + const int WARPS_M_2 = 16; const int WARPS_N_2 = 1; const int BYTES_PER_LDG_2 = 4; @@ -865,18 +899,36 @@ void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows, WARPS_M_2 * THREADS_PER_ROW_2; // 16 * 32 = 512 const int ROWS_PER_CTA_2 = WARPS_M_2; // 16 - const int gridx_2 = static_cast( - std::ceil(1024 / static_cast(THREADS_PER_ROW_2 * VecSize_2))); // #blocks: 32,#threads_per_block: 512 // Note: it is not supported for double type. 
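A plausible reading of the LAUNCH_*_FAST_KERNEL dispatch above, assuming one warp (32 threads) per row as WARPS_N = 1 suggests: a column count can use the 16-byte vectorized loads only if each row splits evenly over threads_per_row * vec_size, which holds for 1024 and 256 but not for 384, so 384 falls back to vec_size = 1. The check below is illustrative, not code from the kernel.

// cols must split evenly across the lanes of a row for a given vector width.
constexpr bool RowFitsVectorWidth(int cols, int threads_per_row, int vec_size) {
  return cols % (threads_per_row * vec_size) == 0;
}
static_assert(RowFitsVectorWidth(1024, 32, 8), "1024 cols, 8 halves per load");
static_assert(!RowFitsVectorWidth(384, 32, 8), "384 cols needs vec_size = 1");
static_assert(RowFitsVectorWidth(384, 32, 1), "384 cols, scalar loads");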
if (sizeof(U) > 4) { PADDLE_THROW(platform::errors::InvalidArgument( "Only support float and fp16 type")); } else { - ln_bwd_1024_final_kernel - <<>>( - gridx, dscale_temp_ptr, dbias_temp_ptr, dscale_ptr, dbias_ptr); + int gridx_2 = 0; + +#define LAUNCH_LN_BWD_BETA_GAMMMA_KERNEL(vec_size, ele_per_row) \ + gridx_2 = static_cast(std::ceil( \ + ele_per_row / static_cast(THREADS_PER_ROW_2 * vec_size))); \ + ln_bwd_fast_final_kernel \ + <<>>( \ + gridx, dscale_temp_ptr, dbias_temp_ptr, dscale_ptr, dbias_ptr); + + if (cols == 1024) { + LAUNCH_LN_BWD_BETA_GAMMMA_KERNEL(VecSize_2, 1024); + } else { + switch (cols) { + case 384: + LAUNCH_LN_BWD_BETA_GAMMMA_KERNEL(1, 384); + break; + case 256: + LAUNCH_LN_BWD_BETA_GAMMMA_KERNEL(VecSize_2, 256); + break; + } + } + +#undef LAUNCH_LN_BWD_BETA_GAMMMA_KERNEL } } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -1484,15 +1536,17 @@ static void LayerNormBackward( case 7: // d_x != nullptr, d_scale != nullptr, d_bias != nullptr { #ifdef PADDLE_WITH_CUDA - bool can_call_1024_kernel = false; + bool can_call_fast_kernel = false; // todo: rule out double type. - if (feature_size == 1024 && sizeof(T) <= 4) { - can_call_1024_kernel = true; + if ((feature_size == 1024 || feature_size == 384 || + feature_size == 256) && + sizeof(T) <= 4) { + can_call_fast_kernel = true; } - VLOG(6) << "can_call_1024_kernel = " << can_call_1024_kernel; - if (can_call_1024_kernel) { - ln_bwd_1024_kernel_driver< + VLOG(6) << "can_call_fast_kernel = " << can_call_fast_kernel; + if (can_call_fast_kernel) { + ln_bwd_fast_kernel_driver< T, U, LayerNormScaleBiasT>( dev_ctx, batch_size, feature_size, epsilon, x, scale, mean, var, d_y, d_x, d_scale, d_bias); diff --git a/paddle/fluid/operators/meshgrid_op_mlu.cc b/paddle/fluid/operators/meshgrid_op_mlu.cc new file mode 100644 index 0000000000000..e45f0be958b7e --- /dev/null +++ b/paddle/fluid/operators/meshgrid_op_mlu.cc @@ -0,0 +1,75 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class MeshgridMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + auto outs = ctx.MultiOutput("Out"); + PADDLE_ENFORCE_EQ( + (ins.size() > 1) && (ins.size() < 7), true, + platform::errors::InvalidArgument( + "Excepted Tensor numbers between 2 and 6, but only received d% .", + ins.size())); + + int64_t size = ins.size(); + std::vector shape(size); + + for (int64_t i = 0; i < size; i++) { + switch (ins[i]->dims().size()) { + case 0: + shape[i] = 1; + break; + case 1: + shape[i] = ins[i]->dims()[0]; + break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Expected scalar or 1D tensor in the tensor list but got tensor " + "%d: ", + i)); + } + } + + MLUCnnlTensorDesc out_desc(size, shape.data(), ToCnnlDataType()); + framework::DDim out_dims = phi::make_ddim(shape); + for (int64_t i = 0; i < size; i++) { + std::vector view_shape(size, 1); + view_shape[i] = shape[i]; + + outs[i]->Resize(out_dims); + outs[i]->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc in_desc(size, view_shape.data(), ToCnnlDataType()); + MLUCnnl::BroadcastTo(ctx, in_desc.get(), GetBasePtr(ins[i]), + out_desc.get(), GetBasePtr(outs[i])); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_MLU_KERNEL( + meshgrid, paddle::operators::MeshgridMLUKernel, + paddle::operators::MeshgridMLUKernel, + paddle::operators::MeshgridMLUKernel, + paddle::operators::MeshgridMLUKernel); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 5b452ca3ba2ea..dc8301b9e0b8d 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -1942,6 +1942,25 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { cast_type, output_desc, output)); } +/*static*/ void MLUCnnl::Clip(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t x_desc, + const void* x, const void* min, const void* max, + void* y) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlClip(handle, x_desc, x, min, max, y)); +} + +/*static*/ void MLUCnnl::HardtanhBackward( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t x_desc, + const void* x, const cnnlTensorDescriptor_t diff_y_desc, const void* diff_y, + const float max_val, const float min_val, + const cnnlTensorDescriptor_t diff_x_desc, void* diff_x) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlHardtanhBackward(handle, x_desc, x, diff_y_desc, diff_y, max_val, + min_val, diff_x_desc, diff_x)); +} + /* static */ void MLUCnnl::PoolingBackward( const ExecutionContext& ctx, const cnnlPoolingDescriptor_t pooling_desc, const void* alpha, const cnnlTensorDescriptor_t y_desc, const void* y, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index ebb8aae1eb329..774e297c06dd0 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -439,6 +439,16 @@ class MLUCnnl { const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output); + static void Clip(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, const void* input, + const void* min, const void* max, void* y); + + static void HardtanhBackward( + const 
ExecutionContext& ctx, const cnnlTensorDescriptor_t x_desc, + const void* x, const cnnlTensorDescriptor_t diff_y_desc, + const void* diff_y, const float max_val, const float min_val, + const cnnlTensorDescriptor_t diff_x_desc, void* diff_x); + static void Div(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t in0_desc, const void* in0, @@ -1159,7 +1169,7 @@ class MLUCnnl { static void ConvBackpropInput( const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, - const cnnlTensorDescriptor_t input_desc, const void* filter, + const cnnlTensorDescriptor_t filter_desc, const void* filter, const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, const cnnlTensorDescriptor_t in_backprop_desc, void* in_backprop); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc index 0f9bcc4c2d977..9b8e67eb6fc91 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc @@ -147,8 +147,10 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("is_grad_scaled_by_nranks", "Whether the input gradient has been scaled by nranks.") .SetDefault(true); - AddAttr("ring_id", "The ring id of the NCCL communicator.") - .SetDefault(0); + AddAttr("nranks", "The world size.").SetDefault(1); + AddAttr>("ring_id", + "The ring id of the NCCL communicator.") + .SetDefault({0}); AddComment("The DistributedFusedLamb optimizer."); } }; diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index e7f6223968f43..09233ab99574b 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -806,23 +806,24 @@ static void LaunchScaleKernel(const platform::CUDADeviceContext &dev_ctx, #undef PD_LAMB_VEC_SCALE_KERNEL_CASE } -template -static void NCCLReduceScatterWithScale( - const T *sendbuff, T *recvbuff, size_t recvcount, size_t nranks, - ncclComm_t comm, gpuStream_t stream, - const platform::CUDADeviceContext &dev_ctx, const T *scale = nullptr) { +template +static void NCCLSumWithScaleBase(const T *sendbuff, T *recvbuff, + size_t recvcount, size_t nranks, + ncclComm_t comm, gpuStream_t stream, + const platform::CUDADeviceContext &dev_ctx, + const T *scale = nullptr) { static_assert(std::is_same::value || std::is_same::value, "T must be either float32 or float16."); if (recvcount == 0) return; + auto numel = UseReduceScatter ? 
(recvcount * nranks) : recvcount; if (comm == nullptr) { if (scale != nullptr) { PADDLE_ENFORCE_EQ(nranks, 1, platform::errors::InvalidArgument( "nranks must be 1 when scale != nullptr.")); - LaunchScaleKernel(dev_ctx, sendbuff, scale, recvbuff, recvcount * nranks, - stream); + LaunchScaleKernel(dev_ctx, sendbuff, scale, recvbuff, numel, stream); } return; } @@ -834,14 +835,18 @@ static void NCCLReduceScatterWithScale( scale && CreatePreMulScaleOpIfSupported(dtype, comm, scale, &op); memory::Buffer buffer(dev_ctx.GetPlace()); if (scale && !should_destroy_op) { - size_t numel = recvcount * nranks; T *new_sendbuff = buffer.Alloc(numel); LaunchScaleKernel(dev_ctx, sendbuff, scale, new_sendbuff, numel, stream); sendbuff = new_sendbuff; } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter( - sendbuff, recvbuff, recvcount, dtype, op, comm, stream)); + if (UseReduceScatter) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter( + sendbuff, recvbuff, recvcount, dtype, op, comm, stream)); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, recvcount, dtype, op, comm, stream)); + } #if NCCL_VERSION_CODE >= 21100 if (should_destroy_op) { @@ -851,6 +856,26 @@ static void NCCLReduceScatterWithScale( } #endif } + +template +static void NCCLReduceScatterWithScale( + const T *sendbuff, T *recvbuff, size_t recvcount, size_t nranks, + ncclComm_t comm, gpuStream_t stream, + const platform::CUDADeviceContext &dev_ctx, const T *scale = nullptr) { + NCCLSumWithScaleBase(sendbuff, recvbuff, recvcount, nranks, comm, + stream, dev_ctx, scale); +} + +template +static void NCCLAllReduceWithScale(const T *sendbuff, T *recvbuff, + size_t recvcount, size_t nranks, + ncclComm_t comm, gpuStream_t stream, + const platform::CUDADeviceContext &dev_ctx, + const T *scale = nullptr) { + NCCLSumWithScaleBase(sendbuff, recvbuff, recvcount, nranks, comm, + stream, dev_ctx, scale); +} + #endif template "exactly by the element number %d of Moment1.", numel, partial_numel)); + // The num_devices means the number of devices that shard a complete set + // of all parameters. It may be num_devices < nranks or num_devices == + // nranks. 
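To make the sharding comment above concrete: num_devices ranks together hold one full copy of the sharded optimizer state, and the world size may be a multiple of that group, in which case the kernel runs in local_shard mode (all-reduce over the global communicator, then each rank consumes its own slice). A standalone sketch of the mode selection and slice offset under those assumptions; MakeShardPlan is an illustrative name.

#include <cassert>
#include <cstdint>

struct ShardPlan {
  bool local_shard;      // true when nranks > num_devices
  int64_t shard_offset;  // element offset of this rank's slice
};

ShardPlan MakeShardPlan(int64_t nranks, int64_t num_devices,
                        int64_t local_rank, int64_t numel_each_device) {
  assert(nranks >= num_devices && nranks % num_devices == 0);
  return {nranks > num_devices, local_rank * numel_each_device};
}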
int64_t num_devices = numel / partial_numel; VLOG(1) << "num_devices = " << num_devices << " , partial_numel = " << partial_numel; @@ -1354,22 +1382,43 @@ class DistributedFusedLambOpKernel auto epsilon = ctx.Attr("epsilon"); auto max_global_grad_norm = ctx.Attr("max_global_grad_norm"); auto clip_after_allreduce = ctx.Attr("clip_after_allreduce"); - auto ring_id = ctx.Attr("ring_id"); + auto nranks = ctx.Attr("nranks"); + PADDLE_ENFORCE_GE(nranks, num_devices, + phi::errors::InvalidArgument( + "The nranks must be not less than num_devices.")); + PADDLE_ENFORCE_EQ( + nranks % num_devices, 0, + phi::errors::InvalidArgument( + "The nranks must be exactly divided by num_devices.")); + bool local_shard = (nranks > num_devices); + + const auto &ring_ids = ctx.Attr>("ring_id"); auto use_master_param_norm = ctx.Attr("use_master_param_norm"); auto is_grad_scaled_by_nranks = ctx.Attr("is_grad_scaled_by_nranks"); VLOG(10) << "max_global_grad_norm = " << max_global_grad_norm << " , clip_after_allreduce = " << clip_after_allreduce << " , use_master_param_norm = " << use_master_param_norm - << " , is_grad_scaled_by_nranks = " << is_grad_scaled_by_nranks; + << " , is_grad_scaled_by_nranks = " << is_grad_scaled_by_nranks + << " , local_shard = " << local_shard; // Step 6: allreduce + global norm gradient clip - int rank = 0; - ncclComm_t comm = nullptr; - if (num_devices > 1) { + int64_t global_rank = 0, local_rank = 0; + ncclComm_t global_comm = nullptr, local_comm = nullptr; + if (nranks > 1) { auto *nccl_comm_handle = - platform::NCCLCommContext::Instance().Get(ring_id, place); - comm = nccl_comm_handle->comm(); - rank = nccl_comm_handle->rank(); + platform::NCCLCommContext::Instance().Get(ring_ids[0], place); + global_comm = nccl_comm_handle->comm(); + global_rank = nccl_comm_handle->rank(); + + if (local_shard) { + auto *local_nccl_comm_handle = + platform::NCCLCommContext::Instance().Get(ring_ids[1], place); + local_comm = local_nccl_comm_handle->comm(); + local_rank = local_nccl_comm_handle->rank(); + } else { + local_comm = global_comm; + local_rank = global_rank; + } } memory::Buffer grad_norm_square_buffer(place); @@ -1381,8 +1430,15 @@ class DistributedFusedLambOpKernel platform::float16 *fp16_sum_grad; auto fp32_numel_each_device = fp32_numel / num_devices; auto fp16_numel_each_device = fp16_numel / num_devices; - if (num_devices > 1 || - (max_global_grad_norm > 0 && !clip_after_allreduce)) { + if (local_shard) { + auto ptr = sum_grad_buffer.Alloc( + fp32_numel * sizeof(float) + fp16_numel * sizeof(platform::float16)); + fp32_sum_grad = has_fp32_param ? reinterpret_cast(ptr) : nullptr; + fp16_sum_grad = has_fp16_param ? 
reinterpret_cast( + ptr + fp32_numel * sizeof(float)) + : nullptr; + } else if (nranks > 1 || + (max_global_grad_norm > 0 && !clip_after_allreduce)) { auto ptr = sum_grad_buffer.Alloc( fp32_numel_each_device * sizeof(float) + fp16_numel_each_device * sizeof(platform::float16)); @@ -1404,18 +1460,27 @@ class DistributedFusedLambOpKernel float rescale_grad = 1.0f; if (!is_grad_scaled_by_nranks) { - rescale_grad /= num_devices; + rescale_grad /= nranks; } if (max_global_grad_norm > 0) { if (clip_after_allreduce) { // (1) ReduceScater first - NCCLReduceScatterWithScale(fp32_grad, fp32_sum_grad, - fp32_numel_each_device, num_devices, comm, - stream, dev_ctx); - NCCLReduceScatterWithScale(fp16_grad, fp16_sum_grad, - fp16_numel_each_device, num_devices, comm, - stream, dev_ctx); + if (local_shard) { + NCCLAllReduceWithScale(fp32_grad, fp32_sum_grad, fp32_numel, nranks, + global_comm, stream, dev_ctx); + NCCLAllReduceWithScale(fp16_grad, fp16_sum_grad, fp16_numel, nranks, + global_comm, stream, dev_ctx); + fp32_sum_grad += (local_rank * fp32_numel_each_device); + fp16_sum_grad += (local_rank * fp16_numel_each_device); + } else { + NCCLReduceScatterWithScale(fp32_grad, fp32_sum_grad, + fp32_numel_each_device, nranks, + global_comm, stream, dev_ctx); + NCCLReduceScatterWithScale(fp16_grad, fp16_sum_grad, + fp16_numel_each_device, nranks, + global_comm, stream, dev_ctx); + } // (2) Calculate the global grad norm GetSquareGradNorm(fp32_sum_grad, fp32_numel_each_device, fp16_sum_grad, fp16_numel_each_device, fp32_square_grad_norm, stream, @@ -1425,7 +1490,7 @@ class DistributedFusedLambOpKernel if (num_devices > 1) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( fp32_square_grad_norm, fp32_square_grad_norm, 1, ncclFloat32, - ncclSum, comm, stream)); + ncclSum, local_comm, stream)); } VLOG(1) << "Grad square norm after all reduce: " << FlattenToString(fp32_square_grad_norm, 1, place); @@ -1452,7 +1517,7 @@ class DistributedFusedLambOpKernel float clip_scale = 1.0f; if (is_grad_scaled_by_nranks) { - clip_scale *= num_devices; + clip_scale *= nranks; } CalcGradNormClipBeforeAllReduceScale <<<1, 1, 0, stream>>>(global_scale, max_global_grad_norm, @@ -1463,36 +1528,54 @@ class DistributedFusedLambOpKernel } else { VLOG(1) << "Grad scale: " << FlattenToString(fp16_scale, 1, place); } - if (num_devices > 1) { + if (nranks > 1) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( fp32_square_grad_norm, fp32_square_grad_norm, 1, ncclFloat32, - ncclSum, comm, stream)); + ncclSum, global_comm, stream)); } // (3) Do ReduceScatter with scale - NCCLReduceScatterWithScale(fp32_grad, fp32_sum_grad, - fp32_numel_each_device, num_devices, comm, - stream, dev_ctx, fp32_scale); - NCCLReduceScatterWithScale(fp16_grad, fp16_sum_grad, - fp16_numel_each_device, num_devices, comm, - stream, dev_ctx, fp16_scale); + if (local_shard) { + NCCLAllReduceWithScale(fp32_grad, fp32_sum_grad, fp32_numel, nranks, + global_comm, stream, dev_ctx, fp32_scale); + NCCLAllReduceWithScale(fp16_grad, fp16_sum_grad, fp16_numel, nranks, + global_comm, stream, dev_ctx, fp16_scale); + fp32_sum_grad += (local_rank * fp32_numel_each_device); + fp16_sum_grad += (local_rank * fp16_numel_each_device); + } else { + NCCLReduceScatterWithScale(fp32_grad, fp32_sum_grad, + fp32_numel_each_device, nranks, + global_comm, stream, dev_ctx, fp32_scale); + NCCLReduceScatterWithScale(fp16_grad, fp16_sum_grad, + fp16_numel_each_device, nranks, + global_comm, stream, dev_ctx, fp16_scale); + } // (4) mark max_global_grad_norm as 0, meaning that 
clip has been // already performed max_global_grad_norm = 0; } } else { - NCCLReduceScatterWithScale(fp32_grad, fp32_sum_grad, - fp32_numel_each_device, num_devices, comm, - stream, dev_ctx); - NCCLReduceScatterWithScale(fp16_grad, fp16_sum_grad, - fp16_numel_each_device, num_devices, comm, - stream, dev_ctx); + if (local_shard) { + NCCLAllReduceWithScale(fp32_grad, fp32_sum_grad, fp32_numel, nranks, + global_comm, stream, dev_ctx); + NCCLAllReduceWithScale(fp16_grad, fp16_sum_grad, fp16_numel, nranks, + global_comm, stream, dev_ctx); + fp32_sum_grad += (local_rank * fp32_numel_each_device); + fp16_sum_grad += (local_rank * fp16_numel_each_device); + } else { + NCCLReduceScatterWithScale(fp32_grad, fp32_sum_grad, + fp32_numel_each_device, num_devices, + global_comm, stream, dev_ctx); + NCCLReduceScatterWithScale(fp16_grad, fp16_sum_grad, + fp16_numel_each_device, num_devices, + global_comm, stream, dev_ctx); + } CheckHasNanInfGrad(fp32_sum_grad, fp32_numel_each_device, fp16_sum_grad, fp16_numel_each_device, fp32_square_grad_norm, stream, &cub_tmp_buffer); if (num_devices > 1) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( fp32_square_grad_norm, fp32_square_grad_norm, 1, ncclFloat32, - ncclSum, comm, stream)); + ncclSum, local_comm, stream)); } max_global_grad_norm = 0; } @@ -1526,8 +1609,8 @@ class DistributedFusedLambOpKernel memory::Buffer trust_ratio_div_buffer(place); auto *trust_ratio_div = trust_ratio_div_buffer.Alloc(partial_numel); - auto fp32_offset = rank * fp32_numel_each_device; - auto fp16_offset = rank * fp16_numel_each_device; + auto fp32_offset = local_rank * fp32_numel_each_device; + auto fp16_offset = local_rank * fp16_numel_each_device; if (has_fp32_param) { VLOG(10) << "Update FP32 Moment and TrustRatioDiv starts"; MultiTensorUpdateLambMomentAndTrustRatioDiv( @@ -1598,12 +1681,12 @@ class DistributedFusedLambOpKernel PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( param_square_norm + fp32_global_param_num, param_square_norm + fp32_global_param_num, - 2 * param_num - fp32_global_param_num, ncclFloat32, ncclSum, comm, - stream)); + 2 * param_num - fp32_global_param_num, ncclFloat32, ncclSum, + local_comm, stream)); } else { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( trust_ratio_div_square_norm, trust_ratio_div_square_norm, param_num, - ncclFloat32, ncclSum, comm, stream)); + ncclFloat32, ncclSum, local_comm, stream)); } VLOG(10) << "ncclAllReduce done"; } @@ -1623,7 +1706,7 @@ class DistributedFusedLambOpKernel // ncclAllGather PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( fp32_param + fp32_offset, fp32_param, fp32_numel_each_device, - ncclFloat32, comm, stream)); + ncclFloat32, local_comm, stream)); } beta1pow = nullptr; @@ -1641,7 +1724,7 @@ class DistributedFusedLambOpKernel // ncclAllGather PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( fp16_param + fp16_offset, fp16_param, fp16_numel_each_device, - ncclFloat16, comm, stream)); + ncclFloat16, local_comm, stream)); } } VLOG(10) << "Update Param done"; diff --git a/paddle/fluid/operators/randperm_op_mlu.cc b/paddle/fluid/operators/randperm_op_mlu.cc new file mode 100644 index 0000000000000..3c825f7e3aaa7 --- /dev/null +++ b/paddle/fluid/operators/randperm_op_mlu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
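The local_shard branches above rely on a simple identity: the rank'th chunk of an all-reduced buffer equals that rank's reduce-scatter output, which is why ncclAllReduce into the full-length buffer followed by advancing the pointer by local_rank * numel_each_device is interchangeable here with ncclReduceScatter. A CPU illustration of the identity; these helpers are illustrative and involve no NCCL.

#include <cstdint>
#include <vector>

// Sum of per-rank gradients, i.e. what an all-reduce leaves on every rank.
std::vector<float> AllReduceSum(const std::vector<std::vector<float>>& grads) {
  std::vector<float> sum(grads[0].size(), 0.0f);
  for (const auto& g : grads) {
    for (size_t i = 0; i < g.size(); ++i) sum[i] += g[i];
  }
  return sum;
}

// The slice a reduce-scatter would have delivered to `rank`.
std::vector<float> ShardOf(const std::vector<float>& all_reduced, int64_t rank,
                           int64_t numel_each_device) {
  auto begin = all_reduced.begin() + rank * numel_each_device;
  return std::vector<float>(begin, begin + numel_each_device);
}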
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/randperm_op.h" + +template +using kernel = + paddle::operators::RandpermKernel; + +REGISTER_OP_MLU_KERNEL(randperm, kernel, kernel, kernel, + kernel); diff --git a/paddle/fluid/operators/range_op_mlu.cc b/paddle/fluid/operators/range_op_mlu.cc new file mode 100644 index 0000000000000..ceeb0cf5c3687 --- /dev/null +++ b/paddle/fluid/operators/range_op_mlu.cc @@ -0,0 +1,72 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/range_op.h" + +namespace paddle { +namespace operators { + +template +class RangeMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* start_t = context.Input("Start"); + auto* end_t = context.Input("End"); + auto* step_t = context.Input("Step"); + auto* out = context.Output("Out"); + + framework::Tensor n; + framework::TensorCopy( + *start_t, platform::CPUPlace(), + context.template device_context(), &n); + context.template device_context() + .Wait(); + T start = n.data()[0]; + framework::TensorCopy( + *end_t, platform::CPUPlace(), + context.template device_context(), &n); + context.template device_context() + .Wait(); + T end = n.data()[0]; + framework::TensorCopy( + *step_t, platform::CPUPlace(), + context.template device_context(), &n); + context.template device_context() + .Wait(); + T step = n.data()[0]; + + int64_t size = 0; + GetSize(start, end, step, &size); + + out->Resize(phi::make_ddim({size})); + out->mutable_data(context.GetPlace()); + + std::vector odata; + T value = start; + for (int64_t i = 0; i < size; ++i) { + odata.push_back(value); + value += step; + } + + framework::TensorFromVector(odata, context.device_context(), out); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_MLU_KERNEL(range, paddle::operators::RangeMLUKernel, + paddle::operators::RangeMLUKernel, + paddle::operators::RangeMLUKernel, + paddle::operators::RangeMLUKernel) diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op_mlu.cc new file mode 100644 index 0000000000000..6cfb946bae397 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op_mlu.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
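The RangeMLUKernel above copies the scalar Start/End/Step tensors to the CPU, derives the output length with GetSize, builds the sequence on the host, and only then copies it to the MLU. A host-side sketch of that flow; the size formula is the usual ceil(|end - start| / |step|) and should be read as assumed semantics for floating-point inputs, not as Paddle's GetSize, which also covers integer types and validates the sign of step.

#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical helper mirroring the host-side part of the MLU kernel.
std::vector<float> MakeRange(float start, float end, float step) {
  assert(step != 0.0f);
  int64_t size =
      static_cast<int64_t>(std::ceil(std::abs((end - start) / step)));
  std::vector<float> out;
  out.reserve(static_cast<std::size_t>(size));
  float value = start;
  for (int64_t i = 0; i < size; ++i) {
    out.push_back(value);  // same accumulation loop as the kernel above
    value += step;
  }
  return out;
}

int main() {
  auto r = MakeRange(0.0f, 10.0f, 2.5f);  // 0, 2.5, 5, 7.5 (10 is excluded)
  assert(r.size() == 4 && r.back() == 7.5f);
  return 0;
}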
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h" + +namespace paddle { +namespace operators { + +template +class ReduceMeanMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + MLUReduceOp(context, "reduce_prod"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(reduce_prod, ops::ReduceMeanMLUKernel, + ops::ReduceMeanMLUKernel, + ops::ReduceMeanMLUKernel); diff --git a/paddle/fluid/operators/scatter_op_mlu.cc b/paddle/fluid/operators/scatter_op_mlu.cc new file mode 100644 index 0000000000000..057ba3f4a4f2f --- /dev/null +++ b/paddle/fluid/operators/scatter_op_mlu.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class ScatterMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* indices = ctx.Input("Ids"); + auto* updates = ctx.Input("Updates"); + bool overwrite = ctx.Attr("overwrite"); + auto* out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + out->mutable_data(place); + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc indices_desc(*indices); + MLUCnnlTensorDesc updates_desc(*updates); + MLUCnnlTensorDesc out_desc(*out); + cnnlScatterRefMode_t mode; + if (overwrite) { + mode = CNNL_SCATTERREF_UPDATE; + MLUCnnl::ScatterFunctor(ctx, x_desc.get(), GetBasePtr(x), + updates_desc.get(), GetBasePtr(updates), + indices_desc.get(), GetBasePtr(indices), mode); + } else { + Tensor tensor_zeros(updates->type()); + tensor_zeros.mutable_data(updates->dims(), ctx.GetPlace()); + MLUCnnlTensorDesc tensor_zeros_desc(tensor_zeros); + float value = 0.0; + auto value_t = static_cast(value); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value_t, + tensor_zeros_desc.get(), GetBasePtr(&tensor_zeros)); + mode = CNNL_SCATTERREF_UPDATE; + MLUCnnl::ScatterFunctor(ctx, x_desc.get(), GetBasePtr(x), + tensor_zeros_desc.get(), + GetBasePtr(&tensor_zeros), indices_desc.get(), + GetBasePtr(indices), mode); + mode = CNNL_SCATTERREF_ADD; + MLUCnnl::ScatterFunctor(ctx, x_desc.get(), GetBasePtr(x), + updates_desc.get(), GetBasePtr(updates), + indices_desc.get(), GetBasePtr(indices), mode); + } + paddle::framework::TensorCopy(*x, place, out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_MLU_KERNEL(scatter, ops::ScatterMLUKernel, + ops::ScatterMLUKernel); diff --git a/paddle/fluid/operators/slice_op_xpu.cc b/paddle/fluid/operators/slice_op_xpu.cc index 8f2dfd38d491b..10179e804343a 100644 --- a/paddle/fluid/operators/slice_op_xpu.cc +++ b/paddle/fluid/operators/slice_op_xpu.cc @@ -19,6 +19,8 @@ limitations under the License. */ #include #include "paddle/fluid/operators/slice_op.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/phi/kernels/funcs/slice_utils.h" #include "xpu/refactor/math.h" namespace paddle { @@ -26,76 +28,163 @@ namespace operators { using Tensor = framework::Tensor; +inline void DealTensorArray(const framework::ExecutionContext& ctx, + const std::vector& starts, + const std::vector& ends, bool out_is_array) { + auto in_array = ctx.Input("Input"); + // If the input is LoDTensorArray, the rank of input is 1. + int in_size = in_array->size(); + int start = starts[0] < 0 ? (starts[0] + in_size) : starts[0]; + int end = ends[0] < 0 ? (ends[0] + in_size) : ends[0]; + + start = std::max(start, static_cast(0)); + end = std::max(end, static_cast(0)); + end = std::min(end, in_size); + + if (starts[0] == -1 && end == 0) { + end = start + 1; + } + + PADDLE_ENFORCE_GT(end, start, + platform::errors::InvalidArgument( + "Attr(ends) should be greater than attr(starts) in " + "slice op. 
But received end = %d, start = %d.", + ends[0], starts[0])); + int out_size = end - start; + + if (out_is_array) { + auto out_array = ctx.Output("Out"); + out_array->resize(out_size); + + for (int i = 0; i < out_size; ++i) { + auto* out_tensor = &out_array->at(i); + auto in_tensor = in_array->at(i + start); + out_tensor->set_lod(in_tensor.lod()); + if (in_tensor.memory_size() > 0) { + paddle::framework::TensorCopy(in_tensor, ctx.GetPlace(), out_tensor); + } else { + VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " + "nothing has been written to output array[" + << i << "]."; + } + } + } else { + auto out = ctx.Output("Out"); + auto in_tensor = in_array->at(start); + paddle::framework::TensorCopy(in_tensor, ctx.GetPlace(), out); + } +} template class SliceXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; public: void Compute(const framework::ExecutionContext& ctx) const override { - auto in = ctx.Input("Input"); - auto out = ctx.Output("Out"); - auto axes = ctx.Attr>("axes"); - auto starts = ctx.Attr>("starts"); - auto ends = ctx.Attr>("ends"); - auto in_dims = in->dims(); - - // prepare starts, ends on XPU - int dim_value = 0, start = 0, end = 0; - // If a negative value is passed for any of the start or end indices, - // it represents number of elements before the end of that dimension. - // If the value passed to start or end is larger than the n - // (the number of elements in this dimension), it represents n. - for (size_t i = 0; i < axes.size(); ++i) { - dim_value = in_dims[axes[i]]; - start = starts[i]; - end = ends[i]; - start = start < 0 ? (start + dim_value) : start; - end = end < 0 ? (end + dim_value) : end; - start = std::max(start, 0); - end = std::max(end, 0); - end = std::min(end, dim_value); - PADDLE_ENFORCE_GT( - end, start, - platform::errors::InvalidArgument("end should greater than start")); - starts[i] = start; - ends[i] = end; - } - size_t shape_size = in_dims.size(); - // the slice XPU kernel require that the length of `start`, `end` must be - // equal - // to the dims size of input tensor, therefore, if shape_size > axes.size(), - // the `starts_extension` and `ends_extension` is necessary. 
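Both the deleted and the new slice XPU code normalize each start/end pair the same way: negative indices wrap around the dimension, both values are clamped to [0, dim], and end must remain greater than start. A standalone sketch of that normalization; it is illustrative only, since the new code delegates this to phi::funcs::CheckAndUpdateSliceAttrs.

#include <algorithm>
#include <cassert>

// Normalize one (start, end) pair against a dimension of size dim_value.
void NormalizeSliceRange(int dim_value, int* start, int* end) {
  if (*start < 0) *start += dim_value;  // negative index counts from the end
  if (*end < 0) *end += dim_value;
  *start = std::max(*start, 0);
  *end = std::max(*end, 0);
  *end = std::min(*end, dim_value);     // clamp to the dimension size
}

int main() {
  int start = -3, end = -1;        // slice x[-3:-1] on a dim of size 10
  NormalizeSliceRange(10, &start, &end);
  assert(start == 7 && end == 9);  // elements 7 and 8 are selected
  return 0;
}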
- std::vector starts_extension(shape_size, 0); - std::vector ends_extension(shape_size, 0); - if (shape_size > axes.size()) { - for (size_t i = 0; i < shape_size; ++i) { - ends_extension[i] = in_dims[i]; - } - for (size_t i = 0; i < axes.size(); ++i) { - starts_extension[axes[i]] = starts[i]; - ends_extension[axes[i]] = ends[i]; - } - } else { - starts_extension = std::move(starts); - ends_extension = std::move(ends); + const Variable* input_var = ctx.InputVar("Input"); + Variable* out_var = ctx.OutputVar("Out"); + bool input_is_array = input_var->IsType(); + bool out_is_array = out_var->IsType(); + + auto axes_int = ctx.Attr>("axes"); + auto starts_int = ctx.Attr>("starts"); + auto ends_int = ctx.Attr>("ends"); + std::vector axes(axes_int.begin(), axes_int.end()); + std::vector starts(starts_int.begin(), starts_int.end()); + std::vector ends(ends_int.begin(), ends_int.end()); + + auto decrease_axis = ctx.Attr>("decrease_axis"); + auto infer_flags = ctx.Attr>("infer_flags"); + + // Step 1: Get the accurate attribute value of starts and ends + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + if (ctx.HasInput("StartsTensor")) { + starts = GetDataFromTensor(ctx.Input("StartsTensor")); + } else if (starts_tensor_list.size() > 0) { + starts = GetDataFromTensorList(starts_tensor_list); } - // prepare shape on XPU - std::vector shape(shape_size, 0); - for (size_t i = 0; i < shape_size; ++i) { - shape[i] = in_dims[i]; + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + if (ctx.HasInput("EndsTensor")) { + ends = GetDataFromTensor(ctx.Input("EndsTensor")); + } else if (ends_tensor_list.size() > 0) { + ends = GetDataFromTensorList(ends_tensor_list); } - auto& dev_ctx = ctx.template device_context(); - const XPUType* in_data = reinterpret_cast(in->data()); - XPUType* out_data = - reinterpret_cast(out->mutable_data(ctx.GetPlace())); - int r = xpu::slice(dev_ctx.x_context(), in_data, out_data, shape, - starts_extension, ends_extension); PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU slice kernel return wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); + starts.size(), axes.size(), + platform::errors::InvalidArgument( + "The size of starts must be equal to the size of axes.")); + PADDLE_ENFORCE_EQ( + ends.size(), axes.size(), + platform::errors::InvalidArgument( + "The size of ends must be equal to the size of axes.")); + + // Step 2: Compute output + if (input_is_array) { + DealTensorArray(ctx, starts, ends, out_is_array); + return; + } else { + auto in = ctx.Input("Input"); + auto out = ctx.Output("Out"); + + auto in_dims = in->dims(); + auto out_dims = out->dims(); + auto slice_dims = out_dims; + + // 2.1 Infer output dims + for (size_t i = 0; i < axes.size(); ++i) { + // when start == -1 && end == start+1 + if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { + auto ret = + std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); + if (ret != decrease_axis.end()) { + ends[i] = in_dims[axes[i]]; + } + } + } + + phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends); + slice_dims = phi::funcs::GetSliceDims(in_dims, axes, starts, ends, + nullptr, nullptr); + out_dims = phi::funcs::GetDecreasedDims(slice_dims, decrease_axis); + + out->Resize(out_dims); + + // 2.2 Get output + size_t shape_size = in_dims.size(); + // the slice XPU kernel require that the length of `start`, `end` must be + // equal + // to the dims size of input tensor, therefore, if shape_size > + // axes.size(), the `starts_extension` and `ends_extension` is 
necessary. + std::vector starts_extension(shape_size, 0); + std::vector ends_extension(shape_size, 0); + if (shape_size > axes.size()) { + for (size_t i = 0; i < shape_size; ++i) { + ends_extension[i] = in_dims[i]; + } + for (size_t i = 0; i < axes.size(); ++i) { + starts_extension[axes[i]] = starts[i]; + ends_extension[axes[i]] = ends[i]; + } + } else { + starts_extension = std::move(starts); + ends_extension = std::move(ends); + } + + // prepare shape on XPU + std::vector shape(shape_size, 0); + for (size_t i = 0; i < shape_size; ++i) { + shape[i] = in_dims[i]; + } + + auto& dev_ctx = ctx.template device_context(); + const XPUType* in_data = reinterpret_cast(in->data()); + XPUType* out_data = + reinterpret_cast(out->mutable_data(ctx.GetPlace())); + int r = xpu::slice(dev_ctx.x_context(), in_data, out_data, shape, + starts_extension, ends_extension); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "slice"); + } } }; @@ -168,10 +257,7 @@ class SliceGradXPUKernel : public framework::OpKernel { reinterpret_cast(dinput->mutable_data(ctx.GetPlace())); int r = xpu::pad(dev_ctx.x_context(), dout_data, din_data, out_dims, pad_left, pad_right, XPUType(0)); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU pad kernel return wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "pad"); } }; } // namespace operators diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 247ff43b8a047..e01e2eb599b25 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -2,7 +2,7 @@ proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool) if(WITH_GPU) proto_library(external_error_proto SRCS external_error.proto) -endif(WITH_GPU) +endif() if(WITH_PYTHON) py_proto_compile(profiler_py_proto SRCS profiler.proto) add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E @@ -19,7 +19,7 @@ if(WITH_PYTHON) COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - else(NOT WIN32) + else() string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/") add_custom_command( @@ -31,7 +31,7 @@ if(WITH_PYTHON) COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - endif(NOT WIN32) + endif() endif() cc_library( diff --git a/paddle/fluid/platform/aligned_vector.h b/paddle/fluid/platform/aligned_vector.h deleted file mode 100644 index b42ae15405e7f..0000000000000 --- a/paddle/fluid/platform/aligned_vector.h +++ /dev/null @@ -1,77 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.1 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.1 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace platform { - -// Aligned vector generates vectorized load/store on CUDA. 
-template -struct alignas(sizeof(T) * Size) AlignedVector { - T val[Size]; - - HOSTDEVICE inline const T& operator[](int i) const { return val[i]; } - HOSTDEVICE inline T& operator[](int i) { return val[i]; } -}; - -template -HOSTDEVICE inline void Load(const T* addr, AlignedVector* vec) { - const AlignedVector* addr_vec = - reinterpret_cast*>(addr); - *vec = *addr_vec; -} - -template -HOSTDEVICE inline void Store(const AlignedVector& vec, T* addr) { - AlignedVector* addr_vec = - reinterpret_cast*>(addr); - *addr_vec = vec; -} - -/* - * Only the address of input data is the multiplier of 1,2,4, vectorized load - * with corresponding multiplier-value is possible. Moreover, the maximum length - * of vectorized load is 128 bits once. Hence, valid length of vectorized load - * shall be determined under both former constraints. - */ -template -int GetVectorizedSize(const T* pointer) { - constexpr int max_load_bits = 128; - int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); - uint64_t address = reinterpret_cast(pointer); - constexpr int vec8 = std::alignment_of>::value; // NOLINT - constexpr int vec4 = std::alignment_of>::value; // NOLINT - constexpr int vec2 = std::alignment_of>::value; // NOLINT - if (address % vec8 == 0) { - /* - * Currently, decide to deal with no more than 4 data once while adopting - * vectorization load/store, if performance test shows that dealing with - * 8 data once in vectorization load/store does get optimized, return code - * below can be changed into " return std::min(8, valid_vec_size); " . - */ - return std::min(4, valid_vec_size); - } else if (address % vec4 == 0) { - return std::min(4, valid_vec_size); - } else if (address % vec2 == 0) { - return std::min(2, valid_vec_size); - } else { - return 1; - } -} - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc index 1d5fe8c329f11..eeabd835ef348 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc @@ -27,26 +27,98 @@ Node *activation_op_handler(Graph *graph, Node *node, const std::string &type) { return new_node; } -Node *relu_handler(Graph *graph, Node *node) { - return activation_op_handler(graph, node, "popart_relu"); +Node *abs_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_abs"); } -Node *tanh_handler(Graph *graph, Node *node) { - return activation_op_handler(graph, node, "popart_tanh"); +Node *acos_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_acos"); +} + +Node *asin_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_asin"); +} + +Node *atan_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_atan"); +} + +Node *ceil_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_ceil"); +} + +Node *cos_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_cos"); +} + +Node *cosh_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_cosh"); +} + +Node *erf_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_erf"); +} + +Node *exp_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_exp"); +} + +Node 
*floor_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_floor"); } Node *log_handler(Graph *graph, Node *node) { return activation_op_handler(graph, node, "popart_log"); } +Node *reciprocal_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_reciprocal"); +} + +Node *relu_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_relu"); +} + +Node *round_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_round"); +} + Node *sigmoid_handler(Graph *graph, Node *node) { return activation_op_handler(graph, node, "popart_sigmoid"); } +Node *sign_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_sign"); +} + +Node *sin_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_sin"); +} + +Node *sinh_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_sinh"); +} + +Node *softplus_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_softplus"); +} + +Node *softsign_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_softsign"); +} + Node *sqrt_handler(Graph *graph, Node *node) { return activation_op_handler(graph, node, "popart_sqrt"); } +Node *tan_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_tan"); +} + +Node *tanh_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_tanh"); +} + Node *gelu_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto approximate_ = BOOST_GET_CONST(bool, op->GetAttr("approximate")); @@ -93,10 +165,28 @@ Node *log_softmax_handler(Graph *graph, Node *node) { } // namespace platform } // namespace paddle -REGISTER_HANDLER(relu, relu_handler); -REGISTER_HANDLER(tanh, tanh_handler); +REGISTER_HANDLER(abs, abs_handler); +REGISTER_HANDLER(acos, acos_handler); +REGISTER_HANDLER(asin, asin_handler); +REGISTER_HANDLER(atan, atan_handler); +REGISTER_HANDLER(ceil, ceil_handler); +REGISTER_HANDLER(cos, cos_handler); +REGISTER_HANDLER(cosh, cosh_handler); +REGISTER_HANDLER(erf, erf_handler); +REGISTER_HANDLER(exp, exp_handler); +REGISTER_HANDLER(floor, floor_handler); REGISTER_HANDLER(log, log_handler); +REGISTER_HANDLER(reciprocal, reciprocal_handler); +REGISTER_HANDLER(relu, relu_handler); +REGISTER_HANDLER(round, round_handler); REGISTER_HANDLER(sigmoid, sigmoid_handler); +REGISTER_HANDLER(sign, sign_handler); +REGISTER_HANDLER(sin, sin_handler); +REGISTER_HANDLER(sinh, sinh_handler); +REGISTER_HANDLER(softplus, softplus_handler); +REGISTER_HANDLER(softsign, softsign_handler); REGISTER_HANDLER(sqrt, sqrt_handler); +REGISTER_HANDLER(tan, tan_handler); +REGISTER_HANDLER(tanh, tanh_handler); REGISTER_HANDLER(gelu, gelu_handler); REGISTER_HANDLER(log_softmax, log_softmax_handler); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index d990aab57736d..8928f5417d1a5 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -11,13 +11,21 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
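Each activation handler added in the activation_ops.cc hunk above is a one-line forwarder to activation_op_handler, so the file effectively maintains a Paddle-op-name to popart-op-name mapping. A sketch of that mapping as a plain table, illustrative only; the real code keeps individual handlers registered through REGISTER_HANDLER so that ops such as gelu can still read their own attributes (e.g. "approximate") before lowering.

#include <cassert>
#include <map>
#include <string>

// Hypothetical table equivalent to the one-line handlers above.
const std::map<std::string, std::string>& ActivationToPopart() {
  static const std::map<std::string, std::string> table = {
      {"abs", "popart_abs"},         {"acos", "popart_acos"},
      {"exp", "popart_exp"},         {"relu", "popart_relu"},
      {"sigmoid", "popart_sigmoid"}, {"tanh", "popart_tanh"},
      // ... one entry per REGISTER_HANDLER call in the hunk above.
  };
  return table;
}

int main() {
  assert(ActivationToPopart().at("relu") == "popart_relu");
  return 0;
}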
*/ + #include "paddle/fluid/platform/device_context.h" #include #include #include +#include "glog/logging.h" +#include "paddle/fluid/framework/expect.h" +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/stream/cuda_stream.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/allocator.h" @@ -26,17 +34,11 @@ limitations under the License. */ #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" #include "paddle/fluid/platform/cuda_device_guard.h" #endif + #ifdef PADDLE_WITH_MLU #include "paddle/fluid/platform/device/mlu/device_context.h" #include "paddle/fluid/platform/device/mlu/device_context_allocator.h" #endif -#include "glog/logging.h" -#include "paddle/fluid/framework/expect.h" -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/memory/allocation/allocator_facade.h" -#include "paddle/fluid/platform/device/device_wrapper.h" -#include "paddle/fluid/platform/profiler.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace memory { @@ -178,75 +180,89 @@ void DeviceContextPool::SetDeviceContexts( } template -inline void EmplaceDeviceContext( - std::map>>* - map_ptr, - platform::Place p) { +std::unique_ptr CreateDeviceContext( + const platform::Place& p, + bool disable_setting_default_stream_for_allocator = false) { using PtrType = std::unique_ptr; - map_ptr->emplace( - p, std::async(std::launch::deferred, [=] { - // lazy evaluation. i.e., only create device context at - // first `Get` - auto* dev_ctx = new DevCtx(p); - if (is_gpu_place(p)) { + auto* dev_ctx = new DevCtx(p); + if (is_gpu_place(p)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto* cuda_ctx = dynamic_cast(dev_ctx); - PADDLE_ENFORCE_NOT_NULL( - cuda_ctx, - platform::errors::InvalidArgument( - "Failed to dynamic_cast dev_ctx into CUDADeviceContext.")); - dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance() - .GetAllocator(p) - .get()); - dev_ctx->SetPinnedAllocator( - memory::allocation::AllocatorFacade::Instance() - .GetAllocator(paddle::platform::CUDAPinnedPlace()) - .get()); - - cuda_ctx->PartialInitWithAllocator(); - dev_ctx->SetGenerator( - framework::DefaultCUDAGenerator(p.GetDeviceId()).get()); -#endif - } else { - dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance() - .GetAllocator(p) - .get()); - dev_ctx->SetGenerator(framework::DefaultCPUGenerator().get()); - } - dev_ctx->SetHostGenerator(framework::DefaultCPUGenerator().get()); - dev_ctx->SetHostAllocator( - memory::allocation::AllocatorFacade::Instance() - .GetAllocator(platform::CPUPlace()) - .get()); - dev_ctx->SetZeroAllocator( - memory::allocation::AllocatorFacade::Instance() - .GetZeroAllocator(p) - .get()); - return PtrType(dev_ctx); - })); + auto* cuda_ctx = dynamic_cast(dev_ctx); + PADDLE_ENFORCE_NOT_NULL( + cuda_ctx, + platform::errors::InvalidArgument( + "Failed to dynamic_cast dev_ctx into CUDADeviceContext.")); + + auto& instance = memory::allocation::AllocatorFacade::Instance(); + if (!disable_setting_default_stream_for_allocator) { + instance.SetDefaultStream(CUDAPlace(p.GetDeviceId()), cuda_ctx->stream()); + } + dev_ctx->SetAllocator(instance.GetAllocator(p).get()); + dev_ctx->SetPinnedAllocator( + 
instance.GetAllocator(paddle::platform::CUDAPinnedPlace()).get()); + + cuda_ctx->PartialInitWithAllocator(); + dev_ctx->SetGenerator( + framework::DefaultCUDAGenerator(p.GetDeviceId()).get()); +#endif + } else { + dev_ctx->SetAllocator( + memory::allocation::AllocatorFacade::Instance().GetAllocator(p).get()); + dev_ctx->SetGenerator(framework::DefaultCPUGenerator().get()); + } + dev_ctx->SetHostGenerator(framework::DefaultCPUGenerator().get()); + dev_ctx->SetHostAllocator(memory::allocation::AllocatorFacade::Instance() + .GetAllocator(platform::CPUPlace()) + .get()); + dev_ctx->SetZeroAllocator(memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(p) + .get()); + return PtrType(dev_ctx); } -DeviceContextPool::DeviceContextPool( - const std::vector& places) { +template +inline void EmplaceDeviceContext( + std::map>>* + place_to_device_context, + platform::Place place, bool disable_setting_default_stream_for_allocator) { + // lazy evaluation. i.e., only create device context at first `Get` + place_to_device_context->emplace( + place, std::async(std::launch::deferred, CreateDeviceContext, + place, disable_setting_default_stream_for_allocator)); +} + +void EmplaceDeviceContexts( + std::map>>* + place_to_device_context, + const std::vector& places, + bool disable_setting_default_stream_for_allocator) { PADDLE_ENFORCE_GT( places.size(), 0, platform::errors::InvalidArgument("The number of platform places should " "be larger than 0. But received %d.", places.size())); + std::set set; for (auto& p : places) { set.insert(p); } + for (auto& p : set) { if (platform::is_cpu_place(p)) { #ifdef PADDLE_WITH_MKLDNN - EmplaceDeviceContext(&device_contexts_, p); + EmplaceDeviceContext( + place_to_device_context, p, + disable_setting_default_stream_for_allocator); #else - EmplaceDeviceContext(&device_contexts_, p); + EmplaceDeviceContext( + place_to_device_context, p, + disable_setting_default_stream_for_allocator); #endif } else if (platform::is_gpu_place(p)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - EmplaceDeviceContext(&device_contexts_, p); + EmplaceDeviceContext( + place_to_device_context, p, + disable_setting_default_stream_for_allocator); #else PADDLE_THROW( platform::errors::Unimplemented("CUDAPlace is not supported. Please " @@ -254,7 +270,9 @@ DeviceContextPool::DeviceContextPool( #endif } else if (platform::is_cuda_pinned_place(p)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - EmplaceDeviceContext(&device_contexts_, p); + EmplaceDeviceContext( + place_to_device_context, p, + disable_setting_default_stream_for_allocator); #else PADDLE_THROW(platform::errors::Unimplemented( "CUDAPlace is not supported. Please re-compile with WITH_GPU " @@ -262,7 +280,9 @@ DeviceContextPool::DeviceContextPool( #endif } else if (platform::is_xpu_place(p)) { #ifdef PADDLE_WITH_XPU - EmplaceDeviceContext(&device_contexts_, p); + EmplaceDeviceContext( + place_to_device_context, p, + disable_setting_default_stream_for_allocator); #else PADDLE_THROW( platform::errors::Unimplemented("XPUPlace is not supported. Please " @@ -270,7 +290,9 @@ DeviceContextPool::DeviceContextPool( #endif } else if (platform::is_mlu_place(p)) { #ifdef PADDLE_WITH_MLU - EmplaceDeviceContext(&device_contexts_, p); + EmplaceDeviceContext( + place_to_device_context, p, + disable_setting_default_stream_for_allocator); #else PADDLE_THROW( platform::errors::Unimplemented("MLUPlace is not supported. 
Please " @@ -278,7 +300,9 @@ DeviceContextPool::DeviceContextPool( #endif } else if (platform::is_ipu_place(p)) { #ifdef PADDLE_WITH_IPU - EmplaceDeviceContext(&device_contexts_, p); + EmplaceDeviceContext( + place_to_device_context, p, + disable_setting_default_stream_for_allocator); #else PADDLE_THROW( platform::errors::Unimplemented("IPUPlace is not supported. Please " @@ -286,7 +310,9 @@ DeviceContextPool::DeviceContextPool( #endif } else if (platform::is_npu_place(p)) { #ifdef PADDLE_WITH_ASCEND_CL - EmplaceDeviceContext(&device_contexts_, p); + EmplaceDeviceContext( + place_to_device_context, p, + disable_setting_default_stream_for_allocator); #else PADDLE_THROW(platform::errors::Unimplemented( "NPUPlace is not supported. Please " @@ -294,7 +320,9 @@ DeviceContextPool::DeviceContextPool( #endif } else if (platform::is_npu_pinned_place(p)) { #ifdef PADDLE_WITH_ASCEND_CL - EmplaceDeviceContext(&device_contexts_, p); + EmplaceDeviceContext( + place_to_device_context, p, + disable_setting_default_stream_for_allocator); #else PADDLE_THROW(platform::errors::Unimplemented( "NPUPinnedPlace is not supported. Please re-compile with " @@ -303,7 +331,9 @@ DeviceContextPool::DeviceContextPool( #endif } else if (platform::is_custom_place(p)) { #ifdef PADDLE_WITH_CUSTOM_DEVICE - EmplaceDeviceContext(&device_contexts_, p); + EmplaceDeviceContext( + place_to_device_context, p, + disable_setting_default_stream_for_allocator); #else PADDLE_THROW(platform::errors::Unimplemented( "CustomPlace is not supported. Please re-compile with " @@ -314,6 +344,12 @@ DeviceContextPool::DeviceContextPool( } } +DeviceContextPool::DeviceContextPool( + const std::vector& places) { + EmplaceDeviceContexts(&device_contexts_, places, + /*disable_setting_default_stream_for_allocator=*/false); +} + CPUDeviceContext::CPUDeviceContext() : phi::CPUContext() { phi::CPUContext::Init(); } @@ -556,10 +592,6 @@ CUDAContext::~CUDAContext() { CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) { phi::GPUContext::PartialInitWithoutAllocator(); cuda_stream_.reset(new stream::CUDAStream(phi::GPUContext::stream(), place)); - auto& instance = memory::allocation::AllocatorFacade::Instance(); - instance.SetDefaultStream(place, phi::GPUContext::stream()); - workspace_.reset(new phi::DnnWorkspaceHandle( - instance.GetAllocator(place).get(), stream())); } CUDADeviceContext::~CUDADeviceContext() = default; diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 1855f43f9d6cf..9ba9307d289eb 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -645,7 +645,6 @@ class CUDADeviceContext : public phi::GPUContext { // NOTE: Just for compatibility with the past, please delete if there is an // elegant way. std::unique_ptr cuda_stream_; - std::unique_ptr workspace_{nullptr}; DISABLE_COPY_AND_ASSIGN(CUDADeviceContext); }; @@ -883,11 +882,15 @@ struct DefaultDeviceContextType { }; #endif +void EmplaceDeviceContexts( + std::map>>* + place_to_device_context, + const std::vector& places, + bool disable_setting_default_stream_for_allocator); + /*! 
\brief device context pool singleton */ class DeviceContextPool { public: - explicit DeviceContextPool(const std::vector& places); - static DeviceContextPool& Instance() { PADDLE_ENFORCE_NOT_NULL(pool, platform::errors::PreconditionNotMet( @@ -925,6 +928,8 @@ class DeviceContextPool { std::shared_future>>*); private: + explicit DeviceContextPool(const std::vector& places); + static DeviceContextPool* pool; std::map>> device_contexts_; diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index bba0ad35e0216..fbccfe5265a71 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -42,10 +42,14 @@ if(TENSORRT_FOUND) list(APPEND CUDA_SRCS tensorrt.cc) endif() +if(CUSPARSELT_FOUND) + list(APPEND CUDA_SRCS cusparseLt.cc) +endif() + configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h) if(CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) -endif(CUPTI_FOUND) +endif() if(WITH_ROCM) hip_library( dynload_cuda diff --git a/paddle/fluid/platform/dynload/cusparseLt.cc b/paddle/fluid/platform/dynload/cusparseLt.cc new file mode 100644 index 0000000000000..ae2aec012b7b7 --- /dev/null +++ b/paddle/fluid/platform/dynload/cusparseLt.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/cusparseLt.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +#ifdef CUSPARSELT_ROUTINE_EACH +CUSPARSELT_ROUTINE_EACH(DEFINE_WRAP); +#endif + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/cusparseLt.h b/paddle/fluid/platform/dynload/cusparseLt.h new file mode 100644 index 0000000000000..feb13ec63c1f0 --- /dev/null +++ b/paddle/fluid/platform/dynload/cusparseLt.h @@ -0,0 +1,60 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/cusparseLt.h" + +namespace paddle { +namespace platform { +namespace dynload { + +#define PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSELT_WRAP(__name) \ + using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ + extern DynLoad__##__name __name + +#if defined(PADDLE_WITH_CUDA) +#if CUDA_VERSION >= 11020 +#define CUSPARSELT_ROUTINE_EACH(__macro) \ + __macro(cusparseLtInit); \ + __macro(cusparseLtDestroy); \ + __macro(cusparseLtDenseDescriptorInit); \ + __macro(cusparseLtStructuredDescriptorInit); \ + __macro(cusparseLtMatmulDescriptorInit); \ + __macro(cusparseLtMatmulDescSetAttribute); \ + __macro(cusparseLtMatmulAlgSelectionInit); \ + __macro(cusparseLtMatmulAlgSetAttribute); \ + __macro(cusparseLtMatmulGetWorkspace); \ + __macro(cusparseLtMatmulPlanInit); \ + __macro(cusparseLtMatDescriptorDestroy); \ + __macro(cusparseLtSpMMACompressedSize2); \ + __macro(cusparseLtSpMMACompress2); \ + __macro(cusparseLtMatmulSearch); \ + __macro(cusparseLtMatmulAlgGetAttribute); \ + __macro(cusparseLtMatmulPlanDestroy); \ + __macro(cusparseLtMatmul); \ + __macro(cusparseGetErrorString); + +CUSPARSELT_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSELT_WRAP); +#endif +#endif + +#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSELT_WRAP +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 2f24e1b87daba..b64bf81dc0d05 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -72,6 +72,10 @@ void* GetCUFFTDsoHandle() { return phi::dynload::GetCUFFTDsoHandle(); } void* GetMKLRTDsoHandle() { return phi::dynload::GetMKLRTDsoHandle(); } +void* GetCusparseLtDsoHandle() { + return phi::dynload::GetCusparseLtDsoHandle(); +} + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index ca60cd76a59e1..50714dfb302eb 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -46,6 +46,7 @@ void* GetNvtxDsoHandle(); void* GetCUFFTDsoHandle(); void* GetMKLRTDsoHandle(); void* GetROCFFTDsoHandle(); +void* GetCusparseLtDsoHandle(); void SetPaddleLibPath(const std::string&); } // namespace dynload diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index f728a820bd73c..72d343692df73 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -27,7 +27,7 @@ limitations under the License. 
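The cusparseLt wrappers above follow Paddle's usual dynload pattern: each __name in CUSPARSELT_ROUTINE_EACH becomes a callable that opens the shared library and resolves the symbol lazily, on first use. A generic, platform-dependent sketch of that idea using POSIX dlopen/dlsym; the libm example in main exists only to make the snippet runnable and has nothing to do with cuSPARSELt, and none of these names are Paddle's loader API.

#include <dlfcn.h>  // POSIX only; link with -ldl on older glibc
#include <mutex>
#include <stdexcept>

// Hypothetical stand-in for a DynLoad__ wrapper: resolve `name` from `lib`
// exactly once, then forward every call to the resolved function pointer.
template <typename FuncPtr>
class LazySymbol {
 public:
  LazySymbol(const char* lib, const char* name) : lib_(lib), name_(name) {}

  template <typename... Args>
  auto operator()(Args... args) {
    std::call_once(flag_, [this] {
      void* handle = dlopen(lib_, RTLD_LAZY | RTLD_GLOBAL);
      if (handle == nullptr) throw std::runtime_error("cannot open library");
      fn_ = reinterpret_cast<FuncPtr>(dlsym(handle, name_));
      if (fn_ == nullptr) throw std::runtime_error("cannot resolve symbol");
    });
    return fn_(args...);
  }

 private:
  const char* lib_;
  const char* name_;
  std::once_flag flag_;
  FuncPtr fn_ = nullptr;
};

int main() {
  // Stand-in usage: resolve cos() from libm the same way cusparseLtInit and
  // friends would be resolved from the cuSPARSELt shared library.
  LazySymbol<double (*)(double)> my_cos("libm.so.6", "cos");
  return my_cos(0.0) == 1.0 ? 0 : 1;
}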
*/ namespace paddle { namespace platform { -static const char* kSchemaVersion = "1.0.0"; +static const char* kSchemaVersion = "1.0.1"; static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.json"; static uint32_t span_indx = 0; @@ -37,14 +37,6 @@ static std::string DefaultFileName() { GetStringFormatLocalTime().c_str()); } -const char* ChromeTracingLogger::categary_name_[] = { - "Operator", "Dataloader", "ProfileStep", - "CudaRuntime", "Kernel", "Memcpy", - "Memset", "UserDefined", "OperatorInner", - "Forward", "Backward", "Optimization", - "Communication", "PythonOp", "PythonUserDefined", - "MluRuntime"}; - void ChromeTracingLogger::OpenFile() { output_file_stream_.open(filename_, std::ofstream::out | std::ofstream::trunc); @@ -116,10 +108,41 @@ void ChromeTracingLogger::LogNodeTrees(const NodeTrees& node_trees) { (*devicenode)->LogMe(this); } } + for (auto memnode = (*hostnode)->GetMemTraceEventNodes().begin(); + memnode != (*hostnode)->GetMemTraceEventNodes().end(); ++memnode) { + (*memnode)->LogMe(this); + } } } } +void ChromeTracingLogger::LogMemTraceEventNode( + const MemTraceEventNode& mem_node) { + if (!output_file_stream_) { + return; + } + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "[memory]", "pid": %lld, "tid": "%lld", + "ts": %lld, + "ph": "i", "cat": "%s", + "args": { + "place": "%s", + "addr": "%llu", + "current_allocated": %llu, + "current_reserved": %llu, + "increase_bytes": %lld + } + }, + )JSON"), + mem_node.ProcessId(), mem_node.ThreadId(), mem_node.TimeStampNs(), + StringTracerMemEventType(mem_node.Type()), mem_node.Place().c_str(), + mem_node.Addr(), mem_node.CurrentAllocated(), mem_node.CurrentReserved(), + mem_node.IncreaseBytes()); +} + void ChromeTracingLogger::LogHostTraceEventNode( const HostTraceEventNode& host_node) { if (!output_file_stream_) { @@ -132,6 +155,16 @@ void ChromeTracingLogger::LogHostTraceEventNode( } else { dur_display = string_format(std::string("%.3f us"), dur * 1000); } + std::map>> input_shapes; + std::map> input_dtypes; + std::string callstack; + OperatorSupplementEventNode* op_supplement_node = + host_node.GetOperatorSupplementEventNode(); + if (op_supplement_node != nullptr) { + input_shapes = op_supplement_node->InputShapes(); + input_dtypes = op_supplement_node->Dtypes(); + callstack = op_supplement_node->CallStack(); + } switch (host_node.Type()) { case TracerEventType::ProfileStep: case TracerEventType::Forward: @@ -159,10 +192,48 @@ void ChromeTracingLogger::LogHostTraceEventNode( host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(), host_node.ThreadId(), nsToUs(host_node.StartNs()), nsToUsFloat(host_node.Duration()), - categary_name_[static_cast(host_node.Type())], + StringTracerEventType(host_node.Type()), nsToUsFloat(host_node.StartNs(), start_time_), nsToUsFloat(host_node.EndNs(), start_time_)); break; + + case TracerEventType::Operator: + + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "%s[%s]", "pid": %lld, "tid": "%lld(C++)", + "ts": %lld, "dur": %.3f, + "ph": "X", "cat": "%s", + "cname": "thread_state_runnable", + "args": { + "start_time": "%.3f us", + "end_time": "%.3f us", + "input_shapes": %s, + "input_dtypes": %s, + "callstack": "%s" + } + }, + )JSON"), + host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(), + host_node.ThreadId(), nsToUs(host_node.StartNs()), + nsToUsFloat(host_node.Duration()), + StringTracerEventType(host_node.Type()), + nsToUsFloat(host_node.StartNs(), start_time_), + 
nsToUsFloat(host_node.EndNs(), start_time_), + json_dict(input_shapes).c_str(), json_dict(input_dtypes).c_str(), + callstack.c_str()); + break; + case TracerEventType::CudaRuntime: + case TracerEventType::Kernel: + case TracerEventType::Memcpy: + case TracerEventType::Memset: + case TracerEventType::UserDefined: + case TracerEventType::OperatorInner: + case TracerEventType::Communication: + case TracerEventType::MluRuntime: + case TracerEventType::NumTypes: default: output_file_stream_ << string_format( std::string( @@ -181,7 +252,7 @@ void ChromeTracingLogger::LogHostTraceEventNode( host_node.Name().c_str(), dur_display.c_str(), host_node.ProcessId(), host_node.ThreadId(), nsToUs(host_node.StartNs()), nsToUsFloat(host_node.Duration()), - categary_name_[static_cast(host_node.Type())], + StringTracerEventType(host_node.Type()), nsToUsFloat(host_node.StartNs(), start_time_), nsToUsFloat(host_node.EndNs(), start_time_)); break; @@ -220,8 +291,7 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( runtime_node.Name().c_str(), dur_display.c_str(), runtime_node.ProcessId(), runtime_node.ThreadId(), nsToUs(runtime_node.StartNs()), nsToUsFloat(runtime_node.Duration()), - categary_name_[static_cast(runtime_node.Type())], - runtime_node.CorrelationId(), + StringTracerEventType(runtime_node.Type()), runtime_node.CorrelationId(), nsToUsFloat(runtime_node.StartNs(), start_time_), nsToUsFloat(runtime_node.EndNs(), start_time_)); pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()}); @@ -347,7 +417,7 @@ void ChromeTracingLogger::HandleTypeKernel( device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(), device_node.StreamId(), nsToUs(device_node.StartNs()), nsToUsFloat(device_node.Duration()), - categary_name_[static_cast(device_node.Type())], + StringTracerEventType(device_node.Type()), nsToUsFloat(device_node.StartNs(), start_time_), nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(), device_node.ContextId(), device_node.StreamId(), @@ -391,7 +461,7 @@ void ChromeTracingLogger::HandleTypeMemcpy( device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(), device_node.StreamId(), nsToUs(device_node.StartNs()), nsToUsFloat(device_node.Duration()), - categary_name_[static_cast(device_node.Type())], + StringTracerEventType(device_node.Type()), nsToUsFloat(device_node.StartNs(), start_time_), nsToUsFloat(device_node.EndNs(), start_time_), device_node.StreamId(), device_node.CorrelationId(), memcpy_info.num_bytes, memory_bandwidth); @@ -427,7 +497,7 @@ void ChromeTracingLogger::HandleTypeMemset( device_node.Name().c_str(), dur_display.c_str(), device_node.DeviceId(), device_node.StreamId(), nsToUs(device_node.StartNs()), nsToUsFloat(device_node.Duration()), - categary_name_[static_cast(device_node.Type())], + StringTracerEventType(device_node.Type()), nsToUsFloat(device_node.StartNs(), start_time_), nsToUsFloat(device_node.EndNs(), start_time_), device_node.DeviceId(), device_node.ContextId(), device_node.StreamId(), diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.h b/paddle/fluid/platform/profiler/chrometracing_logger.h index 12d98d1ef0c63..3cbf9ccf6a0cc 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.h +++ b/paddle/fluid/platform/profiler/chrometracing_logger.h @@ -37,6 +37,7 @@ class ChromeTracingLogger : public BaseLogger { void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override; void LogNodeTrees(const NodeTrees&) override; void LogMetaInfo(const std::unordered_map); + void 
LogMemTraceEventNode(const MemTraceEventNode&) override; private: void OpenFile(); diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc index 5253ecc505dbb..002071de0d1ef 100644 --- a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc @@ -27,7 +27,9 @@ using paddle::platform::HostTraceEventNode; using paddle::platform::KernelEventInfo; using paddle::platform::MemcpyEventInfo; using paddle::platform::MemsetEventInfo; +using paddle::platform::MemTraceEvent; using paddle::platform::NodeTrees; +using paddle::platform::OperatorSupplementEvent; using paddle::platform::ProfilerResult; using paddle::platform::RuntimeTraceEvent; using paddle::platform::SerializationLogger; @@ -37,6 +39,8 @@ TEST(SerializationLoggerTest, dump_case0) { std::list host_events; std::list runtime_events; std::list device_events; + std::list mem_events; + std::list op_supplement_events; host_events.push_back(HostTraceEvent(std::string("dataloader#1"), TracerEventType::Dataloader, 1000, 10000, 10, 10)); @@ -72,7 +76,8 @@ TEST(SerializationLoggerTest, dump_case0) { DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000, 69000, 0, 10, 11, 5, MemsetEventInfo())); SerializationLogger logger("test_serialization_logger_case0.pb"); - NodeTrees tree(host_events, runtime_events, device_events); + NodeTrees tree(host_events, runtime_events, device_events, mem_events, + op_supplement_events); std::map> nodes = tree.Traverse(true); EXPECT_EQ(nodes[10].size(), 4u); @@ -101,6 +106,8 @@ TEST(SerializationLoggerTest, dump_case1) { std::list host_events; std::list runtime_events; std::list device_events; + std::list mem_events; + std::list op_supplement_events; runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, 17000, 10, 10, 1, 0)); runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000, @@ -127,7 +134,8 @@ TEST(SerializationLoggerTest, dump_case1) { DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000, 69000, 0, 10, 11, 5, MemsetEventInfo())); SerializationLogger logger("test_serialization_logger_case1.pb"); - NodeTrees tree(host_events, runtime_events, device_events); + NodeTrees tree(host_events, runtime_events, device_events, mem_events, + op_supplement_events); std::map> nodes = tree.Traverse(true); EXPECT_EQ(nodes[10].size(), 1u); diff --git a/paddle/fluid/platform/profiler/event_node.cc b/paddle/fluid/platform/profiler/event_node.cc index e1af63ad8909c..ca555d9c7b928 100644 --- a/paddle/fluid/platform/profiler/event_node.cc +++ b/paddle/fluid/platform/profiler/event_node.cc @@ -18,6 +18,8 @@ limitations under the License. 
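For reference, the LogMemTraceEventNode method added to the Chrome tracing logger above writes one "instant" event ("ph": "i") per recorded memory event. The sketch below only shows the resulting JSON shape, printed with the same raw-string style the logger uses; every concrete value, including the "cat" string and the place format, is a placeholder rather than real profiler output.

#include <cstdio>

int main() {
  // Field layout mirrors the R"JSON(...)" template in LogMemTraceEventNode.
  std::printf(R"JSON(
  {
    "name": "[memory]", "pid": 4242, "tid": "4243",
    "ts": 1050,
    "ph": "i", "cat": "Allocate",
    "args": {
      "place": "Place(gpu:0)",
      "addr": "139876543210496",
      "current_allocated": 4096,
      "current_reserved": 8192,
      "increase_bytes": 1024
    }
  },
)JSON");
  return 0;
}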
*/ #include #include +#include "paddle/fluid/platform/profiler/utils.h" + namespace paddle { namespace platform { @@ -50,8 +52,10 @@ NodeTrees::~NodeTrees() { void NodeTrees::BuildTrees( const std::vector& host_event_nodes, - std::vector& runtime_event_nodes, - const std::vector& device_event_nodes) { + const std::vector& runtime_event_nodes, + const std::vector& device_event_nodes, + const std::vector& mem_event_nodes, + const std::vector& op_supplement_events) { // separate Host Event Nodes into different threads std::map> thread2host_event_nodes; // used to store HostTraceEventNodes per thread @@ -59,6 +63,15 @@ void NodeTrees::BuildTrees( thread2runtime_event_nodes; // used to store CudaRuntimeTraceEventNode // per // thread + std::map> + thread2mem_event_nodes; // used to store MemTraceEventNode + // per + // thread + std::map> + thread2op_supplement_event_nodes; // used to store + // OperatorSupplementEventNode + // per + // thread std::map correlation_id2runtime_event_node; // used to store the relation between // correlation id and runtime node @@ -85,6 +98,15 @@ void NodeTrees::BuildTrees( "no corresponding cuda runtime events")); dst_iter->second->AddDeviceTraceEventNode(*it); } + // construct thread2mem_event_nodes + for (auto it = mem_event_nodes.begin(); it != mem_event_nodes.end(); ++it) { + thread2mem_event_nodes[(*it)->ThreadId()].push_back(*it); + } + // construct thread2op_supplement_event_nodes + for (auto it = op_supplement_events.begin(); it != op_supplement_events.end(); + ++it) { + thread2op_supplement_event_nodes[(*it)->ThreadId()].push_back(*it); + } // sort host event nodes and runtime event nodes according to start_ns and // end_ns // the smaller start_ns is, the further ahead position is. @@ -119,6 +141,29 @@ void NodeTrees::BuildTrees( return false; }); } + // sort mem event nodes and operator supplement event nodes + for (auto it = thread2mem_event_nodes.begin(); + it != thread2mem_event_nodes.end(); ++it) { + std::sort(it->second.begin(), it->second.end(), + [](MemTraceEventNode* node1, MemTraceEventNode* node2) { + if (node1->TimeStampNs() <= node2->TimeStampNs()) { + return true; + } + return false; + }); + } + + for (auto it = thread2op_supplement_event_nodes.begin(); + it != thread2op_supplement_event_nodes.end(); ++it) { + std::sort(it->second.begin(), it->second.end(), + [](OperatorSupplementEventNode* node1, + OperatorSupplementEventNode* node2) { + if (node1->TimeStampNs() <= node2->TimeStampNs()) { + return true; + } + return false; + }); + } // construct trees std::set thread_set; @@ -131,16 +176,27 @@ void NodeTrees::BuildTrees( it != thread2runtime_event_nodes.end(); ++it) { thread_set.insert(it->first); } + for (auto it = thread2mem_event_nodes.begin(); + it != thread2mem_event_nodes.end(); ++it) { + thread_set.insert(it->first); + } + for (auto it = thread2op_supplement_event_nodes.begin(); + it != thread2op_supplement_event_nodes.end(); ++it) { + thread_set.insert(it->first); + } for (auto it = thread_set.begin(); it != thread_set.end(); ++it) { thread_event_trees_map_[*it] = BuildTreeRelationship( - thread2host_event_nodes[*it], thread2runtime_event_nodes[*it]); + thread2host_event_nodes[*it], thread2runtime_event_nodes[*it], + thread2mem_event_nodes[*it], thread2op_supplement_event_nodes[*it]); } } HostTraceEventNode* NodeTrees::BuildTreeRelationship( std::vector host_event_nodes, - std::vector runtime_event_nodes) { + std::vector runtime_event_nodes, + std::vector mem_event_nodes, + std::vector op_supplement_events) { // a stack used for 
analyse relationship auto node_stack = std::vector(); // root node, top level @@ -226,6 +282,99 @@ HostTraceEventNode* NodeTrees::BuildTreeRelationship( } node_stack.pop_back(); } + + // build relationship between host event node and mem event node + // First, post-order traverse the tree. Then, insert the memory and op + // supplement node into correct host nodes. + auto stack = std::stack(); + auto flag_stack = std::stack(); + auto post_order_nodes = std::vector(); + stack.push(root_node); + flag_stack.push(0); + while (!stack.empty()) { + auto current_node = stack.top(); + stack.pop(); + auto flag = flag_stack.top(); + flag_stack.pop(); + if (flag == 0) { + stack.push(current_node); + flag_stack.push(1); + for (auto child = current_node->GetChildren().rbegin(); + child != current_node->GetChildren().rend(); ++child) { + stack.push(*child); + flag_stack.push(0); + } + } else { + post_order_nodes.push_back(current_node); + } + } + + for (auto it = post_order_nodes.begin(); it < post_order_nodes.end(); ++it) { + bool hasenter = false; + std::vector::iterator firstposition; + std::vector::iterator lastposition = + mem_event_nodes.end(); + for (auto mem_it = mem_event_nodes.begin(); mem_it < mem_event_nodes.end(); + ++mem_it) { + if ((*mem_it)->TimeStampNs() >= (*it)->StartNs() && + (*mem_it)->TimeStampNs() <= (*it)->EndNs()) { + (*it)->AddMemNode(*mem_it); + if (!hasenter) { + firstposition = mem_it; + hasenter = true; + } + } else { + if ((*mem_it)->TimeStampNs() > (*it)->EndNs()) { + lastposition = mem_it; + break; + } + } + } + if (hasenter) { + mem_event_nodes.erase(firstposition, lastposition); + } + } + + // build relationship between host event node and op supplement node + for (auto it = post_order_nodes.begin(); it < post_order_nodes.end(); ++it) { + int op_supplement_count = 0; + bool hasenter = false; + std::vector::iterator firstposition; + std::vector::iterator lastposition = + op_supplement_events.end(); + for (auto op_supplement_it = op_supplement_events.begin(); + op_supplement_it < op_supplement_events.end(); ++op_supplement_it) { + if ((*op_supplement_it)->TimeStampNs() >= (*it)->StartNs() && + (*op_supplement_it)->TimeStampNs() <= (*it)->EndNs()) { + if (!hasenter) { + firstposition = op_supplement_it; + hasenter = true; + } + (*it)->SetOperatorSupplementNode(*op_supplement_it); + PADDLE_ENFORCE_EQ((*it)->Type(), TracerEventType::Operator, + platform::errors::PreconditionNotMet( + "Operator supplement events should be embraced " + "by event of type TracerEventType::Operator, " + "but got type TracerEventType::%s", + StringTracerEventType((*it)->Type()))); + op_supplement_count += 1; + } else { + if ((*op_supplement_it)->TimeStampNs() > (*it)->EndNs()) { + PADDLE_ENFORCE_LE(op_supplement_count, 1, + platform::errors::PreconditionNotMet( + "One event of TracerEventType::Operator has no " + "more than 1 op supplement event, but got %d.", + op_supplement_count)); + lastposition = op_supplement_it; + break; + } + } + } + if (hasenter) { + op_supplement_events.erase(firstposition, lastposition); + } + } + return root_node; } @@ -263,8 +412,8 @@ std::map> NodeTrees::Traverse( auto current_node = stack.top(); stack.pop(); thread2host_event_nodes[thread_id].push_back(current_node); - for (auto child = current_node->GetChildren().begin(); - child != current_node->GetChildren().end(); ++child) { + for (auto child = current_node->GetChildren().rbegin(); + child != current_node->GetChildren().rend(); ++child) { stack.push(*child); } } @@ -278,7 +427,10 @@ void 
NodeTrees::LogMe(BaseLogger* logger) { logger->LogNodeTrees(*this); } void NodeTrees::HandleTrees( std::function host_event_node_handle, std::function runtime_event_node_handle, - std::function device_event_node_handle) { + std::function device_event_node_handle, + std::function mem_event_node_handle, + std::function + op_supplement_node_handle) { // using different user-defined function to handle different nodes const std::map> thread2host_event_nodes = Traverse(true); @@ -300,6 +452,15 @@ void NodeTrees::HandleTrees( device_event_node_handle(*devicenode); } } + for (auto memeventnode = (*hostnode)->GetMemTraceEventNodes().begin(); + memeventnode != (*hostnode)->GetMemTraceEventNodes().end(); + ++memeventnode) { + mem_event_node_handle(*memeventnode); + } + if ((*hostnode)->GetOperatorSupplementEventNode()) { + op_supplement_node_handle( + (*hostnode)->GetOperatorSupplementEventNode()); + } } } } diff --git a/paddle/fluid/platform/profiler/event_node.h b/paddle/fluid/platform/profiler/event_node.h index 3e589b0be2e04..acd5a03109f72 100644 --- a/paddle/fluid/platform/profiler/event_node.h +++ b/paddle/fluid/platform/profiler/event_node.h @@ -21,12 +21,67 @@ limitations under the License. */ #include #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler/output_logger.h" #include "paddle/fluid/platform/profiler/trace_event.h" namespace paddle { namespace platform { +class MemTraceEventNode { + public: + // constructor + explicit MemTraceEventNode(const MemTraceEvent& mem_event) + : mem_event_(mem_event) {} + + // destructor + ~MemTraceEventNode(); + + // getter + TracerMemEventType Type() const { return mem_event_.type; } + uint64_t Addr() const { return mem_event_.addr; } + uint64_t TimeStampNs() const { return mem_event_.timestamp_ns; } + uint64_t ProcessId() const { return mem_event_.process_id; } + uint64_t ThreadId() const { return mem_event_.thread_id; } + int64_t IncreaseBytes() const { return mem_event_.increase_bytes; } + std::string Place() const { return mem_event_.place; } + uint64_t CurrentAllocated() const { return mem_event_.current_allocated; } + uint64_t CurrentReserved() const { return mem_event_.current_reserved; } + + // member function + void LogMe(BaseLogger* logger) { logger->LogMemTraceEventNode(*this); } + + private: + // data + MemTraceEvent mem_event_; +}; + +class OperatorSupplementEventNode { + public: + // constructor + explicit OperatorSupplementEventNode( + const OperatorSupplementEvent& op_supplement_event) + : op_supplement_event_(op_supplement_event) {} + // destructor + ~OperatorSupplementEventNode() {} + // getter + std::string Name() const { return op_supplement_event_.op_type; } + uint64_t TimeStampNs() const { return op_supplement_event_.timestamp_ns; } + std::map>>& InputShapes() { + return op_supplement_event_.input_shapes; + } + std::map>& Dtypes() { + return op_supplement_event_.dtypes; + } + std::string CallStack() { return op_supplement_event_.callstack; } + uint64_t ProcessId() const { return op_supplement_event_.process_id; } + uint64_t ThreadId() const { return op_supplement_event_.thread_id; } + + private: + // data + OperatorSupplementEvent op_supplement_event_; +}; + class DeviceTraceEventNode { public: // constructor @@ -139,6 +194,10 @@ class HostTraceEventNode { void AddCudaRuntimeNode(CudaRuntimeTraceEventNode* node) { runtime_node_ptrs_.push_back(node); } + void AddMemNode(MemTraceEventNode* node) { mem_node_ptrs_.push_back(node); } + void 
SetOperatorSupplementNode(OperatorSupplementEventNode* node) { + op_supplement_node_ptr_ = node; + } const std::vector& GetChildren() const { return children_; } @@ -146,6 +205,14 @@ class HostTraceEventNode { const { return runtime_node_ptrs_; } + const std::vector& GetMemTraceEventNodes() const { + return mem_node_ptrs_; + } + + OperatorSupplementEventNode* GetOperatorSupplementEventNode() const { + return op_supplement_node_ptr_; + } + void LogMe(BaseLogger* logger) { logger->LogHostTraceEventNode(*this); } private: @@ -155,6 +222,9 @@ class HostTraceEventNode { std::vector runtime_node_ptrs_; // host events called by this std::vector children_; + // memory events happened in this event period + std::vector mem_node_ptrs_; + OperatorSupplementEventNode* op_supplement_node_ptr_ = nullptr; }; class NodeTrees { @@ -162,10 +232,14 @@ class NodeTrees { // constructor NodeTrees(const std::list& host_events, const std::list& runtime_events, - const std::list& device_events) { + const std::list& device_events, + const std::list& mem_events, + const std::list& op_supplement_events) { std::vector host_event_nodes; std::vector runtime_event_nodes; std::vector device_event_nodes; + std::vector mem_event_nodes; + std::vector op_supplement_event_nodes; // encapsulate event into nodes for (auto it = host_events.begin(); it != host_events.end(); ++it) { host_event_nodes.push_back(new HostTraceEventNode(*it)); @@ -176,8 +250,16 @@ class NodeTrees { for (auto it = device_events.begin(); it != device_events.end(); ++it) { device_event_nodes.push_back(new DeviceTraceEventNode(*it)); } + for (auto it = mem_events.begin(); it != mem_events.end(); ++it) { + mem_event_nodes.push_back(new MemTraceEventNode(*it)); + } + for (auto it = op_supplement_events.begin(); + it != op_supplement_events.end(); ++it) { + op_supplement_event_nodes.push_back(new OperatorSupplementEventNode(*it)); + } // build tree - BuildTrees(host_event_nodes, runtime_event_nodes, device_event_nodes); + BuildTrees(host_event_nodes, runtime_event_nodes, device_event_nodes, + mem_event_nodes, op_supplement_event_nodes); } explicit NodeTrees( @@ -190,7 +272,9 @@ class NodeTrees { void LogMe(BaseLogger* logger); void HandleTrees(std::function, std::function, - std::function); + std::function, + std::function, + std::function); const std::map& GetNodeTrees() const { return thread_event_trees_map_; } @@ -199,11 +283,15 @@ class NodeTrees { private: std::map thread_event_trees_map_; void BuildTrees(const std::vector&, - std::vector&, - const std::vector&); + const std::vector&, + const std::vector&, + const std::vector&, + const std::vector&); HostTraceEventNode* BuildTreeRelationship( std::vector host_event_nodes, - std::vector runtime_event_nodes); + std::vector runtime_event_nodes, + std::vector mem_event_nodes, + std::vector op_supplement_event_nodes); }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/output_logger.h b/paddle/fluid/platform/profiler/output_logger.h index 05a68cf2a4a8d..47429eafa64ef 100644 --- a/paddle/fluid/platform/profiler/output_logger.h +++ b/paddle/fluid/platform/profiler/output_logger.h @@ -24,6 +24,7 @@ class DeviceTraceEventNode; // forward declaration class HostTraceEventNode; // forward declaration class CudaRuntimeTraceEventNode; // forward declaration class NodeTrees; // forward declaration +class MemTraceEventNode; // forward declaration class BaseLogger { public: @@ -33,6 +34,7 @@ class BaseLogger { virtual void LogHostTraceEventNode(const HostTraceEventNode&) {} virtual void 
LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) {} virtual void LogNodeTrees(const NodeTrees&) {} + virtual void LogMemTraceEventNode(const MemTraceEventNode&) {} }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index 8bcf856c01ab6..8e9d8bef605e6 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -101,9 +101,10 @@ std::unique_ptr Profiler::Stop() { tracer.Get().StopTracing(); tracer.Get().CollectTraceData(&collector); } - std::unique_ptr tree(new NodeTrees(collector.HostEvents(), - collector.RuntimeEvents(), - collector.DeviceEvents())); + std::unique_ptr tree( + new NodeTrees(collector.HostEvents(), collector.RuntimeEvents(), + collector.DeviceEvents(), collector.MemEvents(), + collector.OperatorSupplementEvents())); cpu_utilization_.RecordEndTimeInfo(); ExtraInfo extrainfo; extrainfo.AddExtraInfo(std::string("System Cpu Utilization"), diff --git a/paddle/fluid/platform/profiler/test_event_node.cc b/paddle/fluid/platform/profiler/test_event_node.cc index 23ad917b57d0e..b70034633ae66 100644 --- a/paddle/fluid/platform/profiler/test_event_node.cc +++ b/paddle/fluid/platform/profiler/test_event_node.cc @@ -25,13 +25,20 @@ using paddle::platform::HostTraceEventNode; using paddle::platform::KernelEventInfo; using paddle::platform::MemcpyEventInfo; using paddle::platform::MemsetEventInfo; +using paddle::platform::MemTraceEvent; +using paddle::platform::MemTraceEventNode; using paddle::platform::NodeTrees; +using paddle::platform::OperatorSupplementEvent; +using paddle::platform::OperatorSupplementEventNode; using paddle::platform::RuntimeTraceEvent; using paddle::platform::TracerEventType; +using paddle::platform::TracerMemEventType; TEST(NodeTreesTest, LogMe_case0) { std::list host_events; std::list runtime_events; std::list device_events; + std::list mem_events; + std::list op_supplement_events; host_events.push_back(HostTraceEvent(std::string("dataloader#1"), TracerEventType::Dataloader, 1000, 10000, 10, 10)); @@ -41,6 +48,19 @@ TEST(NodeTreesTest, LogMe_case0) { std::string("op2"), TracerEventType::Operator, 21000, 30000, 10, 10)); host_events.push_back(HostTraceEvent( std::string("op3"), TracerEventType::Operator, 31000, 40000, 10, 11)); + mem_events.push_back(MemTraceEvent(11500, 0x1000, + TracerMemEventType::Allocate, 10, 10, 50, + "GPU:0", 50, 50)); + mem_events.push_back(MemTraceEvent(11900, 0x1000, TracerMemEventType::Free, + 10, 10, -50, "GPU:0", 0, 50)); + std::map>> input_shapes; + std::map> dtypes; + input_shapes[std::string("X")].push_back(std::vector{1, 2, 3}); + input_shapes[std::string("X")].push_back(std::vector{4, 5, 6, 7}); + dtypes[std::string("X")].push_back(std::string("int8")); + dtypes[std::string("X")].push_back(std::string("float32")); + op_supplement_events.push_back(OperatorSupplementEvent( + 11600, "op1", input_shapes, dtypes, "op1()", 10, 10)); runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, 17000, 10, 10, 1, 0)); runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000, @@ -67,7 +87,8 @@ TEST(NodeTreesTest, LogMe_case0) { DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000, 69000, 0, 10, 11, 5, MemsetEventInfo())); ChromeTracingLogger logger("test_nodetrees_logme_case0.json"); - NodeTrees tree(host_events, runtime_events, device_events); + NodeTrees tree(host_events, runtime_events, device_events, mem_events, + op_supplement_events); std::map> nodes 
= tree.Traverse(true); EXPECT_EQ(nodes[10].size(), 4u); @@ -81,6 +102,8 @@ TEST(NodeTreesTest, LogMe_case0) { if ((*it)->Name() == "op1") { EXPECT_EQ((*it)->GetChildren().size(), 0u); EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + EXPECT_EQ((*it)->GetMemTraceEventNodes().size(), 2u); + EXPECT_NE((*it)->GetOperatorSupplementEventNode(), nullptr); } } for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { @@ -90,12 +113,15 @@ TEST(NodeTreesTest, LogMe_case0) { } } tree.LogMe(&logger); + logger.LogMetaInfo(std::unordered_map()); } TEST(NodeTreesTest, LogMe_case1) { std::list host_events; std::list runtime_events; std::list device_events; + std::list mem_events; + std::list op_supplement_events; runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, 17000, 10, 10, 1, 0)); runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000, @@ -122,7 +148,8 @@ TEST(NodeTreesTest, LogMe_case1) { DeviceTraceEvent(std::string("memset1"), TracerEventType::Memset, 66000, 69000, 0, 10, 11, 5, MemsetEventInfo())); ChromeTracingLogger logger("test_nodetrees_logme_case1.json"); - NodeTrees tree(host_events, runtime_events, device_events); + NodeTrees tree(host_events, runtime_events, device_events, mem_events, + op_supplement_events); std::map> nodes = tree.Traverse(true); EXPECT_EQ(nodes[10].size(), 1u); @@ -141,18 +168,29 @@ TEST(NodeTreesTest, LogMe_case1) { } } tree.LogMe(&logger); + logger.LogMetaInfo(std::unordered_map()); } TEST(NodeTreesTest, HandleTrees_case0) { std::list host_events; std::list runtime_events; std::list device_events; + std::list mem_events; + std::list op_supplement_events; host_events.push_back(HostTraceEvent( std::string("op1"), TracerEventType::Operator, 10000, 100000, 10, 10)); host_events.push_back(HostTraceEvent( std::string("op2"), TracerEventType::Operator, 30000, 70000, 10, 10)); host_events.push_back(HostTraceEvent( std::string("op3"), TracerEventType::Operator, 2000, 120000, 10, 11)); + mem_events.push_back(MemTraceEvent(11500, 0x1000, + TracerMemEventType::Allocate, 10, 10, 50, + "GPU:0", 50, 50)); + mem_events.push_back(MemTraceEvent(11900, 0x1000, TracerMemEventType::Free, + 10, 10, -50, "GPU:0", 0, 50)); + op_supplement_events.push_back(OperatorSupplementEvent( + 11600, "op1", std::map>>(), + std::map>(), "op1()", 10, 10)); runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, 25000, 10, 10, 1, 0)); runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 35000, @@ -169,7 +207,8 @@ TEST(NodeTreesTest, HandleTrees_case0) { DeviceTraceEvent(std::string("kernel3"), TracerEventType::Kernel, 60000, 75000, 0, 10, 11, 3, KernelEventInfo())); ChromeTracingLogger logger("test_nodetrees_handletrees_case0.json"); - NodeTrees tree(host_events, runtime_events, device_events); + NodeTrees tree(host_events, runtime_events, device_events, mem_events, + op_supplement_events); std::map> nodes = tree.Traverse(true); EXPECT_EQ(nodes[10].size(), 3u); @@ -199,6 +238,12 @@ TEST(NodeTreesTest, HandleTrees_case0) { }); std::function device_event_node_handle( [&](DeviceTraceEventNode* a) { logger.LogDeviceTraceEventNode(*a); }); + std::function mem_event_node_handle( + [&](MemTraceEventNode* a) { logger.LogMemTraceEventNode(*a); }); + std::function + op_supplement_event_node_handle([&](OperatorSupplementEventNode* a) {}); tree.HandleTrees(host_event_node_handle, runtime_event_node_handle, - device_event_node_handle); + device_event_node_handle, mem_event_node_handle, + 
op_supplement_event_node_handle); + logger.LogMetaInfo(std::unordered_map()); } diff --git a/paddle/fluid/platform/profiler/trace_event.h b/paddle/fluid/platform/profiler/trace_event.h index 6d398a26eda10..bfa000e2683de 100644 --- a/paddle/fluid/platform/profiler/trace_event.h +++ b/paddle/fluid/platform/profiler/trace_event.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once +#include #include +#include namespace paddle { namespace platform { @@ -56,6 +58,15 @@ enum class TracerEventType { NumTypes }; +enum class TracerMemEventType { + // Used to mark memory allocation + Allocate = 0, + // Used to mark memory free + Free = 1, + // A flag to denote the number of current types + NumTypes +}; + struct KernelEventInfo { // The X-dimension block size for the kernel. uint32_t block_x; @@ -118,6 +129,36 @@ struct MemsetEventInfo { uint32_t value; }; +struct OperatorSupplementEvent { + OperatorSupplementEvent() = default; + OperatorSupplementEvent( + uint64_t timestamp_ns, const std::string& op_type, + const std::map>>& + input_shapes, + const std::map>& dtypes, + const std::string& callstack, uint64_t process_id, uint64_t thread_id) + : timestamp_ns(timestamp_ns), + op_type(op_type), + input_shapes(input_shapes), + dtypes(dtypes), + callstack(callstack), + process_id(process_id), + thread_id(thread_id) {} + // timestamp of the record + uint64_t timestamp_ns; + // op type name + std::string op_type; + // input shapes + std::map>> input_shapes; + std::map> dtypes; + // call stack + std::string callstack; + // process id of the record + uint64_t process_id; + // thread id of the record + uint64_t thread_id; +}; + struct HostTraceEvent { HostTraceEvent() = default; HostTraceEvent(const std::string& name, TracerEventType type, @@ -242,5 +283,42 @@ struct DeviceTraceEvent { }; }; +struct MemTraceEvent { + MemTraceEvent() = default; + MemTraceEvent(uint64_t timestamp_ns, uint64_t addr, TracerMemEventType type, + uint64_t process_id, uint64_t thread_id, int64_t increase_bytes, + const std::string& place, uint64_t current_allocated, + uint64_t current_reserved) + : timestamp_ns(timestamp_ns), + addr(addr), + type(type), + process_id(process_id), + thread_id(thread_id), + increase_bytes(increase_bytes), + place(place), + current_allocated(current_allocated), + current_reserved(current_reserved) {} + + // timestamp of the record + uint64_t timestamp_ns; + // memory addr of allocation or free + uint64_t addr; + // memory manipulation type + TracerMemEventType type; + // process id of the record + uint64_t process_id; + // thread id of the record + uint64_t thread_id; + // increase bytes after this manipulation, allocation for sign +, free for + // sign - + int64_t increase_bytes; + // place + std::string place; + // current total allocated memory + uint64_t current_allocated; + // current total reserved memory + uint64_t current_reserved; +}; + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler/trace_event_collector.h b/paddle/fluid/platform/profiler/trace_event_collector.h index d1593bc1bfcd7..7c2ea9e16c423 100644 --- a/paddle/fluid/platform/profiler/trace_event_collector.h +++ b/paddle/fluid/platform/profiler/trace_event_collector.h @@ -39,6 +39,12 @@ class TraceEventCollector { thread_names_[tid] = name; } + void AddMemEvent(MemTraceEvent&& event) { mem_events_.push_back(event); } + + void AddOperatorSupplementEvent(OperatorSupplementEvent&& event) { + op_supplement_events_.push_back(event); + } + const std::list& HostEvents() const { return 
host_events_; } const std::list& RuntimeEvents() const { @@ -49,6 +55,12 @@ class TraceEventCollector { return device_events_; } + const std::list& MemEvents() const { return mem_events_; } + + const std::list& OperatorSupplementEvents() const { + return op_supplement_events_; + } + const std::unordered_map& ThreadNames() const { return thread_names_; } @@ -58,6 +70,8 @@ class TraceEventCollector { host_events_.clear(); runtime_events_.clear(); device_events_.clear(); + mem_events_.clear(); + op_supplement_events_.clear(); } private: @@ -65,6 +79,8 @@ class TraceEventCollector { std::list host_events_; std::list runtime_events_; std::list device_events_; + std::list mem_events_; + std::list op_supplement_events_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/utils.cc b/paddle/fluid/platform/profiler/utils.cc index de314d298c90e..1f8e113fdd914 100644 --- a/paddle/fluid/platform/profiler/utils.cc +++ b/paddle/fluid/platform/profiler/utils.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler/utils.h" +#include #include #include "glog/logging.h" @@ -21,6 +22,26 @@ limitations under the License. */ namespace paddle { namespace platform { + +template <> +std::string json_vector( + const std::vector type_vector) { + std::ostringstream res_stream; + auto count = type_vector.size(); + res_stream << "["; + for (auto it = type_vector.begin(); it != type_vector.end(); it++) { + if (count > 1) { + res_stream << "\"" << (*it) << "\"" + << ","; + } else { + res_stream << "\"" << (*it) << "\""; + } + count--; + } + res_stream << "]"; + return res_stream.str(); +} + #ifdef PADDLE_WITH_CUPTI float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread, int32_t StaticSharedMemory, @@ -61,5 +82,21 @@ float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread, } #endif +const char* StringTracerMemEventType(TracerMemEventType type) { + static const char* categary_name_[] = {"Allocate", "Free"}; + return categary_name_[static_cast(type)]; +} + +const char* StringTracerEventType(TracerEventType type) { + static const char* categary_name_[] = { + "Operator", "Dataloader", "ProfileStep", + "CudaRuntime", "Kernel", "Memcpy", + "Memset", "UserDefined", "OperatorInner", + "Forward", "Backward", "Optimization", + "Communication", "PythonOp", "PythonUserDefined", + "MluRuntime"}; + return categary_name_[static_cast(type)]; +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h index 433fd0b825a11..5f7c420789f80 100644 --- a/paddle/fluid/platform/profiler/utils.h +++ b/paddle/fluid/platform/profiler/utils.h @@ -14,11 +14,15 @@ limitations under the License. */ #pragma once #include +#include +#include #include +#include #include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/os_info.h" +#include "paddle/fluid/platform/profiler/trace_event.h" namespace paddle { namespace platform { @@ -36,6 +40,64 @@ std::string string_format(const std::string& format, Args... 
args) { return std::string(buf.get(), size - 1); // exclude the '\0' } +template +std::string json_vector(const std::vector type_vector) { + std::ostringstream res_stream; + auto count = type_vector.size(); + res_stream << "["; + for (auto it = type_vector.begin(); it != type_vector.end(); it++) { + if (count > 1) { + res_stream << (*it) << ","; + } else { + res_stream << (*it); + } + count--; + } + res_stream << "]"; + return res_stream.str(); +} + +template +std::string json_vector( + const std::vector> shape_vector) { + std::ostringstream res_stream; + auto count = shape_vector.size(); + res_stream << "["; + for (auto it = shape_vector.begin(); it != shape_vector.end(); it++) { + if (count > 1) { + res_stream << json_vector(*it) << ","; + } else { + res_stream << json_vector(*it); + } + count--; + } + res_stream << "]"; + return res_stream.str(); +} + +template <> +std::string json_vector( + const std::vector type_vector); + +template +std::string json_dict(const std::map> data_map) { + std::ostringstream res_stream; + auto count = data_map.size(); + res_stream << "{"; + for (auto it = data_map.begin(); it != data_map.end(); it++) { + if (count > 1) { + res_stream << "\"" << it->first << "\"" + << ":" << json_vector(it->second) << ","; + } else { + res_stream << "\"" << it->first << "\"" + << ":" << json_vector(it->second); + } + count--; + } + res_stream << "}"; + return res_stream.str(); +} + static std::string GetStringFormatLocalTime() { std::time_t rawtime; std::tm* timeinfo; @@ -50,6 +112,10 @@ static int64_t nsToUs(uint64_t end_ns, uint64_t start_ns = 0) { return (end_ns - start_ns) / 1000; } +const char* StringTracerMemEventType(TracerMemEventType type); + +const char* StringTracerEventType(TracerEventType type); + static float nsToUsFloat(uint64_t end_ns, uint64_t start_ns = 0) { return static_cast(end_ns - start_ns) / 1000; } @@ -63,5 +129,6 @@ float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread, int32_t dynamicSharedMemory, int32_t blockX, int32_t blockY, int32_t blockZ, float blocksPerSm); #endif + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index a99dded4d5af1..20460c78d2867 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -91,7 +91,7 @@ if(NOT WIN32) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context) set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) endif() -endif(NOT WIN32) +endif() if(WITH_PYTHON) list(APPEND PYBIND_DEPS py_func_op) @@ -153,12 +153,12 @@ if(WITH_GLOO) set(PYBIND_SRCS ${PYBIND_SRCS} gloo_context_py.cc) set(PYBIND_DEPS ${PYBIND_DEPS} imperative_gloo_context) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) -endif(WITH_GLOO) +endif() if(WITH_CRYPTO) set(PYBIND_DEPS ${PYBIND_DEPS} paddle_crypto) set(PYBIND_SRCS ${PYBIND_SRCS} crypto.cc) -endif(WITH_CRYPTO) +endif() if(WITH_PSLIB) set(DISTRIBUTE_COMPILE_FLAGS @@ -169,7 +169,7 @@ if(WITH_PSLIB) endif() set_source_files_properties( heter_wrapper_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -endif(WITH_PSLIB) +endif() if(WITH_PSCORE) if(WITH_ARM_BRPC) set(DISTRIBUTE_COMPILE_FLAGS @@ -223,15 +223,15 @@ if(WITH_PYTHON) if(WITH_XPU_BKCL) list(APPEND OP_FUNCTION_GENERETOR_DEPS bkcl_context) - endif(WITH_XPU_BKCL) + endif() if(WITH_ASCEND_CL) list(APPEND OP_FUNCTION_GENERETOR_DEPS hccl_context) - endif(WITH_ASCEND_CL) + endif() if(WITH_CNCL) list(APPEND OP_FUNCTION_GENERETOR_DEPS cncl_context) - endif(WITH_CNCL) + endif() add_executable(op_function_generator 
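// --- Illustrative aside (not part of the patch) -----------------------------
// The json_vector/json_dict templates added to profiler/utils.h above build
// JSON-style strings for the operator supplement data. The sketch below is a
// simplified, non-template stand-in for the input-shapes case only;
// ShapesToJson is a made-up name, not a Paddle API.
#include <cstddef>
#include <cstdint>
#include <map>
#include <sstream>
#include <string>
#include <vector>

std::string ShapesToJson(
    const std::map<std::string, std::vector<std::vector<int64_t>>>& shapes) {
  std::ostringstream os;
  os << "{";
  std::size_t entries_left = shapes.size();
  for (const auto& entry : shapes) {
    os << "\"" << entry.first << "\":[";
    std::size_t vecs_left = entry.second.size();
    for (const auto& shape : entry.second) {
      os << "[";
      std::size_t dims_left = shape.size();
      for (int64_t d : shape) {
        os << d << (--dims_left > 0 ? "," : "");
      }
      os << "]" << (--vecs_left > 0 ? "," : "");
    }
    os << "]" << (--entries_left > 0 ? "," : "");
  }
  os << "}";
  return os.str();
}
// For input_shapes["X"] = {{1, 2, 3}, {4, 5, 6, 7}} this produces
// {"X":[[1,2,3],[4,5,6,7]]}, the same shapes exercised in test_event_node.cc.
// The real string specialization additionally quotes each element.
// --- End of aside ------------------------------------------------------------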
op_function_generator.cc) target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS}) @@ -316,7 +316,7 @@ if(WITH_PYTHON) DEPENDS mklml) list(APPEND OP_IMPL_DEPS ${op_impl_path}/libiomp5md.dll) list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/libiomp5md.dll) - else(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS) + else() add_custom_command( OUTPUT ${op_impl_path}/openblas.dll COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_SHARED_LIB} ${op_impl_path} @@ -370,7 +370,7 @@ if(WITH_PYTHON) COMMENT "copy_if_different ${tmp_eager_impl_file} to ${eager_impl_file}" DEPENDS ${EAGER_OP_IMPL_DEPS}) endif() - else(WIN32) + else() # If there are no *.so in /usr/lib or LD_LIBRARY_PATH, # copy these *.so to current directory and append current directory to # LD_LIBRARY_PATH. This is different with Windows platformm, which search @@ -446,7 +446,7 @@ if(WITH_PYTHON) DEPENDS ${EAGER_OP_IMPL_DEPS} VERBATIM) endif() - endif(WIN32) + endif() add_custom_target(op_function_generator_cmd ALL DEPENDS ${impl_file}) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_custom_target(eager_op_function_generator_cmd ALL @@ -495,7 +495,7 @@ if(WITH_PYTHON) if(NOT APPLE AND NOT WIN32) target_link_libraries(paddle_pybind rt) - endif(NOT APPLE AND NOT WIN32) + endif() if(WITH_ROCM) target_link_libraries(paddle_pybind ${ROCM_HIPRTC_LIB}) @@ -504,4 +504,4 @@ if(WITH_PYTHON) get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(paddle_pybind ${os_dependency_modules}) add_dependencies(paddle_pybind op_function_generator_cmd) -endif(WITH_PYTHON) +endif() diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index ab6b8edd52eae..f41e84537a96c 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -1417,6 +1417,16 @@ static PyObject* tensor_method_get_non_zero_cols(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_method_is_dense(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + if (!self->tensor.defined()) { + return ToPyObject(false); + } + return ToPyObject(self->tensor.is_dense_tensor()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* tensor_method_is_sparse(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY @@ -1625,6 +1635,26 @@ static PyObject* tensor__grad_value(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor__unset_fake_empty(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + paddle::experimental::Tensor* grad = + egr::EagerUtils::mutable_grad(self->tensor); + PADDLE_ENFORCE_EQ(grad != nullptr, true, + platform::errors::InvalidArgument( + "Detected NULL grad. 
Please check if you have manually " + "cleared the grad inside autograd_meta")); + + bool is_leaf = egr::egr_utils_api::IsLeafTensor(self->tensor); + if (is_leaf) { + std::static_pointer_cast( + egr::EagerUtils::grad_node(self->tensor)) + ->SetFakeEmpty(false); + } + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL +} + #if defined(PADDLE_WITH_CUDA) static PyObject* tensor_method__uva(TensorObject* self, PyObject* args, PyObject* kwargs) { @@ -1682,6 +1712,8 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"clear_gradient", (PyCFunction)(void (*)(void))tensor_clear_gradient, METH_VARARGS | METH_KEYWORDS, NULL}, + {"is_dense", (PyCFunction)(void (*)(void))tensor_method_is_dense, + METH_VARARGS | METH_KEYWORDS, NULL}, {"_zero_grads", (PyCFunction)(void (*)(void))tensor__zero_grads, METH_VARARGS | METH_KEYWORDS, NULL}, {"_share_buffer_to", (PyCFunction)(void (*)(void))tensor__share_buffer_to, @@ -1779,6 +1811,8 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"_grad_value", (PyCFunction)(void (*)(void))tensor__grad_value, METH_VARARGS | METH_KEYWORDS, NULL}, + {"_unset_fake_empty", (PyCFunction)(void (*)(void))tensor__unset_fake_empty, + METH_VARARGS | METH_KEYWORDS, NULL}, #if defined(PADDLE_WITH_CUDA) {"_tensor_uva", (PyCFunction)(void (*)(void))tensor_method__uva, METH_VARARGS | METH_KEYWORDS, NULL}, diff --git a/paddle/fluid/pybind/eager_py_layer.cc b/paddle/fluid/pybind/eager_py_layer.cc index a0cef6388c13f..55d9f972db872 100644 --- a/paddle/fluid/pybind/eager_py_layer.cc +++ b/paddle/fluid/pybind/eager_py_layer.cc @@ -240,6 +240,8 @@ PyObject* pylayer_method_apply(PyObject* cls, PyObject* args, PyObject* outputs_tuple = nullptr; if (PyTuple_Check(outputs)) { outputs_tuple = outputs; + } else if (PyList_Check(outputs)) { + outputs_tuple = PyList_AsTuple(outputs); } else { outputs_tuple = PyTuple_New(1); Py_INCREF(outputs); diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 7f3dd1ddc38fb..e20db18ea3f53 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -38,6 +38,7 @@ set(PHI_DEPS string_tensor api_scalar api_int_array) + get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) set(PHI_DEPS ${PHI_DEPS} ${phi_kernels}) diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index a1c6989555f20..d60e0245140a4 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -335,11 +335,12 @@ cc_library( cc_library( api_gen_utils SRCS api_gen_utils.cc - DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor) + DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor + infermeta_utils) cc_library( phi_data_transform SRCS data_transform.cc - DEPS phi_tensor_raw transfer_layout_kernel cast_kernel copy_kernel tensor) + DEPS phi_tensor_raw phi tensor) cc_library( api_custom_impl SRCS api_custom_impl.cc @@ -404,7 +405,7 @@ cc_library( cc_library( tensor_copy SRCS tensor_copy.cc - DEPS phi_tensor_raw copy_kernel kernel_dispatch api_gen_utils) + DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils) cc_library( api_scalar SRCS scalar.cc diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index 91dbafe0cd38d..5e5a9d4833fc2 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -42,10 +42,14 @@ if(TENSORRT_FOUND) list(APPEND CUDA_SRCS tensorrt.cc) endif() +if(CUSPARSELT_FOUND) + list(APPEND CUDA_SRCS cusparseLt.cc) +endif() + 
configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h) if(CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) -endif(CUPTI_FOUND) +endif() if(WITH_ROCM) hip_library( phi_dynload_cuda diff --git a/paddle/phi/backends/dynload/cusparseLt.cc b/paddle/phi/backends/dynload/cusparseLt.cc new file mode 100644 index 0000000000000..9025a1b82ca3f --- /dev/null +++ b/paddle/phi/backends/dynload/cusparseLt.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/dynload/cusparseLt.h" + +namespace phi { +namespace dynload { + +std::once_flag cusparselt_dso_flag; +void *cusparselt_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUSPARSELT_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/cusparseLt.h b/paddle/phi/backends/dynload/cusparseLt.h new file mode 100644 index 0000000000000..8eecefab5e469 --- /dev/null +++ b/paddle/phi/backends/dynload/cusparseLt.h @@ -0,0 +1,78 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include + +#include // NOLINT + +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/port.h" + +namespace phi { +namespace dynload { + +extern std::once_flag cusparselt_dso_flag; +extern void *cusparselt_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load cupti routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#define DECLARE_DYNAMIC_LOAD_CUSPARSELT_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + cusparseStatus_t operator()(Args... 
args) { \ + using cusparseltFunc = decltype(&::__name); \ + std::call_once(cusparselt_dso_flag, []() { \ + cusparselt_dso_handle = phi::dynload::GetCusparseLtDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(cusparselt_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#if defined(PADDLE_WITH_CUDA) +#if CUDA_VERSION >= 11020 +#define CUSPARSELT_ROUTINE_EACH(__macro) \ + __macro(cusparseLtInit); \ + __macro(cusparseLtDestroy); \ + __macro(cusparseLtDenseDescriptorInit); \ + __macro(cusparseLtStructuredDescriptorInit); \ + __macro(cusparseLtMatmulDescriptorInit); \ + __macro(cusparseLtMatmulDescSetAttribute); \ + __macro(cusparseLtMatmulAlgSelectionInit); \ + __macro(cusparseLtMatmulAlgSetAttribute); \ + __macro(cusparseLtMatmulGetWorkspace); \ + __macro(cusparseLtMatmulPlanInit); \ + __macro(cusparseLtMatDescriptorDestroy); \ + __macro(cusparseLtSpMMACompressedSize2); \ + __macro(cusparseLtSpMMACompress2); \ + __macro(cusparseLtMatmulSearch); \ + __macro(cusparseLtMatmulAlgGetAttribute); \ + __macro(cusparseLtMatmulPlanDestroy); \ + __macro(cusparseLtMatmul); \ + __macro(cusparseGetErrorString); + +CUSPARSELT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSELT_WRAP); +#endif +#endif + +#undef DECLARE_DYNAMIC_LOAD_CUSPARSELT_WRAP +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 2f35e22a18f82..36a7869595923 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -76,6 +76,8 @@ DEFINE_string(mkl_dir, DEFINE_string(op_dir, "", "Specify path for loading user-defined op library."); +DEFINE_string(cusparselt_dir, "", "Specify path for loading libcusparseLt.so."); + #ifdef PADDLE_WITH_HIP DEFINE_string(miopen_dir, @@ -578,5 +580,18 @@ void* GetMKLRTDsoHandle() { #endif } +void* GetCusparseLtDsoHandle() { +// APIs available after CUDA 11.2 +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11020 + return GetDsoHandleFromSearchPath(FLAGS_cusparselt_dir, "libcusparseLt.so"); +#else + std::string warning_msg( + "Your CUDA_VERSION less 11.2, not support cusparseLt. 
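// --- Illustrative aside (not part of the patch) -----------------------------
// DECLARE_DYNAMIC_LOAD_CUSPARSELT_WRAP above generates, for each listed
// routine, a struct whose operator() lazily opens the DSO and resolves the
// symbol once. The sketch below spells that pattern out for a single
// placeholder symbol; "libexample.so" and example_add are made-up names, and
// the real macro goes through GetCusparseLtDsoHandle() and a templated
// operator() instead.
#include <dlfcn.h>

#include <mutex>

namespace {

std::once_flag g_dso_flag;
void* g_dso_handle = nullptr;

// Resolve "example_add" the first time it is called, cache the symbol, and
// forward the arguments. Error handling for a missing library or symbol is
// omitted for brevity, as it is in the macro above.
int example_add(int a, int b) {
  std::call_once(g_dso_flag,
                 []() { g_dso_handle = dlopen("libexample.so", RTLD_LAZY); });
  using FuncT = int (*)(int, int);
  static void* p_example_add = dlsym(g_dso_handle, "example_add");
  return reinterpret_cast<FuncT>(p_example_add)(a, b);
}

}  // namespace
// --- End of aside ------------------------------------------------------------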
" + "If you want to use cusparseLt, please upgrade CUDA and rebuild " + "PaddlePaddle."); + return nullptr; +#endif +} + } // namespace dynload } // namespace phi diff --git a/paddle/phi/backends/dynload/dynamic_loader.h b/paddle/phi/backends/dynload/dynamic_loader.h index 942a635b649bc..642535fc50cf3 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.h +++ b/paddle/phi/backends/dynload/dynamic_loader.h @@ -45,6 +45,7 @@ void* GetNvtxDsoHandle(); void* GetCUFFTDsoHandle(); void* GetMKLRTDsoHandle(); void* GetROCFFTDsoHandle(); +void* GetCusparseLtDsoHandle(); void SetPaddleLibPath(const std::string&); diff --git a/paddle/phi/backends/gpu/gpu_types.h b/paddle/phi/backends/gpu/gpu_types.h index 05f6c5a545ca0..77f403795b6b3 100644 --- a/paddle/phi/backends/gpu/gpu_types.h +++ b/paddle/phi/backends/gpu/gpu_types.h @@ -67,6 +67,16 @@ DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); +DECLARE_CONSTANT_FOR_GPU(gpuMemcpyHostToDevice, + cudaMemcpyKind::cudaMemcpyHostToDevice, + hipMemcpyKind::hipMemcpyHostToDevice); +DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToHost, + cudaMemcpyKind::cudaMemcpyDeviceToHost, + hipMemcpyKind::hipMemcpyDeviceToHost); +DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToDevice, + cudaMemcpyKind::cudaMemcpyDeviceToDevice, + hipMemcpyKind::hipMemcpyDeviceToDevice); + #undef DECLARE_CONSTANT_FOR_GPU } // namespace phi diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index f10fc54795ddb..add27da56b59a 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -886,6 +886,58 @@ void DropoutInferMeta(const MetaTensor& x, } } +void DropoutNdInferMeta(const MetaTensor& x, + const MetaTensor& seed_tensor, + float p, + bool is_test, + const std::string& mode, + int seed, + bool fix_seed, + const std::vector& axis, + MetaTensor* out, + MetaTensor* mask) { + auto x_dims = x.dims(); + + PADDLE_ENFORCE_LE( + axis.size(), + x_dims.size(), + phi::errors::InvalidArgument( + "The length of axis is expected to be less than or equal to the " + "dimension size of x. But recieved the length of axis is %d, the " + "dimension size of x is %d, x's shape is {%s}.", + axis.size(), + x_dims.size(), + x_dims)); + for (size_t i = 0; i < axis.size(); ++i) { + PADDLE_ENFORCE_EQ( + axis[i] >= 0 && axis[i] <= x_dims.size() - 1, + true, + phi::errors::InvalidArgument( + "The %d-th value of axis is expected to be greater ot " + "equal to 0 and less than the dimensions of x. 
But " + "recieved axis is {%s}, the dimension size of x is %d.", + i, + phi::make_ddim(axis), + x_dims.size())); + } + + out->set_dims(x_dims); + out->share_lod(x); + out->set_dtype(x.dtype()); + + if (mask != nullptr) { + std::vector mask_dims(x.dims().size(), 1); + + std::for_each( + axis.begin(), axis.end(), [&mask_dims, &x_dims](const int64_t& t) { + mask_dims[t] = x_dims[t]; + }); + + mask->set_dims(make_ddim(mask_dims)); + mask->set_dtype(DataType::UINT8); + } +} + void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { auto x_dims = x.dims(); auto x_rank = static_cast(x_dims.size()); diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 2cd34406fc2d2..9709edf63ccc0 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -145,6 +145,17 @@ void DropoutInferMeta(const MetaTensor& x, MetaTensor* out, MetaTensor* mask); +void DropoutNdInferMeta(const MetaTensor& x, + const MetaTensor& seed_tensor, + float p, + bool is_test, + const std::string& mode, + int seed, + bool fix_seed, + const std::vector& axis, + MetaTensor* out, + MetaTensor* mask); + void ElementwiseInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 0beb7223f212a..bc41a24c44562 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -235,12 +235,12 @@ void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out) { out->set_layout(x.layout()); } -void CumsumInferMeta(const MetaTensor& x, - int axis, - bool flatten, - bool exclusive, - bool reverse, - MetaTensor* out) { +void CumInferMeta(const MetaTensor& x, + int axis, + bool flatten, + bool exclusive, + bool reverse, + MetaTensor* out) { auto x_dims = x.dims(); if (flatten) { out->set_dims(phi::make_ddim({phi::product(x_dims)})); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index a288b9371016f..a0cad3e628e3f 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -60,12 +60,12 @@ void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out); void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out); -void CumsumInferMeta(const MetaTensor& x, - int axis, - bool flatten, - bool exclusive, - bool reverse, - MetaTensor* out); +void CumInferMeta(const MetaTensor& x, + int axis, + bool flatten, + bool exclusive, + bool reverse, + MetaTensor* out); void DiagInferMeta(const MetaTensor& x, int offset, diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 67795c2a8aa6e..acfeaf21d0742 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -41,124 +41,104 @@ set(COMMON_KERNEL_DEPS selected_rows_functor) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta infermeta_utils) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} switch_autotune) -# [ 2. Kernels that most kernels depend on ] -# There are a few kernels that are very basic operations, and most of the -# kernels depend on these kernels. -set(COMMON_BAISC_KERNELS empty_kernel full_kernel) -kernel_library(empty_kernel DEPS ${COMMON_KERNEL_DEPS}) -kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) - -# [ 3. 
Kernels with special dependencies ] -# Some kernels depend on some targets that are not commonly used. -# These targets are not suitable for common dependencies. -# In this case, you need to manually generate them here. -set(AUTOTUNE_KERNELS conv_kernel conv_grad_kernel conv_grad_grad_kernel - conv_transpose_kernel conv_transpose_grad_kernel) -set(MANUAL_BUILD_KERNELS - ${AUTOTUNE_KERNELS} - cross_entropy_kernel - adam_kernel - adamw_kernel - deformable_conv_kernel - deformable_conv_grad_kernel - eigh_kernel - gumbel_softmax_kernel - gumbel_softmax_grad_kernel - hierarchical_sigmoid_kernel - hierarchical_sigmoid_grad_kernel - matrix_power_kernel - matrix_power_grad_kernel - maxout_kernel - maxout_grad_kernel - pool_kernel - put_along_axis_kernel - put_along_axis_grad_kernel - segment_pool_kernel - segment_pool_grad_kernel - softmax_kernel - softmax_grad_kernel - take_along_axis_kernel - take_along_axis_grad_kernel - triangular_solve_grad_kernel - determinant_grad_kernel - reduce_sum_kernel - reduce_mean_kernel - rnn_kernel - rnn_grad_kernel - warpctc_kernel - warpctc_grad_kernel) -foreach(src ${AUTOTUNE_KERNELS}) - kernel_library(${src} DEPS ${COMMON_KERNEL_DEPS} switch_autotune) -endforeach() -kernel_library( - adam_kernel - DEPS - gflags - glog - flags - ${COMMON_KERNEL_DEPS} - selected_rows_functor - threadpool - jit_kernel_helper) -kernel_library(adamw_kernel DEPS ${COMMON_KERNEL_DEPS} adam_kernel) -kernel_library(cross_entropy_kernel DEPS ${COMMON_KERNEL_DEPS} softmax - cross_entropy) -kernel_library(deformable_conv_kernel DEPS ${COMMON_KERNEL_DEPS} - deformable_conv_functor) -kernel_library(deformable_conv_grad_kernel DEPS ${COMMON_KERNEL_DEPS} - deformable_conv_functor) -kernel_library(determinant_grad_kernel DEPS ${COMMON_KERNEL_DEPS} - matrix_inverse) -kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) -kernel_library(hierarchical_sigmoid_kernel DEPS ${COMMON_KERNEL_DEPS} - matrix_bit_code) -kernel_library(hierarchical_sigmoid_grad_kernel DEPS ${COMMON_KERNEL_DEPS} - matrix_bit_code) -kernel_library(gumbel_softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) -kernel_library(gumbel_softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) -kernel_library(reduce_sum_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel) -kernel_library(reduce_mean_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel) -kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) -kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} - matrix_inverse) -kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) -kernel_library(maxout_grad_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) -kernel_library(pool_kernel DEPS ${COMMON_KERNEL_DEPS} pooling) -kernel_library(put_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} - gather_scatter_kernel) -kernel_library(put_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} - gather_scatter_kernel) -kernel_library(segment_pool_kernel DEPS ${COMMON_KERNEL_DEPS} segment_pooling) -kernel_library(segment_pool_grad_kernel DEPS ${COMMON_KERNEL_DEPS} - segment_pooling) -kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) -kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) -kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} - gather_scatter_kernel) -kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} - gather_scatter_kernel) -kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} - matrix_reduce) -kernel_library(rnn_kernel DEPS ${COMMON_KERNEL_DEPS} 
concat_and_split_functor - lstm_compute gru_compute) -kernel_library(rnn_grad_kernel DEPS ${COMMON_KERNEL_DEPS} - concat_and_split_functor lstm_compute gru_compute) -kernel_library(warpctc_kernel DEPS ${COMMON_KERNEL_DEPS} phi_dynload_warpctc - sequence_padding sequence_scale) -kernel_library(warpctc_grad_kernel DEPS ${COMMON_KERNEL_DEPS} - phi_dynload_warpctc sequence_padding sequence_scale) - -# 4. auto parse and build kernel targets by cmake -register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS - ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS}) +set(COMMON_KERNEL_DEPS + ${COMMON_KERNEL_DEPS} + threadpool + jit_kernel_helper + softmax + cross_entropy + matrix_bit_code + lapack_function + lstm_compute + gru_compute + deformable_conv_functor + matrix_reduce + segment_pooling + gather_scatter_kernel + pooling + maxouting + matrix_inverse + phi_dynload_warpctc + sequence_padding + sequence_scale) -# phi sparse kernels -add_subdirectory(sparse) -# phi selected_rows kernels -add_subdirectory(selected_rows) +set(COMMON_KERNEL_DEPS + ${COMMON_KERNEL_DEPS} + dense_tensor + string_tensor + sparse_coo_tensor + sparse_csr_tensor + kernel_context + kernel_factory + arg_map_context + convert_utils + lod_utils + custom_kernel + string_infermeta + utf8proc) copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) # For strings kernels add_subdirectory(strings) + +file(GLOB kernel_h "*.h" "selected_rows/*.h" "sparse/*.h" "strings/*.h") +file(GLOB kernel_impl_h "impl/*.h" "selected_rows/impl/*.h") +file(GLOB kernel_primitive_h "primitive/*.h") +file( + GLOB + kernel_cc + "*.cc" + "cpu/*.cc" + "selected_rows/*.cc" + "selected_rows/cpu/*.cc" + "sparse/*.cc" + "sparse/cpu/*.cc" + "strings/*.cc" + "strings/cpu/*.cc") + +file( + GLOB + kernel_cu + "gpu/*.cu" + "gpu/*.cu.cc" + "gpudnn/*.cu" + "kps/*.cu" + "selected_rows/gpu/*.cu" + "sparse/gpu/*.cu" + "strings/*.cu" + "strings/gpu/*.cu") + +# file(GLOB kernel_cudnn "gpudnn/*.cu") +# file(GLOB kernel_kps "kps/*.cu") +file(GLOB kernel_xpu "xpu/*.cc") + +add_library(phi_cpu ${kernel_cc}) +kernel_declare("${kernel_cc}") +target_link_libraries(phi_cpu ${COMMON_KERNEL_DEPS}) +set_property(GLOBAL PROPERTY PHI_KERNELS phi_cpu) + +if(WITH_GPU OR WITH_ROCM) + if(WITH_GPU) + add_library(phi_gpu ${kernel_cu}) + elseif(WITH_ROCM) + hip_add_library(phi_gpu STATIC ${kernel_cu}) + endif() + kernel_declare("${kernel_cu}") + target_link_libraries(phi_gpu ${COMMON_KERNEL_DEPS}) + set_property(GLOBAL PROPERTY PHI_KERNELS phi_cpu phi_gpu) +endif() + +if(WITH_XPU) + if(WITH_XPU_KP) + file(GLOB kernel_xpu_kps "kps/*.cu") + xpu_add_library(phi_xpu STATIC ${kernel_xpu} ${kernel_xpu_kps}) + else() + add_library(phi_xpu ${kernel_xpu}) + endif() + kernel_declare(${kernel_xpu}) + target_link_libraries(phi_xpu ${COMMON_KERNEL_DEPS}) + set_property(GLOBAL PROPERTY PHI_KERNELS phi_cpu phi_xpu) +endif() diff --git a/paddle/phi/kernels/cpu/cum_kernel.cc b/paddle/phi/kernels/cpu/cum_kernel.cc new file mode 100644 index 0000000000000..cd171cc8fc5fc --- /dev/null +++ b/paddle/phi/kernels/cpu/cum_kernel.cc @@ -0,0 +1,271 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cum_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +void ComputeImp(Device d, + const Dim& dims, + X x, + Out out, + int axis, + bool reverse, + bool exclusive, + Reducer reducer) { + if (!reverse) { + out.reshape(dims).device(d) = + x.reshape(dims).scan(axis, reducer, exclusive); + } else { + std::array rev; + rev.fill(false); + rev[axis] = reverse; + out.reshape(dims).device(d) = x.reshape(dims) + .reverse(rev) + .scan(axis, reducer, exclusive) + .reverse(rev); + } +} + +template +void ScanKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool flatten, + bool exclusive, + bool reverse, + Reducer reducer, + DenseTensor* out) { + auto out_dims = out->dims(); + + PADDLE_ENFORCE_EQ( + axis < out_dims.size() && axis >= (0 - out_dims.size()), + true, + phi::errors::OutOfRange( + "Attr(axis) is out of range, It's expected " + "to be in range of [-%d, %d]. But received Attr(axis) = %d.", + out_dims.size(), + out_dims.size() - 1, + axis)); + if (axis < 0) { + axis += out_dims.size(); + } + + dev_ctx.template Alloc(out); + + int pre = 1; + int post = 1; + int mid = out_dims[axis]; + for (int i = 0; i < axis; ++i) { + pre *= out_dims[i]; + } + for (int i = axis + 1; i < out_dims.size(); ++i) { + post *= out_dims[i]; + } + + auto x0 = EigenVector::Flatten(x); + auto out0 = EigenVector::Flatten(*out); + auto& place = *dev_ctx.eigen_device(); + + using IndexT = Eigen::DenseIndex; + if (pre == 1) { + if (post == 1) { + ComputeImp(place, + Eigen::DSizes(mid), + x0, + out0, + /* axis= */ 0, + reverse, + exclusive, + reducer); + } else { + ComputeImp(place, + Eigen::DSizes(mid, post), + x0, + out0, + /* axis= */ 0, + reverse, + exclusive, + reducer); + } + } else { + if (post == 1) { + ComputeImp(place, + Eigen::DSizes(pre, mid), + x0, + out0, + /* axis= */ 1, + reverse, + exclusive, + reducer); + } else { + ComputeImp(place, + Eigen::DSizes(pre, mid, post), + x0, + out0, + /* axis= */ 1, + reverse, + exclusive, + reducer); + } + } +} + +template +void CumsumKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* out) { + using Reducer = Eigen::internal::SumReducer; + auto reducer = Reducer(); + ScanKernel( + dev_ctx, x, axis, flatten, exclusive, reverse, reducer, out); +} + +template +struct LogSumExp { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& a, + const T& b) const { + auto mi = Eigen::internal::scalar_min_op()(a, b); + auto ma = Eigen::internal::scalar_max_op()(a, b); + + auto sub = Eigen::internal::scalar_difference_op(); + auto add = Eigen::internal::scalar_sum_op(); + auto exp = Eigen::internal::scalar_exp_op(); + auto log1p = Eigen::internal::scalar_log1p_op(); + auto cmp_lt = + Eigen::internal::scalar_cmp_op(); + + auto logsumexp = add(log1p(exp(sub(mi, ma))), ma); + return cmp_lt(ma, Eigen::NumTraits::lowest()) ? 
ma : logsumexp; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const T& a, + const T& b) const { + auto mi = Eigen::internal::pmin(a, b); + auto ma = Eigen::internal::pmax(a, b); + using Eigen::internal::padd; + using Eigen::internal::pcmp_lt; + using Eigen::internal::pexp; + using Eigen::internal::plog1p; + using Eigen::internal::pset1; + using Eigen::internal::psub; + + auto logsumexp = padd(plog1p(pexp(psub(mi, ma))), ma); + return pselect( + pcmp_lt(ma, pset1(Eigen::NumTraits::lowest())), ma, logsumexp); + } +}; + +template +struct LogSumExpReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + LogSumExp logsumexp; + *accum = logsumexp(*accum, t); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, + Packet* accum) const { + LogSumExp logsumexp; + *accum = logsumexp.packetOp(*accum, p); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return Eigen::NumTraits::lowest(); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { + return Eigen::internal::pset1(initialize()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet + finalizePacket(const Packet& vaccum) const { + return vaccum; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T + finalizeBoth(const T saccum, const Packet& vaccum) const { + auto max_reducer = Eigen::internal::MaxReducer(); + auto sum_reducer = Eigen::internal::SumReducer(); + auto exp = Eigen::internal::scalar_exp_op(); + auto cmp_lt = + Eigen::internal::scalar_cmp_op(); + auto log = Eigen::internal::scalar_log_op(); + auto add = Eigen::internal::scalar_sum_op(); + + using Eigen::internal::pexp; + using Eigen::internal::psub; + + // `ma = max(x1, ..., xn)` + // If the max of all of the `xi` is `-infinity` then the result is + // -infinity. If the max is larger than `-infinity` then it's safe to use + // for normalization even if the other elements are `-infinity`. + // + // `logsumexp(x1, ..., xn) = ma + log (exp(x1 - ma) + ... + exp(xn - ma))` + auto ma = max_reducer.finalizeBoth(saccum, vaccum); + auto logsumexp = add(log(sum_reducer.finalizeBoth( + exp(saccum - ma), pexp(psub(vaccum, pset1(ma))))), + ma); + return cmp_lt(ma, Eigen::NumTraits::lowest()) ? initialize() : logsumexp; + } +}; + +template +void LogcumsumexpKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* out) { + using Reducer = LogSumExpReducer; + auto reducer = Reducer(); + ScanKernel( + dev_ctx, x, axis, flatten, exclusive, reverse, reducer, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(cumsum, + CPU, + ALL_LAYOUT, + phi::CumsumKernel, + float, + double, + int16_t, + int, + int64_t) {} + +PD_REGISTER_KERNEL( + logcumsumexp, CPU, ALL_LAYOUT, phi::LogcumsumexpKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/cumsum_kernel.cc b/paddle/phi/kernels/cpu/cumsum_kernel.cc deleted file mode 100644 index d32e18479aae9..0000000000000 --- a/paddle/phi/kernels/cpu/cumsum_kernel.cc +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
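// --- Illustrative aside (not part of the patch) -----------------------------
// The LogSumExp functor and LogSumExpReducer above rely on the numerically
// stable pairwise update logsumexp(a, b) = max + log1p(exp(min - max)).
// Subtracting the running maximum keeps exp() from overflowing, and log1p
// stays accurate when exp(min - max) is tiny. The sketch below is a plain
// scalar version of that update, not the Eigen packet code.
#include <algorithm>
#include <cmath>
#include <limits>

double LogSumExpPair(double a, double b) {
  double ma = std::max(a, b);
  double mi = std::min(a, b);
  if (ma == -std::numeric_limits<double>::infinity()) {
    return ma;  // both inputs are -inf; avoid (-inf) - (-inf) below
  }
  return ma + std::log1p(std::exp(mi - ma));
}
// Folding LogSumExpPair over {x1, ..., xn}, starting from
// std::numeric_limits<double>::lowest() as the reducer above does, yields
// log(exp(x1) + ... + exp(xn)); the logcumsumexp kernel accumulates exactly
// this quantity along the scan axis.
// --- End of aside ------------------------------------------------------------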
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/cumsum_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - -namespace phi { - -struct CumsumFunctor { - template - const typename X::TensorScanSumOp operator()(X x, - int axis, - bool exclusive) const { - return x.cumsum(axis, exclusive); - } -}; - -template -void ComputeImp(Device d, - const Dim& dims, - X x, - Out out, - int axis, - bool reverse, - bool exclusive) { - if (!reverse) { - out.reshape(dims).device(d) = - CumsumFunctor()(x.reshape(dims), axis, exclusive); - } else { - std::array rev; - rev.fill(false); - rev[axis] = reverse; - out.reshape(dims).device(d) = - CumsumFunctor()(x.reshape(dims).reverse(rev), axis, exclusive) - .reverse(rev); - } -} - -template -void CumsumKernel(const Context& dev_ctx, - const DenseTensor& x, - int axis, - bool flatten, - bool exclusive, - bool reverse, - DenseTensor* out) { - auto out_dims = out->dims(); - - PADDLE_ENFORCE_EQ( - axis < out_dims.size() && axis >= (0 - out_dims.size()), - true, - phi::errors::OutOfRange( - "Attr(axis) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(axis) = %d.", - out_dims.size(), - out_dims.size() - 1, - axis)); - if (axis < 0) { - axis += out_dims.size(); - } - - dev_ctx.template Alloc(out); - - int pre = 1; - int post = 1; - int mid = out_dims[axis]; - for (int i = 0; i < axis; ++i) { - pre *= out_dims[i]; - } - for (int i = axis + 1; i < out_dims.size(); ++i) { - post *= out_dims[i]; - } - - auto x0 = EigenVector::Flatten(x); - auto out0 = EigenVector::Flatten(*out); - auto& place = *dev_ctx.eigen_device(); - - using IndexT = Eigen::DenseIndex; - if (pre == 1) { - if (post == 1) { - ComputeImp(place, - Eigen::DSizes(mid), - x0, - out0, - /* axis= */ 0, - reverse, - exclusive); - } else { - ComputeImp(place, - Eigen::DSizes(mid, post), - x0, - out0, - /* axis= */ 0, - reverse, - exclusive); - } - } else { - if (post == 1) { - ComputeImp(place, - Eigen::DSizes(pre, mid), - x0, - out0, - /* axis= */ 1, - reverse, - exclusive); - } else { - ComputeImp(place, - Eigen::DSizes(pre, mid, post), - x0, - out0, - /* axis= */ 1, - reverse, - exclusive); - } - } -} - -} // namespace phi - -PD_REGISTER_KERNEL(cumsum, - CPU, - ALL_LAYOUT, - phi::CumsumKernel, - float, - double, - int16_t, - int, - int64_t) {} diff --git a/paddle/phi/kernels/cpu/dropout_grad_kernel.cc b/paddle/phi/kernels/cpu/dropout_grad_kernel.cc index db95656421884..42b2834aaffc9 100644 --- a/paddle/phi/kernels/cpu/dropout_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/dropout_grad_kernel.cc @@ -21,16 +21,17 @@ namespace phi { template -void DropoutGradRawKernel(const Context& dev_ctx, - const DenseTensor& mask, - const DenseTensor& out_grad, - float p, - bool is_test, - const std::string& mode, - DenseTensor* x_grad) { +void DropoutNdGradKernel(const Context& dev_ctx, + const DenseTensor& mask, + const DenseTensor& out_grad, + float p, + bool is_test, + const std::string& mode, + const std::vector& axis, + 
DenseTensor* x_grad) { auto* grad_x = x_grad; auto* grad_y = &out_grad; - grad_x->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(grad_x); auto dX = EigenVector::Flatten(*grad_x); auto dY = EigenVector::Flatten(*grad_y); @@ -44,19 +45,41 @@ void DropoutGradRawKernel(const Context& dev_ctx, dX.device(place) = dY * static_cast(1.0f - p); } } else { + std::vector out_dims = phi::vectorize(out_grad.dims()); auto M = EigenVector::Flatten(mask); if (dropout_implementation == "upscale_in_train") { if (p == 1.0f) { dX.device(place) = static_cast(0) * dY; } else { - dX.device(place) = dY * M.cast() / static_cast(1.0f - p); + if (axis.empty()) { + dX.device(place) = dY * M.cast() / static_cast(1.0f - p); + } else { + dX.device(place) = + dY * M.broadcast(out_dims).cast() / static_cast(1.0f - p); + } } } else { - dX.device(place) = dY * M.cast(); + if (axis.empty()) { + dX.device(place) = dY * M.cast(); + } else { + dX.device(place) = dY * M.broadcast(out_dims).cast(); + } } } } +template +void DropoutGradRawKernel(const Context& dev_ctx, + const DenseTensor& mask, + const DenseTensor& out_grad, + float p, + bool is_test, + const std::string& mode, + DenseTensor* x_grad) { + DropoutNdGradKernel( + dev_ctx, mask, out_grad, p, is_test, mode, {}, x_grad); +} + } // namespace phi PD_REGISTER_KERNEL(dropout_grad, @@ -66,3 +89,7 @@ PD_REGISTER_KERNEL(dropout_grad, float, double, phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL( + dropout_nd_grad, CPU, ALL_LAYOUT, phi::DropoutNdGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/dropout_kernel.cc b/paddle/phi/kernels/cpu/dropout_kernel.cc index d9c02eff0106f..d3ca21cfe33b9 100644 --- a/paddle/phi/kernels/cpu/dropout_kernel.cc +++ b/paddle/phi/kernels/cpu/dropout_kernel.cc @@ -17,10 +17,34 @@ #include "paddle/fluid/framework/generator.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/expand_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" namespace phi { +template +void ComputeDropoutInference(const Context& ctx, + const DenseTensor& x, + float dropout_prob, + bool upscale_in_train, + DenseTensor* y) { + if (upscale_in_train) { + const auto* X_data = x.data(); + T* Y_data = ctx.template Alloc(y); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int i = 0; i < x.numel(); i++) { + Y_data[i] = X_data[i]; + } + } else { + auto X = EigenMatrix::Reshape(x, 1); + auto Y = EigenMatrix::Reshape(*y, 1); + auto& place = *ctx.eigen_device(); + Y.device(place) = X * static_cast(1.0f - dropout_prob); + } +} + template void DropoutRawKernel(const Context& dev_ctx, const DenseTensor& x, @@ -34,13 +58,13 @@ void DropoutRawKernel(const Context& dev_ctx, DenseTensor* mask) { auto* y = out; const auto* x_data = x.data(); - auto* y_data = y->mutable_data(dev_ctx.GetPlace()); + T* y_data = dev_ctx.template Alloc(y); float dropout_prob = p; auto& dropout_implementation = mode; bool upscale_in_train = (dropout_implementation == "upscale_in_train"); if (!is_test) { - auto* mask_data = mask->mutable_data(dev_ctx.GetPlace()); + auto* mask_data = dev_ctx.template Alloc(mask); size_t size = phi::product(mask->dims()); // Special case when dropout_prob is 1.0 @@ -76,21 +100,92 @@ void DropoutRawKernel(const Context& dev_ctx, } } } else { - if (upscale_in_train) { - const auto* X_data = x.data(); - auto* Y_data = y->mutable_data(dev_ctx.GetPlace()); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int i = 0; i < x.numel(); i++) { - Y_data[i] 
= X_data[i]; - } + ComputeDropoutInference( + dev_ctx, x, dropout_prob, upscale_in_train, y); + } +} + +template +void DropoutNdKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& seed_tensor, + float p, + bool is_test, + const std::string& mode, + int seed, + bool fix_seed, + const std::vector& axis, + DenseTensor* out, + DenseTensor* mask) { + auto* y = out; + const auto* x_data = x.data(); + T* y_data = dev_ctx.template Alloc(y); + float dropout_prob = p; + + auto& dropout_implementation = mode; + bool upscale_in_train = (dropout_implementation == "upscale_in_train"); + if (!is_test) { + DenseTensor t_mask; + t_mask.Resize(mask->dims()); + T* t_mask_data = dev_ctx.template Alloc(&t_mask); + auto* mask_data = dev_ctx.template Alloc(mask); + size_t size = phi::product(mask->dims()); + + // Special case when dropout_prob is 1.0 + if (dropout_prob == 1.0f) { + std::memset(y_data, 0, size * sizeof(*y_data)); // NOLINT + std::memset(t_mask_data, 0, size * sizeof(*t_mask_data)); // NOLINT + std::memset(mask_data, 0, size * sizeof(*mask_data)); // NOLINT + return; + } + // std::minstd_rand engine; + // NOTE: fixed seed should only be used in unittest or for debug. + // Guarantee to use random seed in training. + int seed_data = 0; + if (seed_tensor.get_ptr() != nullptr) { + seed_data = *(seed_tensor->data()); } else { - auto X = EigenMatrix::Reshape(x, 1); - auto Y = EigenMatrix::Reshape(*y, 1); - auto& place = *dev_ctx.eigen_device(); - Y.device(place) = X * static_cast(1.0f - dropout_prob); + seed_data = fix_seed ? seed : 0; } + auto engine = paddle::framework::GetCPURandomEngine(seed_data); + + std::uniform_real_distribution dist(0, 1); + + for (size_t i = 0; i < size; ++i) { + if (dist(*engine) < dropout_prob) { + t_mask_data[i] = 0; + mask_data[i] = 0; + } else { + t_mask_data[i] = 1; + mask_data[i] = 1; + } + } + auto& x_dims = x.dims(); + DenseTensor broadcast_mask; + broadcast_mask.Resize(x_dims); + T* broadcast_mask_data = dev_ctx.template Alloc(&broadcast_mask); + + std::vector mask_bst_dims_vec; + for (int i = 0; i < x_dims.size(); i++) { + mask_bst_dims_vec.emplace_back(x_dims[i]); + } + IntArray mask_bst_dims(mask_bst_dims_vec); + ExpandKernel(dev_ctx, t_mask, mask_bst_dims, &broadcast_mask); + + for (auto i = 0; i < x.numel(); i++) { + if (broadcast_mask_data[i] == static_cast(1)) { + if (upscale_in_train) { + y_data[i] = x_data[i] / static_cast(1.0f - dropout_prob); + } else { + y_data[i] = x_data[i]; + } + } else { + y_data[i] = 0; + } + } + } else { + ComputeDropoutInference( + dev_ctx, x, dropout_prob, upscale_in_train, y); } } @@ -103,3 +198,6 @@ PD_REGISTER_KERNEL(dropout, float, double, phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL( + dropout_nd, CPU, ALL_LAYOUT, phi::DropoutNdKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/logcumsumexp_grad_kernel.cc b/paddle/phi/kernels/cpu/logcumsumexp_grad_kernel.cc new file mode 100644 index 0000000000000..17f28b411bcdd --- /dev/null +++ b/paddle/phi/kernels/cpu/logcumsumexp_grad_kernel.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
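The nd-dropout path above draws the random mask at the reduced shape and expands it to x's shape before the element-wise step. A minimal host-side sketch of that final step, assuming the broadcast mask has already been materialized to x's length (the function name, the uint8_t mask type, and the use of std::vector are illustrative assumptions):

#include <cstddef>
#include <cstdint>
#include <vector>

// Apply an already-broadcast keep/drop mask. With "upscale_in_train" the kept
// elements are rescaled by 1/(1-p) so the training-time expectation matches
// the inference output.
template <typename T>
std::vector<T> ApplyDropoutMask(const std::vector<T>& x,
                                const std::vector<uint8_t>& broadcast_mask,
                                float p,
                                bool upscale_in_train) {
  std::vector<T> y(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    if (broadcast_mask[i] != 0) {
      y[i] = upscale_in_train ? x[i] / static_cast<T>(1.0f - p) : x[i];
    } else {
      y[i] = static_cast<T>(0);
    }
  }
  return y;
}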
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/logcumsumexp_grad_kernel.h" + +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/logcumsumexp_grad_impl.h" + +PD_REGISTER_KERNEL(logcumsumexp_grad, + CPU, + ALL_LAYOUT, + phi::LogcumsumexpGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cumsum_kernel.h b/paddle/phi/kernels/cum_kernel.h similarity index 75% rename from paddle/phi/kernels/cumsum_kernel.h rename to paddle/phi/kernels/cum_kernel.h index f105c94d559d8..38cdbd7787baf 100644 --- a/paddle/phi/kernels/cumsum_kernel.h +++ b/paddle/phi/kernels/cum_kernel.h @@ -27,4 +27,13 @@ void CumsumKernel(const Context& dev_ctx, bool reverse, DenseTensor* out); +template +void LogcumsumexpKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* out); + } // namespace phi diff --git a/paddle/phi/kernels/dropout_grad_kernel.h b/paddle/phi/kernels/dropout_grad_kernel.h index ae3f82056632d..d8d5363ad59b7 100644 --- a/paddle/phi/kernels/dropout_grad_kernel.h +++ b/paddle/phi/kernels/dropout_grad_kernel.h @@ -28,4 +28,14 @@ void DropoutGradRawKernel(const Context& dev_ctx, const std::string& mode, DenseTensor* x_grad); +template +void DropoutNdGradKernel(const Context& dev_ctx, + const DenseTensor& mask, + const DenseTensor& out_grad, + float p, + bool is_test, + const std::string& mode, + const std::vector& axis, + DenseTensor* x_grad); + } // namespace phi diff --git a/paddle/phi/kernels/dropout_kernel.h b/paddle/phi/kernels/dropout_kernel.h index 6febcd78e1107..cba8160058e99 100644 --- a/paddle/phi/kernels/dropout_kernel.h +++ b/paddle/phi/kernels/dropout_kernel.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" @@ -31,4 +32,17 @@ void DropoutRawKernel(const Context& dev_ctx, DenseTensor* out, DenseTensor* mask); +template +void DropoutNdKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& seed_tensor, + float p, + bool is_test, + const std::string& mode, + int seed, + bool fix_seed, + const std::vector& axis, + DenseTensor* out, + DenseTensor* mask); + } // namespace phi diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index 1c9fbffa2ac19..22dba8297d65b 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -276,7 +276,10 @@ struct ConcatFunctor { int64_t out_row = in_row, out_col = 0; int inputs_col_num = in_num + 1; - paddle::memory::AllocationPtr data_alloc, col_alloc; + std::vector inputs_data_vec(in_num); + std::vector inputs_col_vec(inputs_col_num); + const T** inputs_data = inputs_data_vec.data(); + int64_t* inputs_col = inputs_col_vec.data(); // There are some differences between hip runtime and NV runtime. // In NV, when the pageable memory data less than 64K is transferred from @@ -286,22 +289,16 @@ struct ConcatFunctor { // 3.2.6.1. 
Concurrent Execution between Host and Device // Memory copies from host to device of a memory block of 64 KB or less #ifdef PADDLE_WITH_HIP + paddle::memory::AllocationPtr data_alloc, col_alloc; // TODO(chentianyu03): try to find a method to remove the Alloc function data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), in_num * sizeof(T*)); + inputs_data = reinterpret_cast(data_alloc->ptr()); // TODO(chentianyu03): try to find a method to remove the Alloc function col_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), inputs_col_num * sizeof(int)); -#else - // TODO(pinned): cuda-graph not support pinned memory, we just use the cpu - // allocator. - data_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(), - in_num * sizeof(T*)); - col_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(), - (inputs_col_num) * sizeof(int64_t)); + inputs_col = reinterpret_cast(col_alloc->ptr()); #endif - const T** inputs_data = reinterpret_cast(data_alloc->ptr()); - int64_t* inputs_col = reinterpret_cast(col_alloc->ptr()); inputs_col[0] = 0; bool has_same_shape = true; @@ -390,6 +387,7 @@ struct ConcatFunctor { output->data()); } +#ifdef PADDLE_WITH_HIP // Prevent the pinned memory value from being covered and release the memory // after the launch kernel of the stream is executed (reapply pinned memory // next time) @@ -403,6 +401,7 @@ struct ConcatFunctor { paddle::memory::allocation::Allocator::AllocationDeleter( col_alloc_released); }); +#endif } }; @@ -433,7 +432,10 @@ class SplitFunctor { bool has_same_shape = true; int outputs_cols_num = o_num + 1; - paddle::memory::AllocationPtr data_alloc, cols_alloc; + std::vector outputs_data_vec(o_num); + std::vector outputs_cols_vec(outputs_cols_num); + T** outputs_data = outputs_data_vec.data(); + int64_t* outputs_cols = outputs_cols_vec.data(); // There are some differences between hip runtime and NV runtime. // In NV, when the pageable memory data less than 64K is transferred from @@ -443,22 +445,16 @@ class SplitFunctor { // 3.2.6.1. Concurrent Execution between Host and Device // Memory copies from host to device of a memory block of 64 KB or less #ifdef PADDLE_WITH_HIP + paddle::memory::AllocationPtr data_alloc, cols_alloc; // TODO(chentianyu03): try to find a method to remove the Alloc function data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), o_num * sizeof(T*)); + outputs_data = reinterpret_cast(data_alloc->ptr()); // TODO(chentianyu03): try to find a method to remove the Alloc function cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), (outputs_cols_num) * sizeof(int64_t)); -#else - // TODO(pinned): cuda-graph not support pinned memory, we just use the cpu - // allocator. 
- data_alloc = - paddle::memory::Alloc(paddle::platform::CPUPlace(), o_num * sizeof(T*)); - cols_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(), - (outputs_cols_num) * sizeof(int64_t)); + outputs_cols = reinterpret_cast(cols_alloc->ptr()); #endif - T** outputs_data = reinterpret_cast(data_alloc->ptr()); - int64_t* outputs_cols = reinterpret_cast(cols_alloc->ptr()); outputs_cols[0] = 0; for (int i = 0; i < o_num; ++i) { @@ -552,6 +548,7 @@ class SplitFunctor { dev_out_gpu_data); } +#ifdef PADDLE_WITH_HIP // Prevent the pinned memory value from being covered and release the memory // after the launch kernel of the stream is executed (reapply pinned memory // next time) @@ -563,6 +560,7 @@ class SplitFunctor { paddle::memory::allocation::Allocator::AllocationDeleter( cols_alloc_released); }); +#endif } }; diff --git a/paddle/phi/kernels/funcs/strided_slice.h b/paddle/phi/kernels/funcs/strided_slice.h index 8eebfc7caa795..c39a9694e18e5 100644 --- a/paddle/phi/kernels/funcs/strided_slice.h +++ b/paddle/phi/kernels/funcs/strided_slice.h @@ -74,10 +74,14 @@ static void StridedSliceOutDims(const std::vector& starts, if (start_index < 0) { start_index = start_index + axis_size; + start_index = std::max(start_index, 0); } if (end_index < 0) { if (!(end_index == -1 && stride_index < 0)) { // skip None stop condition end_index = end_index + axis_size; + if (end_index < 0) { + end_index = 0; + } } } diff --git a/paddle/phi/kernels/gpu/cumsum_kernel.cu b/paddle/phi/kernels/gpu/cum_kernel.cu similarity index 79% rename from paddle/phi/kernels/gpu/cumsum_kernel.cu rename to paddle/phi/kernels/gpu/cum_kernel.cu index 460aa37f8f995..ad86fd9ba49df 100644 --- a/paddle/phi/kernels/gpu/cumsum_kernel.cu +++ b/paddle/phi/kernels/gpu/cum_kernel.cu @@ -17,7 +17,7 @@ #include #include -#include "paddle/phi/kernels/cumsum_kernel.h" +#include "paddle/phi/kernels/cum_kernel.h" #ifdef __NVCC__ #include #endif @@ -82,19 +82,20 @@ __global__ void MatrixRowReverse(const T* matrix_data, } } -template +template struct BlockPrefixCallbackOp { // Running prefix - T running_total; - // Constructor - __device__ BlockPrefixCallbackOp(T running_total) - : running_total(running_total) {} + T running_total_; + Op op_; + + __device__ BlockPrefixCallbackOp(T running_total, Op op) + : running_total_(running_total), op_(op) {} + // Callback operator to be entered by the first warp of threads in the block. - // Thread-0 is responsible for returning a value for seeding the block-wide - // scan. + // tid 0 is responsible for returning a value for seeding the block-wide scan. 
__device__ T operator()(T block_aggregate) { - T old_prefix = running_total; - running_total = old_prefix + block_aggregate; + T old_prefix = running_total_; + running_total_ = op_(old_prefix, block_aggregate); return old_prefix; } }; @@ -129,13 +130,36 @@ __global__ void MatrixTranspose(T* odata, } } -template +struct LogAddExp { + template + __host__ __device__ __forceinline__ T operator()(const T& a, + const T& b) const { + return std::log(1 + std::exp(std::min(a, b) - std::max(a, b))) + + std::max(a, b); + } +}; + +template +struct Identity; + +template +struct Identity { + static constexpr T value = 0; +}; + +template +struct Identity { + static constexpr T value = std::numeric_limits::lowest(); +}; + +template __global__ void BlockScanKernel(T* d_out, const T* d_in, int inner_size, int outer_size, int scan_size, - bool exclusive) { + bool exclusive, + Op op) { // Specialize BlockLoad, BlockStore, and BlockRadixSort collective types typedef cub:: BlockLoad @@ -154,7 +178,7 @@ __global__ void BlockScanKernel(T* d_out, int bx = blockIdx.x; int by = blockIdx.y; - BlockPrefixCallbackOp prefix_op(0); + BlockPrefixCallbackOp prefix_op(Identity::value, op); T block_aggregate = static_cast(0); // Obtain this block's segment of consecutive keys (blocked across threads) @@ -176,12 +200,11 @@ __global__ void BlockScanKernel(T* d_out, __syncthreads(); if (exclusive) { - T init_value = static_cast(0); BlockScanT(temp_storage.scan) - .ExclusiveScan(thread_keys, thread_keys, cub::Sum(), prefix_op); + .ExclusiveScan(thread_keys, thread_keys, op, prefix_op); } else { BlockScanT(temp_storage.scan) - .InclusiveScan(thread_keys, thread_keys, cub::Sum(), prefix_op); + .InclusiveScan(thread_keys, thread_keys, op, prefix_op); } __syncthreads(); @@ -190,14 +213,15 @@ __global__ void BlockScanKernel(T* d_out, } } -template -void CumsumKernel(const Context& dev_ctx, - const DenseTensor& x, - int axis, - bool flatten, - bool exclusive, - bool reverse, - DenseTensor* out) { +template +void ScanKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool flatten, + bool exclusive, + bool reverse, + Op op, + DenseTensor* out) { auto out_dims = out->dims(); auto size = x.numel(); @@ -219,7 +243,7 @@ void CumsumKernel(const Context& dev_ctx, // Use thrust for parallel acceleration when the input size is equal to the // length of the ‘axis’ dimension. 
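Both cumsum and logcumsumexp now flow through the same scan machinery; only the binary operator and its identity differ (cub::Sum with 0 versus LogAddExp with lowest()). A sequential reference sketch of that shared inclusive/exclusive scan over one row — the names Scan, op, and identity are assumptions for illustration:

#include <cstddef>
#include <vector>

// `op` must be associative and `identity` its neutral element
// (0 for sum, std::numeric_limits<T>::lowest() for log-add-exp).
template <typename T, typename Op>
std::vector<T> Scan(const std::vector<T>& in, Op op, T identity,
                    bool exclusive) {
  std::vector<T> out(in.size());
  T running = identity;
  for (std::size_t i = 0; i < in.size(); ++i) {
    if (exclusive) {
      out[i] = running;                // prefix *before* in[i]
      running = op(running, in[i]);
    } else {
      running = op(running, in[i]);
      out[i] = running;                // prefix *including* in[i]
    }
  }
  return out;
}

Calling Scan with std::plus<T>() and 0 reproduces cumsum on a 1-D input, while calling it with the stable log-add-exp combine and lowest() reproduces logcumsumexp; the CUDA path does the same per thread block, carrying the running prefix across tiles through BlockPrefixCallbackOp.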
- if (size == out_dims[axis]) { + if (std::is_same::value && size == out_dims[axis]) { #ifdef __HIPCC__ const auto& policy = thrust::hip::par.on(dev_ctx.stream()); #else @@ -247,6 +271,7 @@ void CumsumKernel(const Context& dev_ctx, return; } + size_t height = 1; size_t width = 1; for (size_t i = 0; i <= axis; i++) { @@ -299,17 +324,18 @@ void CumsumKernel(const Context& dev_ctx, } } if (!transpose && !reverse) { - BlockScanKernel<<>>( - out_data, in_data, outer_size, inner_size, scan_size, exclusive); + BlockScanKernel<<>>( + out_data, in_data, outer_size, inner_size, scan_size, exclusive, op); } else { - BlockScanKernel + BlockScanKernel <<>>(next_out_data, next_in_data, outer_size, inner_size, scan_size, - exclusive); + exclusive, + op); } swap_ptr(next_in_data, next_out_data); if (reverse) { @@ -325,6 +351,34 @@ void CumsumKernel(const Context& dev_ctx, } } +template +void CumsumKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* out) { + using Op = cub::Sum; + auto op = Op(); + ScanKernel( + dev_ctx, x, axis, flatten, exclusive, reverse, op, out); +} + +template +void LogcumsumexpKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* out) { + using Op = LogAddExp; + auto op = Op(); + ScanKernel( + dev_ctx, x, axis, flatten, exclusive, reverse, op, out); +} + } // namespace phi PD_REGISTER_KERNEL(cumsum, @@ -336,3 +390,10 @@ PD_REGISTER_KERNEL(cumsum, int16_t, int, int64_t) {} + +PD_REGISTER_KERNEL(logcumsumexp, + GPU, + ALL_LAYOUT, + phi::LogcumsumexpKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/dropout_grad_kernel.cu b/paddle/phi/kernels/gpu/dropout_grad_kernel.cu index b27029fe863fa..1eea13a5a226b 100644 --- a/paddle/phi/kernels/gpu/dropout_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/dropout_grad_kernel.cu @@ -27,10 +27,25 @@ void DropoutGradRawKernel(const Context& dev_ctx, bool is_test, const std::string& mode, DenseTensor* x_grad) { + bool upscale_in_train = (mode == "upscale_in_train"); x_grad->mutable_data(dev_ctx.GetPlace()); - auto size = x_grad->numel(); paddle::operators::DropoutGradGPUKernelDriver( - dev_ctx, mode, p, out_grad, mask, size, x_grad, is_test); + dev_ctx, is_test, p, upscale_in_train, out_grad, mask, x_grad, false); +} + +template +void DropoutNdGradKernel(const Context& dev_ctx, + const DenseTensor& mask, + const DenseTensor& out_grad, + float p, + bool is_test, + const std::string& mode, + const std::vector& axis, + DenseTensor* x_grad) { + bool upscale_in_train = (mode == "upscale_in_train"); + dev_ctx.template Alloc(x_grad); + paddle::operators::DropoutGradGPUKernelDriver( + dev_ctx, is_test, p, upscale_in_train, out_grad, mask, x_grad, true); } } // namespace phi @@ -43,3 +58,12 @@ PD_REGISTER_KERNEL(dropout_grad, double, phi::dtype::bfloat16, phi::dtype::float16) {} + +PD_REGISTER_KERNEL(dropout_nd_grad, + GPU, + ALL_LAYOUT, + phi::DropoutNdGradKernel, + float, + double, + phi::dtype::bfloat16, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/dropout_kernel.cu b/paddle/phi/kernels/gpu/dropout_kernel.cu index 8ae3dd25cc8f6..3811440be7511 100644 --- a/paddle/phi/kernels/gpu/dropout_kernel.cu +++ b/paddle/phi/kernels/gpu/dropout_kernel.cu @@ -30,22 +30,48 @@ void DropoutRawKernel(const Context& dev_ctx, bool fix_seed, DenseTensor* out, DenseTensor* mask) { - out->mutable_data(dev_ctx.GetPlace()); - float dropout_prob = p; bool upscale_in_train = (mode == 
"upscale_in_train"); + out->mutable_data(dev_ctx.GetPlace()); mask->mutable_data(dev_ctx.GetPlace()); + paddle::operators::DropoutFwGPUKernelDriver(dev_ctx, + is_test, + p, + upscale_in_train, + fix_seed, + seed, + x, + seed_tensor.get_ptr(), + mask, + out, + false); +} +template +void DropoutNdKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& seed_tensor, + float p, + bool is_test, + const std::string& mode, + int seed, + bool fix_seed, + const std::vector& axis, + DenseTensor* out, + DenseTensor* mask) { + bool upscale_in_train = (mode == "upscale_in_train"); + dev_ctx.template Alloc(out); + dev_ctx.template Alloc(mask); paddle::operators::DropoutFwGPUKernelDriver(dev_ctx, is_test, - mode, - dropout_prob, + p, upscale_in_train, fix_seed, seed, x, seed_tensor.get_ptr(), mask, - out); + out, + true); } } // namespace phi @@ -58,3 +84,12 @@ PD_REGISTER_KERNEL(dropout, double, phi::dtype::bfloat16, phi::dtype::float16) {} + +PD_REGISTER_KERNEL(dropout_nd, + GPU, + ALL_LAYOUT, + phi::DropoutNdKernel, + float, + double, + phi::dtype::bfloat16, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/logcumsumexp_grad_kernel.cu b/paddle/phi/kernels/gpu/logcumsumexp_grad_kernel.cu new file mode 100644 index 0000000000000..43744210e32b7 --- /dev/null +++ b/paddle/phi/kernels/gpu/logcumsumexp_grad_kernel.cu @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/logcumsumexp_grad_impl.h" +#include "paddle/phi/kernels/logcumsumexp_grad_kernel.h" + +PD_REGISTER_KERNEL(logcumsumexp_grad, + GPU, + ALL_LAYOUT, + phi::LogcumsumexpGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu index e5552f28f88e3..13d85ed1d36bd 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu @@ -663,13 +663,13 @@ void ConvCudnnGradGradKernel( } template -void DepthwiseConvCudnnGradGradKernel( +void DepthwiseConvDoubleGradGPUDNNKernel( const Context& ctx, - const paddle::optional& input_grad_grad, - const paddle::optional& filter_grad_grad, - const DenseTensor& out_grad, const DenseTensor& input, const DenseTensor& filter, + const DenseTensor& out_grad, + const paddle::optional& input_grad_grad, + const paddle::optional& filter_grad_grad, const std::vector& strides, const std::vector& paddings_t, const std::string& padding_algorithm, @@ -680,9 +680,9 @@ void DepthwiseConvCudnnGradGradKernel( int workspace_size_MB, bool exhaustive_search_t, bool fuse_relu, - DenseTensor* out_grad_grad, DenseTensor* input_grad, - DenseTensor* filter_grad) { + DenseTensor* filter_grad, + DenseTensor* out_grad_grad) { ConvCudnnGradGradKernel(ctx, input, filter, @@ -763,7 +763,7 @@ PD_REGISTER_KERNEL(conv3d_grad_grad, PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, GPU, ALL_LAYOUT, - phi::DepthwiseConvCudnnGradGradKernel, + phi::DepthwiseConvDoubleGradGPUDNNKernel, float, phi::dtype::float16) {} #else @@ -789,7 +789,7 @@ PD_REGISTER_KERNEL(conv3d_grad_grad, PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, GPU, ALL_LAYOUT, - phi::DepthwiseConvCudnnGradGradKernel, + phi::DepthwiseConvDoubleGradGPUDNNKernel, float, double, phi::dtype::float16, @@ -816,7 +816,7 @@ PD_REGISTER_KERNEL(conv3d_grad_grad, PD_REGISTER_KERNEL(depthwise_conv2d_grad_grad, GPU, ALL_LAYOUT, - phi::DepthwiseConvCudnnGradGradKernel, + phi::DepthwiseConvDoubleGradGPUDNNKernel, float, double, phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/logcumsumexp_grad_impl.h b/paddle/phi/kernels/impl/logcumsumexp_grad_impl.h new file mode 100644 index 0000000000000..602f2248902cc --- /dev/null +++ b/paddle/phi/kernels/impl/logcumsumexp_grad_impl.h @@ -0,0 +1,86 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cum_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +struct LogGradPositiveFunctor { + HOSTDEVICE T operator()(const T& x) const { + const T kMin = std::numeric_limits::lowest(); + return x > 0 ? 
std::log(x) : kMin; + } +}; + +template +struct LogGradNegativeFunctor { + HOSTDEVICE T operator()(const T& x) const { + const T kMin = std::numeric_limits::lowest(); + return x < 0 ? std::log(-x) : kMin; + } +}; + +template +void LogcumsumexpGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& d_out, + int axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* d_x) { + reverse = !reverse; + dev_ctx.template Alloc(d_x); + + auto eigen_x = EigenVector::Flatten(x); + auto eigen_out = EigenVector::Flatten(out); + auto eigen_d_out = EigenVector::Flatten(d_out); + auto& place = *dev_ctx.eigen_device(); + + DenseTensor output_pos; + output_pos.Resize(d_out.dims()); + dev_ctx.template Alloc(&output_pos); + auto eigen_output_pos = EigenVector::Flatten(output_pos); + DenseTensor output_neg; + output_neg.Resize(d_out.dims()); + dev_ctx.template Alloc(&output_neg); + auto eigen_output_neg = EigenVector::Flatten(output_neg); + DenseTensor tmp; + tmp.Resize(d_out.dims()); + dev_ctx.template Alloc(&tmp); + auto eigen_tmp = EigenVector::Flatten(tmp); + + eigen_tmp.device(place) = + eigen_d_out.unaryExpr(LogGradPositiveFunctor()) - eigen_out; + LogcumsumexpKernel( + dev_ctx, tmp, axis, flatten, exclusive, reverse, &output_pos); + eigen_output_pos.device(place) = (eigen_output_pos + eigen_x).exp(); + + eigen_tmp.device(place) = + eigen_d_out.unaryExpr(LogGradNegativeFunctor()) - eigen_out; + LogcumsumexpKernel( + dev_ctx, tmp, axis, flatten, exclusive, reverse, &output_neg); + eigen_output_neg.device(place) = (eigen_output_neg + eigen_x).exp(); + + auto eigen_d_x = EigenVector::Flatten(*d_x); + eigen_d_x.device(place) = eigen_output_pos - eigen_output_neg; +} +} // namespace phi diff --git a/paddle/phi/kernels/logcumsumexp_grad_kernel.h b/paddle/phi/kernels/logcumsumexp_grad_kernel.h new file mode 100644 index 0000000000000..e78a79550657e --- /dev/null +++ b/paddle/phi/kernels/logcumsumexp_grad_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LogcumsumexpGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& d_out, + int axis, + bool flatten, + bool exclusive, + bool reverse, + DenseTensor* d_x); +} diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc index 28b1b3368ed42..57bc85069a6eb 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc @@ -17,6 +17,7 @@ limitations under the License. 
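The gradient implementation above works in log space for numerical stability: the incoming gradient is split into its positive and negative parts, each part is pushed through a logcumsumexp with the cumulation direction flipped (reverse = !reverse), and the two contributions are exponentiated and subtracted. As a sketch of the identity being used, shown for the forward inclusive case with y = logcumsumexp(x) along the scanned axis:

\[
\frac{\partial L}{\partial x_j}
  = \sum_{i \ge j} \frac{\partial L}{\partial y_i}\, e^{x_j - y_i}
  = \exp\!\Big(x_j + \operatorname{logcumsumexp}^{\mathrm{rev}}_j\big(\log g^{+} - y\big)\Big)
  - \exp\!\Big(x_j + \operatorname{logcumsumexp}^{\mathrm{rev}}_j\big(\log g^{-} - y\big)\Big),
\]

where g+ = max(dL/dy, 0), g- = max(-dL/dy, 0), the logarithm of a zero entry is replaced by lowest() as a finite stand-in for minus infinity (the LogGradPositiveFunctor / LogGradNegativeFunctor above), and "rev" denotes the reversed cumulation.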
*/ #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/sparse/common_shape.h" namespace phi { @@ -68,20 +69,23 @@ void DenseToSparseCooKernel(const Context& dev_ctx, SparseCooTensor* out) { const T* x_data = x.data(); const auto& x_dims = x.dims(); + PADDLE_ENFORCE_LE(sparse_dim, + x_dims.size(), + phi::errors::InvalidArgument( + "sparse_dim must be less than the size of x.dims()")); + PADDLE_ENFORCE_GT( + sparse_dim, 0, phi::errors::InvalidArgument("sparse_dim must be >0")); int64_t non_zero_num = GetNonZeroNum(x, sparse_dim); - const auto place = dev_ctx.GetPlace(); const auto values_dims = phi::funcs::sparse::InferDenseDims(x_dims, sparse_dim, non_zero_num); - DenseTensorMeta indices_meta(DataType::INT64, - {sparse_dim, static_cast(non_zero_num)}, - DataLayout::NCHW); DenseTensorMeta values_meta(x.meta().dtype, values_dims, x.meta().layout); - phi::DenseTensor indices = phi::Empty(dev_ctx, std::move(indices_meta)); + phi::DenseTensor indices = + phi::Empty(dev_ctx, {sparse_dim, non_zero_num}); phi::DenseTensor values = phi::Empty(dev_ctx, std::move(values_meta)); - int64_t* indices_data = indices.mutable_data(place); - T* values_data = values.mutable_data(place); + int64_t* indices_data = indices.data(); + T* values_data = values.data(); auto dims_2d = flatten_to_2d(x_dims, sparse_dim); const int rows = dims_2d[0]; @@ -102,36 +106,32 @@ void DenseToSparseCooKernel(const Context& dev_ctx, out->SetMember(indices, values, x_dims, true); } -template -void SparseCsrToCooKernel(const Context& dev_ctx, - const SparseCsrTensor& x, - SparseCooTensor* out) { +template +void SparseCsrToCooCPUKernel(const CPUContext& dev_ctx, + const SparseCsrTensor& x, + SparseCooTensor* out) { const DDim& x_dims = x.dims(); const int64_t non_zero_num = x.non_zero_cols().numel(); const auto& csr_crows = x.non_zero_crows(); const auto& csr_cols = x.non_zero_cols(); const auto& csr_values = x.non_zero_elements(); - const int64_t* csr_crows_data = csr_crows.data(); - const int64_t* csr_cols_data = csr_cols.data(); + const IntT* csr_crows_data = csr_crows.data(); + const IntT* csr_cols_data = csr_cols.data(); const T* csr_values_data = csr_values.data(); int64_t sparse_dim = 2; if (x_dims.size() == 3) { sparse_dim = 3; } - const auto place = dev_ctx.GetPlace(); - DenseTensorMeta indices_meta( - DataType::INT64, {sparse_dim, non_zero_num}, DataLayout::NCHW); - DenseTensorMeta values_meta( - x.dtype(), {non_zero_num}, x.non_zero_elements().layout()); - phi::DenseTensor indices = phi::Empty(dev_ctx, std::move(indices_meta)); - phi::DenseTensor values = phi::Empty(dev_ctx, std::move(values_meta)); - int64_t* coo_indices = indices.mutable_data(place); - int64_t* batch_ptr = x_dims.size() == 2 ? nullptr : coo_indices; - int64_t* coo_rows_data = + phi::DenseTensor indices = + phi::Empty(dev_ctx, {sparse_dim, non_zero_num}); + phi::DenseTensor values = phi::Empty(dev_ctx, {non_zero_num}); + IntT* coo_indices = indices.data(); + IntT* batch_ptr = x_dims.size() == 2 ? nullptr : coo_indices; + IntT* coo_rows_data = x_dims.size() == 2 ? coo_indices : batch_ptr + non_zero_num; - int64_t* coo_cols_data = coo_rows_data + non_zero_num; - T* coo_values_data = values.mutable_data(place); + IntT* coo_cols_data = coo_rows_data + non_zero_num; + T* coo_values_data = values.data(); int batch = x_dims.size() == 2 ? 1 : x_dims[0]; int rows = x_dims.size() == 2 ? 
x_dims[0] : x_dims[1]; @@ -139,7 +139,7 @@ void SparseCsrToCooKernel(const Context& dev_ctx, int index = 0; for (int b = 0; b < batch; b++) { for (int i = 0; i < rows; i++) { - for (int j = csr_crows_data[b * (rows + 1) + i]; + for (IntT j = csr_crows_data[b * (rows + 1) + i]; j < csr_crows_data[b * (rows + 1) + i + 1]; j++) { coo_rows_data[index] = i; @@ -151,15 +151,25 @@ void SparseCsrToCooKernel(const Context& dev_ctx, } } - memcpy(coo_cols_data, csr_cols_data, sizeof(int64_t) * non_zero_num); + memcpy(coo_cols_data, csr_cols_data, sizeof(IntT) * non_zero_num); memcpy(coo_values_data, csr_values_data, sizeof(T) * non_zero_num); out->SetMember(indices, values, x_dims, true); } template -void SparseCooToCsrKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCsrTensor* out) { +void SparseCsrToCooKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + SparseCooTensor* out) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_crows().dtype(), "SparseCsrToCooCPUKernel", ([&] { + SparseCsrToCooCPUKernel(dev_ctx, x, out); + })); +} + +template +void SparseCooToCsrCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + SparseCsrTensor* out) { const auto& x_dims = x.dims(); bool valid = x_dims.size() == 2 || x_dims.size() == 3; PADDLE_ENFORCE_EQ(valid, @@ -174,11 +184,11 @@ void SparseCooToCsrKernel(const Context& dev_ctx, phi::DenseTensor non_zero_crows; non_zero_crows.Resize({batchs * (rows + 1)}); - int64_t* csr_crows_data = dev_ctx.template Alloc(&non_zero_crows); + IntT* csr_crows_data = dev_ctx.template Alloc(&non_zero_crows); phi::DenseTensor non_zero_cols; non_zero_cols.Resize({non_zero_num}); - int64_t* csr_cols_data = dev_ctx.template Alloc(&non_zero_cols); + IntT* csr_cols_data = dev_ctx.template Alloc(&non_zero_cols); phi::DenseTensor non_zero_elements; non_zero_elements.Resize({non_zero_num}); @@ -186,16 +196,12 @@ void SparseCooToCsrKernel(const Context& dev_ctx, const auto& coo_indices = x.non_zero_indices(); const auto& coo_values = x.non_zero_elements(); - const int64_t* batchs_ptr = coo_indices.data(); - const int64_t* coo_rows_data = + const IntT* batchs_ptr = coo_indices.data(); + const IntT* coo_rows_data = batchs == 1 ? 
batchs_ptr : batchs_ptr + non_zero_num; - const int64_t* coo_cols_data = coo_rows_data + non_zero_num; + const IntT* coo_cols_data = coo_rows_data + non_zero_num; const T* coo_values_data = coo_values.data(); - if (!x.coalesced()) { - // TODO(zhangkahuo): call coalesced() to distinct and sort the indices - } - std::vector offsets(batchs, 0); if (batchs > 1) { for (int i = 0; i < non_zero_num; i++) { @@ -220,25 +226,34 @@ void SparseCooToCsrKernel(const Context& dev_ctx, csr_crows_data[b * (rows + 1) + i] = 0; } for (int64_t i = 1; i < batch_non_zero_num; i++) { - for (int j = coo_rows_ptr[i - 1]; j < coo_rows_ptr[i]; j++) { + for (IntT j = coo_rows_ptr[i - 1]; j < coo_rows_ptr[i]; j++) { csr_crows_data[b * (rows + 1) + j + 1] = i; } } - for (int64_t i = coo_rows_ptr[batch_non_zero_num - 1] + 1; i < rows + 1; - i++) { + for (IntT i = coo_rows_ptr[batch_non_zero_num - 1] + 1; i < rows + 1; i++) { csr_crows_data[b * (rows + 1) + i] = batch_non_zero_num; } } - memcpy(csr_cols_data, coo_cols_data, sizeof(int64_t) * non_zero_num); + memcpy(csr_cols_data, coo_cols_data, sizeof(IntT) * non_zero_num); memcpy(csr_values_data, coo_values_data, sizeof(T) * non_zero_num); out->SetMember(non_zero_crows, non_zero_cols, non_zero_elements, x_dims); } template -void SparseCooToDenseKernel(const Context& dev_ctx, - const SparseCooTensor& x, - DenseTensor* out) { +void SparseCooToCsrKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCsrTensor* out) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "SparseCooToCsrCPUKernel", ([&] { + SparseCooToCsrCPUKernel(dev_ctx, x, out); + })); +} + +template +void SparseCooToDenseCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + DenseTensor* out) { const auto non_zero_num = x.nnz(); const auto dense_dims = x.dims(); const auto indices = x.non_zero_indices(); @@ -270,8 +285,7 @@ void SparseCooToDenseKernel(const Context& dev_ctx, for (auto i = 0; i < non_zero_num; i++) { int64_t index = 0; for (int j = 0; j < sparse_dim; j++) { - index += - indices.data()[j * non_zero_num + i] * sparse_offsets[j]; + index += indices.data()[j * non_zero_num + i] * sparse_offsets[j]; } for (int j = 0; j < base_offset; j++) { @@ -280,6 +294,16 @@ void SparseCooToDenseKernel(const Context& dev_ctx, } } +template +void SparseCooToDenseKernel(const Context& dev_ctx, + const SparseCooTensor& x, + DenseTensor* out) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "SparseCooToDenseCPUKernel", ([&] { + SparseCooToDenseCPUKernel(dev_ctx, x, out); + })); +} + } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index 38553d1fe1d7a..94022d6392eea 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -15,11 +15,12 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/platform/enforce.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/sparse/common_shape.h" #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" @@ -96,39 +97,33 @@ void DenseToSparseCooKernel(const Context& dev_ctx, SparseCooTensor* out) { const T* x_data = x.data(); const auto& x_dims = x.dims(); + PADDLE_ENFORCE_LE(sparse_dim, + x_dims.size(), + phi::errors::InvalidArgument( + "sparse_dim must be less than the size of x.dims()")); + PADDLE_ENFORCE_GT( + sparse_dim, 0, phi::errors::InvalidArgument("sparse_dim must be >0")); auto dims_2d = flatten_to_2d(x_dims, sparse_dim); const int rows = dims_2d[0]; const int cols = dims_2d[1]; - auto nums_meta = - phi::DenseTensorMeta(DataType::INT32, {1}, phi::DataLayout::NCHW); - DenseTensor nums = phi::Empty(dev_ctx, std::move(nums_meta)); - auto x_dims_meta = phi::DenseTensorMeta(DataType::INT64, - {static_cast(x_dims.size())}, - phi::DataLayout::NCHW); - DenseTensor d_x_dims = phi::Empty(dev_ctx, std::move(x_dims_meta)); - - const auto place = dev_ctx.GetPlace(); + DenseTensor nums = phi::Empty(dev_ctx, {1}); + DenseTensor d_x_dims = phi::Empty(dev_ctx, {x_dims.size()}); // 1. get numbers of non zero elements, and get the index of non zero elements - int* nums_ptr = nums.mutable_data(place); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemsetAsync(nums_ptr, 0, sizeof(int), dev_ctx.stream())); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemsetAsync(nums_ptr, 0, sizeof(int), dev_ctx.stream())); -#endif + int* nums_ptr = nums.data(); + phi::backends::gpu::GpuMemsetAsync( + nums_ptr, 0, sizeof(int), dev_ctx.stream()); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); - auto temp_indexs_meta = - phi::DenseTensorMeta(DataType::INT32, {rows}, phi::DataLayout::NCHW); - DenseTensor temp_indexs = phi::Empty(dev_ctx, std::move(temp_indexs_meta)); - int* temp_indexs_ptr = temp_indexs.mutable_data(place); + DenseTensor temp_indexs = phi::Empty(dev_ctx, {rows}); + int* temp_indexs_ptr = temp_indexs.data(); + GetNonZeroNums<<>>( x_data, rows, cols, nums_ptr, temp_indexs_ptr); + #ifdef PADDLE_WITH_HIP thrust::remove(thrust::hip::par.on(dev_ctx.stream()), #else @@ -140,35 +135,16 @@ void DenseToSparseCooKernel(const Context& dev_ctx, // 2. 
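The CPU conversion earlier and the GPU kernels that follow reduce to the same index transformation between the CSR compressed row pointer (crows) and explicit COO row indices. A host-side reference sketch for a single batch — the function names and the use of std::vector are assumptions:

#include <vector>

// CSR -> COO: expand the compressed row pointer into one row index per nonzero.
template <typename IntT>
std::vector<IntT> CrowsToRows(const std::vector<IntT>& crows) {
  std::vector<IntT> rows;
  const IntT num_rows = static_cast<IntT>(crows.size()) - 1;
  for (IntT i = 0; i < num_rows; ++i) {
    for (IntT j = crows[i]; j < crows[i + 1]; ++j) {
      rows.push_back(i);
    }
  }
  return rows;
}

// COO -> CSR: count nonzeros per row, then prefix-sum the counts into the row
// pointer. Assumes the COO row indices are sorted, as a coalesced tensor
// guarantees.
template <typename IntT>
std::vector<IntT> RowsToCrows(const std::vector<IntT>& rows, IntT num_rows) {
  std::vector<IntT> crows(num_rows + 1, 0);
  for (IntT r : rows) {
    crows[r + 1] += 1;
  }
  for (IntT i = 0; i < num_rows; ++i) {
    crows[i + 1] += crows[i];
  }
  return crows;
}

The templated IntT mirrors the PD_VISIT_INTEGRAL_TYPES dispatch introduced in this patch, which lets one kernel body serve both int32 and int64 index tensors.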
copy non_zero_num to host, copy x_dims to device int non_zero_num = 0; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(&non_zero_num, - nums_ptr, - sizeof(int), - hipMemcpyDeviceToHost, - dev_ctx.stream())); -#else - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&non_zero_num, - nums_ptr, - sizeof(int), - cudaMemcpyDeviceToHost, - dev_ctx.stream())); -#endif - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemcpyAsync(d_x_dims.mutable_data(place), - x_dims.Get(), - x_dims.size() * sizeof(x_dims[0]), - hipMemcpyHostToDevice, - dev_ctx.stream())); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpyAsync(d_x_dims.mutable_data(place), - x_dims.Get(), - x_dims.size() * sizeof(x_dims[0]), - cudaMemcpyHostToDevice, - dev_ctx.stream())); -#endif + phi::backends::gpu::GpuMemcpyAsync(&non_zero_num, + nums_ptr, + sizeof(int), + gpuMemcpyDeviceToHost, + dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync(d_x_dims.data(), + x_dims.Get(), + x_dims.size() * sizeof(x_dims[0]), + gpuMemcpyHostToDevice, + dev_ctx.stream()); dev_ctx.Wait(); // wait the copy @@ -197,20 +173,22 @@ void DenseToSparseCooKernel(const Context& dev_ctx, out->SetMember(indices, values, x_dims, true); } -__global__ void GetBatchSizes(const int64_t* crows, +template +__global__ void GetBatchSizes(const IntT* crows, const int rows, const int batchs, - int* batch_sizes) { + IntT* batch_sizes) { const int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < batchs) { batch_sizes[tid] = crows[tid * (rows + 1) + rows]; } } -__global__ void ConvertCsrCrowsToCooRows(const int64_t* crows_ptr, - const int* crows_offsets, - int64_t* rows_ptr, - int64_t* batch_ptr, +template +__global__ void ConvertCsrCrowsToCooRows(const IntT* crows_ptr, + const IntT* crows_offsets, + IntT* rows_ptr, + IntT* batch_ptr, const int rows) { const int b = blockIdx.y; const int64_t offset = crows_offsets ? crows_offsets[b] : 0; @@ -227,17 +205,17 @@ __global__ void ConvertCsrCrowsToCooRows(const int64_t* crows_ptr, } } -template -void SparseCsrToCooKernel(const Context& dev_ctx, - const SparseCsrTensor& x, - SparseCooTensor* out) { +template +void SparseCsrToCooGPUKernel(const GPUContext& dev_ctx, + const SparseCsrTensor& x, + SparseCooTensor* out) { const DDim& x_dims = x.dims(); const int64_t non_zero_num = x.non_zero_cols().numel(); const auto& csr_crows = x.non_zero_crows(); const auto& csr_cols = x.non_zero_cols(); const auto& csr_values = x.non_zero_elements(); - const int64_t* csr_crows_data = csr_crows.data(); - const int64_t* csr_cols_data = csr_cols.data(); + const IntT* csr_crows_data = csr_crows.data(); + const IntT* csr_cols_data = csr_cols.data(); const T* csr_values_data = csr_values.data(); int64_t sparse_dim = 2; @@ -247,26 +225,20 @@ void SparseCsrToCooKernel(const Context& dev_ctx, int batchs = x_dims.size() == 2 ? 1 : x_dims[0]; int rows = x_dims.size() == 2 ? x_dims[0] : x_dims[1]; - const auto place = dev_ctx.GetPlace(); - DenseTensorMeta indices_meta( - DataType::INT64, {sparse_dim, non_zero_num}, DataLayout::NCHW); - DenseTensorMeta values_meta( - x.dtype(), {non_zero_num}, x.non_zero_elements().layout()); - DenseTensorMeta offsets_meta(DataType::INT32, {batchs}, DataLayout::NCHW); - DenseTensor indices = phi::Empty(dev_ctx, std::move(indices_meta)); - DenseTensor values = phi::Empty(dev_ctx, std::move(values_meta)); - DenseTensor offsets = phi::Empty(dev_ctx, std::move(offsets_meta)); - int64_t* coo_indices = indices.mutable_data(place); - int64_t* batch_ptr = x_dims.size() == 2 ? 
nullptr : coo_indices; - int64_t* coo_rows_data = + DenseTensor indices = phi::Empty(dev_ctx, {sparse_dim, non_zero_num}); + DenseTensor values = phi::EmptyLike(dev_ctx, csr_values); + DenseTensor offsets = phi::Empty(dev_ctx, {batchs}); + IntT* coo_indices = indices.data(); + IntT* batch_ptr = x_dims.size() == 2 ? nullptr : coo_indices; + IntT* coo_rows_data = x_dims.size() == 2 ? coo_indices : batch_ptr + non_zero_num; - int64_t* coo_cols_data = coo_rows_data + non_zero_num; - int* offsets_ptr = batchs == 1 ? nullptr : offsets.mutable_data(place); - T* coo_values_data = values.mutable_data(place); + IntT* coo_cols_data = coo_rows_data + non_zero_num; + IntT* offsets_ptr = batchs == 1 ? nullptr : offsets.data(); + T* coo_values_data = values.data(); if (batchs > 1) { auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batchs, 1); - GetBatchSizes<<>>( + GetBatchSizes<<>>( csr_crows_data, rows, batchs, offsets_ptr); #ifdef PADDLE_WITH_HIP @@ -281,40 +253,38 @@ void SparseCsrToCooKernel(const Context& dev_ctx, auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); config.block_per_grid.y = batchs; - ConvertCsrCrowsToCooRows<<>>( - csr_crows_data, offsets_ptr, coo_rows_data, batch_ptr, rows); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(coo_cols_data, - csr_cols_data, - sizeof(int64_t) * non_zero_num, - hipMemcpyDeviceToDevice, - dev_ctx.stream())); - PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(coo_values_data, - csr_values_data, - sizeof(T) * non_zero_num, - hipMemcpyDeviceToDevice, - dev_ctx.stream())); -#else - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(coo_cols_data, - csr_cols_data, - sizeof(int64_t) * non_zero_num, - cudaMemcpyDeviceToDevice, - dev_ctx.stream())); - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(coo_values_data, - csr_values_data, - sizeof(T) * non_zero_num, - cudaMemcpyDeviceToDevice, - dev_ctx.stream())); -#endif + ConvertCsrCrowsToCooRows + <<>>( + csr_crows_data, offsets_ptr, coo_rows_data, batch_ptr, rows); + + phi::backends::gpu::GpuMemcpyAsync(coo_cols_data, + csr_cols_data, + sizeof(IntT) * non_zero_num, + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync(coo_values_data, + csr_values_data, + sizeof(T) * non_zero_num, + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); out->SetMember(indices, values, x_dims, true); } -__global__ void GetBatchsOffset(const int64_t* batchs_ptr, +template +void SparseCsrToCooKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + SparseCooTensor* out) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_crows().dtype(), "SparseCsrToCooGPUKernel", ([&] { + SparseCsrToCooGPUKernel(dev_ctx, x, out); + })); +} + +template +__global__ void GetBatchsOffset(const IntT* batchs_ptr, const int non_zero_num, - int64_t* batchs_offset) { + IntT* batchs_offset) { int tid = threadIdx.x + blockIdx.x * blockDim.x; for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { if (i == non_zero_num - 1 || batchs_ptr[i] != batchs_ptr[i + 1]) { @@ -323,35 +293,36 @@ __global__ void GetBatchsOffset(const int64_t* batchs_ptr, } } +template __global__ void ConvertCooRowsToCsrCrows( - const int64_t* batchs_offset, // can be null if batchs = 1 - const int64_t* coo_rows_data, - int64_t* csr_crows_data, + const IntT* batchs_offset, // can be null if batchs = 1 + const IntT* coo_rows_data, + IntT* csr_crows_data, const int rows, const int64_t non_zero_num) { const int b = blockIdx.y; int batch_non_zero_num = batchs_offset == nullptr ? 
non_zero_num : batchs_offset[b]; if (batch_non_zero_num == 0) return; - int batch_start = 0; + IntT batch_start = 0; if (b > 0) { batch_start = batchs_offset[b - 1]; batch_non_zero_num -= batch_start; } - auto* coo_rows_ptr = coo_rows_data + batch_start; + const IntT* coo_rows_ptr = coo_rows_data + batch_start; const int tid = threadIdx.x + blockIdx.x * blockDim.x; for (int i = tid; i < batch_non_zero_num; i += gridDim.x * blockDim.x) { if (i == 0) { - for (int j = 0; j <= coo_rows_ptr[0]; j++) { + for (IntT j = 0; j <= coo_rows_ptr[0]; j++) { csr_crows_data[b * (rows + 1) + j] = 0; } } else { - for (int j = coo_rows_ptr[i - 1]; j < coo_rows_ptr[i]; j++) { + for (IntT j = coo_rows_ptr[i - 1]; j < coo_rows_ptr[i]; j++) { csr_crows_data[b * (rows + 1) + j + 1] = i; } } if (i == batch_non_zero_num - 1) { - for (int64_t i = coo_rows_ptr[batch_non_zero_num - 1] + 1; i < rows + 1; + for (IntT i = coo_rows_ptr[batch_non_zero_num - 1] + 1; i < rows + 1; i++) { csr_crows_data[b * (rows + 1) + i] = batch_non_zero_num; } @@ -359,10 +330,10 @@ __global__ void ConvertCooRowsToCsrCrows( } } -template -void SparseCooToCsrKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCsrTensor* out) { +template +void SparseCooToCsrGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + SparseCsrTensor* out) { const auto& x_dims = x.dims(); bool valid = x_dims.size() == 2 || x_dims.size() == 3; PADDLE_ENFORCE_EQ(valid, @@ -376,78 +347,71 @@ void SparseCooToCsrKernel(const Context& dev_ctx, int rows = x_dims.size() == 2 ? x_dims[0] : x_dims[1]; phi::DenseTensor non_zero_crows = - phi::Empty(dev_ctx, {batchs * (rows + 1)}); - phi::DenseTensor non_zero_cols = phi::Empty(dev_ctx, {non_zero_num}); - phi::DenseTensor non_zero_elements = phi::Empty(dev_ctx, {non_zero_num}); - int64_t* csr_crows_data = non_zero_crows.data(); - int64_t* csr_cols_data = non_zero_cols.data(); + phi::Empty(dev_ctx, {batchs * (rows + 1)}); + phi::DenseTensor non_zero_cols = phi::Empty(dev_ctx, {non_zero_num}); + phi::DenseTensor non_zero_elements = + phi::EmptyLike(dev_ctx, x.non_zero_elements()); + IntT* csr_crows_data = non_zero_crows.data(); + IntT* csr_cols_data = non_zero_cols.data(); T* csr_values_data = non_zero_elements.data(); const auto& coo_indices = x.non_zero_indices(); const auto& coo_values = x.non_zero_elements(); - const int64_t* batchs_ptr = coo_indices.data(); - const int64_t* coo_rows_data = + const IntT* batchs_ptr = coo_indices.data(); + const IntT* coo_rows_data = batchs == 1 ? 
batchs_ptr : batchs_ptr + non_zero_num; - const int64_t* coo_cols_data = coo_rows_data + non_zero_num; + const IntT* coo_cols_data = coo_rows_data + non_zero_num; const T* coo_values_data = coo_values.data(); - if (!x.coalesced()) { - // TODO(zhangkahuo): call coalesced() to distinct and sort the indices - } - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batchs, 1); if (batchs > 1) { - DenseTensorMeta batchs_meta(DataType::INT64, {batchs}, DataLayout::NCHW); - phi::DenseTensor batchs_offset = phi::Empty(dev_ctx, {batchs}); - int64_t* batchs_offset_ptr = batchs_offset.data(); - GetBatchsOffset<<>>( - batchs_ptr, non_zero_num, batchs_offset_ptr); + phi::DenseTensor batchs_offset = phi::Empty(dev_ctx, {batchs}); + IntT* batchs_offset_ptr = batchs_offset.data(); + GetBatchsOffset + <<>>(batchs_ptr, non_zero_num, batchs_offset_ptr); config.block_per_grid.y = batchs; - ConvertCooRowsToCsrCrows<<>>( + ConvertCooRowsToCsrCrows<<>>( batchs_offset_ptr, coo_rows_data, csr_crows_data, rows, non_zero_num); } else { - ConvertCooRowsToCsrCrows<<>>( + ConvertCooRowsToCsrCrows<<>>( nullptr, coo_rows_data, csr_crows_data, rows, non_zero_num); } -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(csr_cols_data, - coo_cols_data, - sizeof(int64_t) * non_zero_num, - hipMemcpyDeviceToDevice, - dev_ctx.stream())); - PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(csr_values_data, - coo_values_data, - sizeof(T) * non_zero_num, - hipMemcpyDeviceToDevice, - dev_ctx.stream())); -#else - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(csr_cols_data, - coo_cols_data, - sizeof(int64_t) * non_zero_num, - cudaMemcpyDeviceToDevice, - dev_ctx.stream())); - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(csr_values_data, - coo_values_data, - sizeof(T) * non_zero_num, - cudaMemcpyDeviceToDevice, - dev_ctx.stream())); -#endif + phi::backends::gpu::GpuMemcpyAsync(csr_cols_data, + coo_cols_data, + sizeof(IntT) * non_zero_num, + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync(csr_values_data, + coo_values_data, + sizeof(T) * non_zero_num, + gpuMemcpyDeviceToDevice, + dev_ctx.stream()); out->SetMember(non_zero_crows, non_zero_cols, non_zero_elements, x_dims); } +template +void SparseCooToCsrKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCsrTensor* out) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "SparseCooToCsrGPUKernel", ([&] { + SparseCooToCsrGPUKernel(dev_ctx, x, out); + })); +} + template __global__ void KernelSparseCooToDense(const IndicesT* indices, - const IndicesT* sparse_offsets, + const int64_t* sparse_offsets, const ValueT* data, ValueT* dense_data, const IndicesT non_zero_num, @@ -466,10 +430,10 @@ __global__ void KernelSparseCooToDense(const IndicesT* indices, } } -template -void SparseCooToDenseKernel(const Context& dev_ctx, - const SparseCooTensor& x, - DenseTensor* out) { +template +void SparseCooToDenseGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + DenseTensor* out) { const auto non_zero_num = x.nnz(); const auto dense_dims = x.dims(); const auto indices = x.non_zero_indices(); @@ -498,38 +462,24 @@ void SparseCooToDenseKernel(const Context& dev_ctx, offset *= dense_dims[i]; } - auto sparse_offset_meta = phi::DenseTensorMeta( - DataType::INT64, {sparse_dim}, phi::DataLayout::NCHW); - DenseTensor d_sparse_offsets = Empty(dev_ctx, std::move(sparse_offset_meta)); + DenseTensor d_sparse_offsets = Empty(dev_ctx, {sparse_dim}); + + phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), + 
sparse_offsets.data(), + sparse_dim * sizeof(int64_t), + gpuMemcpyHostToDevice, + dev_ctx.stream()); + phi::backends::gpu::GpuMemsetAsync( + out_data, 0, sizeof(T) * out->numel(), dev_ctx.stream()); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemcpyAsync(d_sparse_offsets.mutable_data(place), - sparse_offsets.data(), - sparse_dim * sizeof(int64_t), - hipMemcpyHostToDevice, - dev_ctx.stream())); - - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemsetAsync(out_data, 0, sizeof(T) * out->numel(), dev_ctx.stream())); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpyAsync(d_sparse_offsets.mutable_data(place), - sparse_offsets.data(), - sparse_dim * sizeof(int64_t), - cudaMemcpyHostToDevice, - dev_ctx.stream())); - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemsetAsync(out_data, 0, sizeof(T) * out->numel(), dev_ctx.stream())); -#endif auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); - KernelSparseCooToDense + KernelSparseCooToDense <<>>(indices.data(), + dev_ctx.stream()>>>(indices.data(), d_sparse_offsets.data(), x_data, out_data, @@ -538,6 +488,16 @@ void SparseCooToDenseKernel(const Context& dev_ctx, sparse_dim); } +template +void SparseCooToDenseKernel(const Context& dev_ctx, + const SparseCooTensor& x, + DenseTensor* out) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "SparseCooToDenseGPUKernel", ([&] { + SparseCooToDenseGPUKernel(dev_ctx, x, out); + })); +} + } // namespace sparse } // namespace phi diff --git a/paddle/phi/ops/compat/depthwise_conv2d_sig.cc b/paddle/phi/ops/compat/depthwise_conv2d_sig.cc index 1014d45e70a3f..f7558499fd8fc 100644 --- a/paddle/phi/ops/compat/depthwise_conv2d_sig.cc +++ b/paddle/phi/ops/compat/depthwise_conv2d_sig.cc @@ -53,7 +53,7 @@ KernelSignature DepthwiseConv2dGradOpArgumentMapping( KernelSignature DepthwiseConv2dDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("depthwise_conv2d_grad_grad", - {"DDInput", "DDFilter", "DOutput", "Input", "Filter"}, + {"Input", "Filter", "DOutput", "DDInput", "DDFilter"}, {"strides", "paddings", "padding_algorithm", @@ -64,7 +64,7 @@ KernelSignature DepthwiseConv2dDoubleGradOpArgumentMapping( "workspace_size_MB", "exhaustive_search", "fuse_relu_before_depthwise_conv"}, - {"DDOutput", "DInput", "DFilter"}); + {"DInput", "DFilter", "DDOutput"}); } } // namespace phi diff --git a/paddle/phi/ops/compat/dropout_sig.cc b/paddle/phi/ops/compat/dropout_sig.cc index 712c5cbb0d634..403e752ca0e83 100644 --- a/paddle/phi/ops/compat/dropout_sig.cc +++ b/paddle/phi/ops/compat/dropout_sig.cc @@ -32,7 +32,31 @@ KernelSignature DropoutGradOpArgumentMapping( {"X@GRAD"}); } +KernelSignature DropoutNdOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("dropout_nd", + {"X", "Seed"}, + {"dropout_prob", + "is_test", + "dropout_implementation", + "seed", + "fix_seed", + "axis"}, + {"Out", "Mask"}); +} + +KernelSignature DropoutNdGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "dropout_nd_grad", + {"Mask", "Out@GRAD"}, + {"dropout_prob", "is_test", "dropout_implementation", "axis"}, + {"X@GRAD"}); +} + } // namespace phi PD_REGISTER_ARG_MAPPING_FN(dropout, phi::DropoutOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(dropout_grad, phi::DropoutGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(dropout_nd, phi::DropoutNdOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(dropout_nd_grad, + phi::DropoutNdGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/logcumsumexp_sig.cc 
b/paddle/phi/ops/compat/logcumsumexp_sig.cc new file mode 100644 index 0000000000000..2c790903b6333 --- /dev/null +++ b/paddle/phi/ops/compat/logcumsumexp_sig.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature LogcumsumexpOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("logcumsumexp", + {"X"}, + {"axis", "flatten", "exclusive", "reverse"}, + {"Out"}); +} + +KernelSignature LogcumsumexpGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("logcumsumexp_grad", + {"X", "Out", "Out@GRAD"}, + {"axis", "flatten", "exclusive", "reverse"}, + {"X@GRAD"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(logcumsumexp, phi::LogcumsumexpOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(logcumsumexp_grad, + phi::LogcumsumexpGradOpArgumentMapping); diff --git a/paddle/phi/tests/core/CMakeLists.txt b/paddle/phi/tests/core/CMakeLists.txt index 57a55963d5c66..c299559da5914 100644 --- a/paddle/phi/tests/core/CMakeLists.txt +++ b/paddle/phi/tests/core/CMakeLists.txt @@ -11,7 +11,7 @@ cc_test(test_type_info SRCS test_type_info.cc) cc_test( test_kernel_factory SRCS test_kernel_factory.cc - DEPS kernel_factory scale_kernel) + DEPS kernel_factory phi) cc_test( test_sparse_coo_tensor SRCS test_sparse_coo_tensor.cc @@ -58,7 +58,7 @@ if(WITH_TESTING AND TEST selected_rows_test) endif() if(NOT WIN32) cc_test(test_rw_lock SRCS test_rw_lock.cc) -endif(NOT WIN32) +endif() cc_test( test_string_tensor SRCS test_string_tensor.cc diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 0c1089b1fd440..e33af8b1bd52e 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -109,7 +109,7 @@ if(WIN32) COMMENT "Packing whl packages------>>>" DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto pass_desc_py_proto ${PY_FILES}) -else(WIN32) +else() add_custom_command( OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND touch stub.cc diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 75ec75cc43100..b2a94e62a1e0b 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -193,6 +193,7 @@ from .tensor.math import cosh # noqa: F401 from .tensor.math import cumsum # noqa: F401 from .tensor.math import cumprod # noqa: F401 +from .tensor.math import logcumsumexp # noqa: F401 from .tensor.math import logit # noqa: F401 from .tensor.math import exp # noqa: F401 from .tensor.math import expm1 # noqa: F401 @@ -407,6 +408,7 @@ 'eye', 'cumsum', 'cumprod', + 'logcumsumexp', 'logit', 'sign', 'is_empty', diff --git a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py index 8f42553048fec..a5b0856a66ff4 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py +++ 
b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py @@ -69,6 +69,9 @@ def _setup_nccl_op(self, startup_program, main_program, build_strategy): if trainer_id == 0 and not paddle.is_compiled_with_npu(): wait_server_ready(other_trainers) + if build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy._NoReduce: + return + if core.is_compiled_with_cuda(): comm_id_var = startup_program.global_block().create_var( name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW) diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index 0d7fc17da172c..af60776a3f1c5 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -276,12 +276,12 @@ def __next__(self): data = self._reader.read_next_list() for i in range(len(data)): data[i] = data[i]._move_to_list() - data = [ - _restore_batch(d, s) for d, s in zip( - data, self._structure_infos[:len(self._places)]) + structs = [ + self._structure_infos.pop(0) + for _ in range(len(self._places)) ] - self._structure_infos = self._structure_infos[ - len(self._places):] + data = [_restore_batch(d, s) \ + for d, s in zip(data, structs)] # static graph organized data on multi-device with list, if # place number is 1, there is only 1 device, extra the data # from list for devices to be compatible with dygraph mode @@ -750,12 +750,12 @@ def __next__(self): data = self._reader.read_next_list() for i in range(len(data)): data[i] = data[i]._move_to_list() - data = [ - _restore_batch(d, s) for d, s in zip( - data, self._structure_infos[:len(self._places)]) + structs = [ + self._structure_infos.pop(0) + for _ in range(len(self._places)) ] - self._structure_infos = self._structure_infos[ - len(self._places):] + data = [_restore_batch(d, s) \ + for d, s in zip(data, structs)] # static graph organized data on multi-device with list, if # place number is 1, there is only 1 device, extra the data # from list for devices to be compatible with dygraph mode diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py index 74f946acedb27..de53a56468485 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py @@ -18,6 +18,7 @@ # It provides a compatibility layer between the AST of various Python versions, # as produced by ast.parse from the standard ast module. # See details in https://github.com/serge-sans-paille/gast/ +import os from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.assert_transformer import AssertTransformer from paddle.fluid.dygraph.dygraph_to_static.basic_api_transformer import BasicApiTransformer @@ -44,6 +45,18 @@ DECORATOR_NAMES = ['declarative', 'to_static', 'dygraph_to_static_func'] +def apply_optimization(transformers): + """ + Judge whether to apply optimized transformations, such as BreakTransformOptimizer. + Not all optimized transformations are applied by default.
It's controlled by + 'export FLAGS_optim_transformation=1' + """ + flag = str( + os.environ.get('FLAGS_optim_transformation')) in ['1', 'True', 'true'] + if flag: + transformers.insert(3, BreakTransformOptimizer) + + class DygraphToStaticAst(gast.NodeTransformer): """ Main class to transform Dygraph to Static Graph @@ -77,7 +90,6 @@ def transfer_from_node_type(self, node_wrapper): BasicApiTransformer, # Basic Api TensorShapeTransformer, # Tensor.shape -> layers.shape(Tensor) ListTransformer, # List used in control flow - BreakTransformOptimizer, # optimize transfromation of break in loops BreakContinueTransformer, # break/continue in loops ReturnTransformer, # return in functions LogicalTransformer, # logical and/or/not @@ -90,6 +102,8 @@ def transfer_from_node_type(self, node_wrapper): GradTransformer, # transform paddle.grad to paddle.gradients ] + apply_optimization(transformers) + for index, transformer in enumerate(transformers): self._apply(transformer, node_wrapper, log_level=index + 1) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py index 78cf8f3b85c9e..45a567b57f25c 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py @@ -188,7 +188,7 @@ def _run_py_logical_not(x): return not x -def convert_ifelse(pred, true_fn, false_fn, true_args, false_args, return_vars): +def convert_ifelse(pred, true_fn, false_fn, true_args, false_args): """ A function representation of a Python ``if/else`` statement. @@ -198,15 +198,13 @@ def convert_ifelse(pred, true_fn, false_fn, true_args, false_args, return_vars): false_fn(callable): A callable to be performed if ``pred`` is false. true_args(tuple): Parameters of ``true_fn``. false_args(tuple): Parameters of ``false_fn``. - return_vars(tuple): Return variables of ``true_fn`` and ``false_fn``. Returns: ``true_fn(true_args)`` if the predicate ``pred`` is true else ``false_fn(false_args)`` . 
""" if isinstance(pred, Variable): - out = _run_paddle_cond(pred, true_fn, false_fn, true_args, false_args, - return_vars) + out = _run_paddle_cond(pred, true_fn, false_fn, true_args, false_args) else: out = _run_py_ifelse(pred, true_fn, false_fn, true_args, false_args) @@ -246,8 +244,7 @@ def _remove_no_value_return_var(out): return out -def _run_paddle_cond(pred, true_fn, false_fn, true_args, false_args, - return_vars): +def _run_paddle_cond(pred, true_fn, false_fn, true_args, false_args): pred = cast_bool_if_necessary(pred) return control_flow.cond(pred, lambda: true_fn(*true_args), lambda: false_fn(*false_args)) @@ -316,8 +313,7 @@ def has_negative(list_shape, idx=None): # # Assume x.shape=[3, -1] in static mode # y = paddle.reshape(x, shape=[1, x.shape[1]]) # ``` - if isinstance(x, Variable) and (in_control_flow - or has_negative(x.shape, idx)): + if isinstance(x, Variable) and has_negative(x.shape, idx): return nn.shape(x) if idx is None else nn.shape(x)[idx] else: return list(x.shape) if idx is None else x.shape[idx] diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py index 4c003dd599906..9a29a535ab236 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py @@ -507,7 +507,7 @@ def create_convert_ifelse_node(return_name_ids, is_if_expr=False): """ Create `paddle.jit.dy2static.convert_ifelse( - pred, true_fn, false_fn, true_args, false_args, return_vars)` + pred, true_fn, false_fn, true_args, false_args)` to replace original `python if/else` statement. """ @@ -535,17 +535,14 @@ def create_name_nodes(name_ids): true_func_source = true_func.name false_func_source = false_func.name - return_vars = create_name_nodes(return_name_ids) - convert_ifelse_layer = gast.parse( '_jst.convert_ifelse(' - '{pred}, {true_fn}, {false_fn}, {true_args}, {false_args}, {return_vars})' - .format(pred=ast_to_source_code(pred), - true_fn=true_func_source, - false_fn=false_func_source, - true_args=ast_to_source_code(true_args), - false_args=ast_to_source_code(false_args), - return_vars=ast_to_source_code(return_vars))).body[0].value + '{pred}, {true_fn}, {false_fn}, {true_args}, {false_args})'.format( + pred=ast_to_source_code(pred), + true_fn=true_func_source, + false_fn=false_func_source, + true_args=ast_to_source_code(true_args), + false_args=ast_to_source_code(false_args))).body[0].value if return_name_ids: _, cond_node = create_assign_node(return_name_ids, convert_ifelse_layer) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index c5a3915802401..49a218412c92d 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -620,6 +620,51 @@ def rollback_impl(class_instance): return getattr(self._class_instance, func_name) + def __deepcopy__(self, memo): + """ + Customized behavior for copy.deepcopy, return original decorated function instead + of a new StaticFunction Object. StaticFunction itself is not copyable becuase it's + associated with class_instance. + + We add __deepcopy__ here only for the following usage: + + Example:: + .. 
code-block:: python + + import copy + import paddle + + class Net(paddle.nn.Layer): + def __init__(self): + super(Net, self).__init__() + + def forward(self, x, flag=True): + if flag: + out = x + 1 + else: + out = x - 1 + return out + + x = paddle.randn([10, 1], 'float32') + net = paddle.jit.to_static(Net()) # convert into static mode + + copy_net = copy.deepcopy(net) # deepcopy a new net without @to_static + + Please note that the original 'net' will unwrap @to_static and roll back into a plain Layer. + """ + if self._class_instance is not None: + net_name = type(self._class_instance).__name__ + logging_utils.log( + level=-1, + msg="It is not recommended to deepcopy '{}' decorated with @to_static, since it has the side effect of" \ + " rolling back into the original state before @to_static. Please deepcopy '{}' before applying @to_static." + .format(net_name, net_name)) + self.rollback() + return self._dygraph_function.__get__(memo[id( + self._class_instance)]) + else: + return self._dygraph_function + @property def inputs(self): """ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index 4a477fb7d7cb6..0afe42e3e296b 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -349,14 +349,14 @@ def create_api_shape_node(tensor_shape_node): if isinstance(tensor_shape_node, gast.Name): api_shape_node = gast.Call( - func=gast.parse('fluid.layers.shape').body[0].value, + func=gast.parse('paddle.shape').body[0].value, args=[tensor_shape_node], keywords=[]) return api_shape_node if isinstance(tensor_shape_node, gast.Attribute): api_shape_node = gast.Call( - func=gast.parse('fluid.layers.shape').body[0].value, + func=gast.parse('paddle.shape').body[0].value, args=[tensor_shape_node.value], keywords=[]) return api_shape_node @@ -368,8 +368,8 @@ def create_api_shape_node(tensor_shape_node): def get_constant_variable_node(name, value, shape=[1], dtype='int64'): - return gast.parse('%s = fluid.layers.fill_constant(%s, "%s", %s)' % - (name, str(shape), dtype, str(value))) + return gast.parse('%s = paddle.full(%s, "%s", %s)' % + (name, str(shape), str(value), dtype)) def get_attribute_full_name(node): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py index 66885536ae46f..263c3cbae9579 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py @@ -15,12 +15,11 @@ from __future__ import print_function import six +import paddle from paddle.utils import gast - from paddle.fluid import core from paddle.fluid import unique_name from paddle.fluid.framework import Variable -from paddle.fluid.layers import fill_constant from paddle.fluid.layer_helper import LayerHelper __all__ = [ @@ -87,17 +86,19 @@ def create_static_variable_gast_node(name): def create_fill_constant_node(name, value): - func_code = "{} = paddle.fluid.layers.fill_constant(shape=[1], ".format( - name) + func_code = "{} = paddle.full(shape=[1], ".format(name) if isinstance(value, bool): - func_code += "dtype='bool', value={}, name='{}')".format(value, name) + func_code += "dtype='bool', fill_value={}, name='{}')".format( + value, name) return gast.parse(func_code).body[0] if isinstance(value, float): - func_code += "dtype='float64', value={}, name='{}')".format(value, name) + func_code += "dtype='float64', fill_value={},
name='{}')".format( + value, name) return gast.parse(func_code).body[0] if isinstance(value, int): - func_code += "dtype='int64', value={}, name='{}')".format(value, name) + func_code += "dtype='int64', fill_value={}, name='{}')".format( + value, name) return gast.parse(func_code).body[0] @@ -106,12 +107,12 @@ def to_static_variable(x): Translate a Python Tensor to PaddlePaddle static graph Tensor ''' if isinstance(x, bool): - return fill_constant(shape=[1], dtype='bool', value=x) + return paddle.full(shape=[1], dtype='bool', fill_value=x) if isinstance(x, float): - return fill_constant(shape=[1], dtype='float64', value=x) + return paddle.full(shape=[1], dtype='float64', fill_value=x) if isinstance(x, six.integer_types): - return fill_constant(shape=[1], dtype='int64', value=x) + return paddle.full(shape=[1], dtype='int64', fill_value=x) return x @@ -121,6 +122,6 @@ def create_bool_as_type(x, value=True): Create a bool variable, which type is the same as x. ''' if isinstance(x, Variable): - return fill_constant(shape=[1], value=value, dtype="bool") + return paddle.full(shape=[1], fill_value=value, dtype="bool") else: return value diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 2422c68622a00..9eb044188f0d1 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -804,6 +804,7 @@ def _grad_ivar(self): def _set_grad_ivar(self, value): if isinstance(self, EagerParamBase): self.grad = value + self._unset_fake_empty() else: raise TypeError( "_set_grad_ivar is only supported for Parameter Tensor") diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index fdd5c0b47b4dc..44ef1ff5ae6db 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -178,8 +178,7 @@ def _fallback_legacy_dygraph(): need_fallback = False # Only enable eager on CPU/GPU is_not_support = core.is_compiled_with_xpu() or core.is_compiled_with_npu( - ) or core.is_compiled_with_ipu() or core.is_compiled_with_mlu( - ) or core.is_compiled_with_rocm() + ) or core.is_compiled_with_ipu() or core.is_compiled_with_mlu() if _in_eager_mode_ and is_not_support: # switch into legacy dygraph mode @@ -265,16 +264,16 @@ def ipu_shard_guard(index=None, stage=None): Used to shard the graph on IPUs. Set each Op run on which IPU in the sharding and which stage in the pipelining. Args: - index(int, optional): Specify which ipu the Tensor is computed on, (such as ‘0, 1, 2, 3’). + index(int, optional): Specify which ipu the Tensor is computed on, (such as '0, 1, 2, 3'). The default value is None, which means the Op only run on IPU 0. - stage(int, optional): Specify the computation order of the sharded model(such as ‘0, 1, 2, 3’). + stage(int, optional): Specify the computation order of the sharded model(such as '0, 1, 2, 3'). The sharded model will be computed from small to large. The default value is None, which means no pipelining computation order and run Ops in terms of graph. **Note**: - Only if the enable_manual_shard=True, the ‘index’ is able to be set not None. Please refer + Only if the enable_manual_shard=True, the 'index' is able to be set not None. Please refer to :code:`paddle.static.IpuStrategy` . - Only if the enable_pipelining=True, the ‘stage’ is able to be set not None. Please refer + Only if the enable_pipelining=True, the 'stage' is able to be set not None. Please refer to :code:`paddle.static.IpuStrategy` . 
A index is allowed to match none stage or a stage. A stage is only allowed to match a new or duplicated index. diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6710ddb97dc24..34971cf11941f 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -684,6 +684,7 @@ endif() foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) +set_tests_properties(test_logcumsumexp_op PROPERTIES TIMEOUT 30) py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4) if(WITH_GPU @@ -977,11 +978,11 @@ if(WITH_DISTRIBUTE) endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf) -# Coverage pipeline use cuda 10.1 now, profiler will random hang in cuda 10.1, +# profiler will random hang in linux cuda 10.1 or 10.2 # see https://github.com/PaddlePaddle/Paddle/issues/29082 for details. -# We guess there are some bugs in cuda 10.1 or 10.2, -# since this unittest is stable in cuda 11 (py3 pipeline) now. -if(NOT WITH_COVERAGE) +# We guess there are some bugs in linux cuda 10.1 or 10.2, +# since this unittest is stable in cuda 11.2 and 10.2 (windows-ci pipeline) now. +if(NOT (LINUX AND CUDA_VERSION LESS 11.0)) py_test_modules(test_parallel_executor_profiler MODULES test_parallel_executor_profiler) set_tests_properties(test_parallel_executor_profiler diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py index 0d96c57c2437f..f7a1a28aa91ca 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py @@ -14,7 +14,7 @@ import unittest import time -import paddle.fluid as fluid +import tempfile import copy import os import numpy as np @@ -145,7 +145,10 @@ def train(): engine.predict(test_dataset, batch_size, fetch_list=['label']) # save - engine.save('./mlp_inf', training=False, mode='predict') + temp_dir = tempfile.TemporaryDirectory() + model_filename = os.path.join(temp_dir.name, 'mlp_inf') + engine.save(model_filename, training=False, mode='predict') + temp_dir.cleanup() if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py index 4ff72173382da..09ec5131402d0 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_relaunch.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
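This patch wires up a new logcumsumexp operator end to end: the compat argument mapping earlier in the diff, the paddle.logcumsumexp export in python/paddle/__init__.py, and the test timeout registered in the CMakeLists change above. As a rough, framework-free reference for what the op computes, the sketch below is a NumPy-only, numerically stable log-cumsum-exp along one axis; it is illustrative only, and the real kernel additionally honors the flatten/exclusive/reverse attributes listed in the signature.

import numpy as np


def logcumsumexp_ref(x, axis=-1):
    # log(cumsum(exp(x))) evaluated stably by accumulating with logaddexp,
    # which never materializes exp(x) for large inputs.
    return np.logaddexp.accumulate(x, axis=axis)


x = np.array([[1.0, 2.0, 3.0], [1000.0, 1000.0, 1000.0]])
print(logcumsumexp_ref(x, axis=1))
# The first row matches np.log(np.cumsum(np.exp(x[0]))); the second row would
# overflow to inf if exp were taken directly.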
+import tempfile import unittest import os import sys @@ -77,16 +78,45 @@ } """ +mapping_josn = """ +[ + { + "hostname": "machine1", + "addr": "127.0.0.1", + "port": "768", + "ranks": + { + "0": [1], + "1": [0] + } + } +] +""" + class TestAutoParallelReLaunch(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def test_relaunch(self): - file_dir = os.path.dirname(os.path.abspath(__file__)) - cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_path = os.path.join(self.temp_dir.name, + "auto_parallel_cluster.json") + mapping_json_path = os.path.join(self.temp_dir.name, + "auto_parallel_rank_mapping.json") + cluster_json_object = json.loads(cluster_json) with open(cluster_json_path, "w") as cluster_json_file: json.dump(cluster_json_object, cluster_json_file) + mapping_josn_object = json.loads(mapping_josn) + with open(mapping_json_path, "w") as mapping_josn_file: + json.dump(mapping_josn_object, mapping_josn_file) + + file_dir = os.path.dirname(os.path.abspath(__file__)) launch_model_path = os.path.join(file_dir, "auto_parallel_relaunch_model.py") @@ -96,24 +126,15 @@ def test_relaunch(self): coverage_args = [] cmd = [sys.executable, "-u"] + coverage_args + [ - "-m", "launch", "--cluster_topo_path", cluster_json_path, - "--enable_auto_mapping", "True", launch_model_path + "-m", "launch", "--log_dir", self.temp_dir.name, + "--cluster_topo_path", cluster_json_path, "--rank_mapping_path", + mapping_json_path, "--enable_auto_mapping", "True", + launch_model_path ] process = subprocess.Popen(cmd) process.wait() self.assertEqual(process.returncode, 0) - # Remove unnecessary files - if os.path.exists(cluster_json_path): - os.remove(cluster_json_path) - rank_mapping_json_path = os.path.join( - file_dir, "auto_parallel_rank_mapping.json") - if os.path.exists(rank_mapping_json_path): - os.remove(rank_mapping_json_path) - log_path = os.path.join(file_dir, "log") - if os.path.exists(log_path): - shutil.rmtree(log_path) - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py index 5b6f898d5b7d1..dd9b0110dbebd 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
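test_auto_parallel_relaunch.py above is representative of how the auto_parallel tests in this patch are reworked: every generated artifact (cluster topology json, rank-mapping json, launcher logs) goes into a tempfile.TemporaryDirectory created in setUp and removed in tearDown, replacing the hand-written os.remove/shutil.rmtree blocks at the end of each test. A minimal, Paddle-free sketch of the pattern (file names are placeholders):

import json
import os
import tempfile
import unittest


class TempDirTestCase(unittest.TestCase):

    def setUp(self):
        # One private directory per test case; nothing leaks into the repo.
        self.temp_dir = tempfile.TemporaryDirectory()

    def tearDown(self):
        # Removes the directory and everything written into it.
        self.temp_dir.cleanup()

    def test_write_cluster_config(self):
        cluster_json_path = os.path.join(self.temp_dir.name, "cluster.json")
        with open(cluster_json_path, "w") as f:
            json.dump({"machines": []}, f)
        self.assertTrue(os.path.exists(cluster_json_path))


if __name__ == "__main__":
    unittest.main()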
+import tempfile import unittest import os import json @@ -1968,10 +1969,17 @@ class TestCluster(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def test_single_machine(self): # Build cluster - file_dir = os.path.dirname(os.path.abspath(__file__)) - cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_path = os.path.join(self.temp_dir.name, + "auto_parallel_cluster_single.json") + cluster_json_object = json.loads(cluster_json) with open(cluster_json_path, "w") as cluster_json_file: json.dump(cluster_json_object, cluster_json_file) @@ -1989,14 +1997,10 @@ def test_single_machine(self): self.assertTrue(devices == [0, 1, 2, 3]) self.assertTrue(involved_machine_count == 1) - # Remove unnecessary files - if os.path.exists(cluster_json_path): - os.remove(cluster_json_path) - def test_multi_machine(self): # Build cluster - file_dir = os.path.dirname(os.path.abspath(__file__)) - cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_path = os.path.join(self.temp_dir.name, + "auto_parallel_cluster_multi.json") cluster_json_object = json.loads(multi_cluster_json) with open(cluster_json_path, "w") as cluster_json_file: json.dump(cluster_json_object, cluster_json_file) @@ -2014,10 +2018,6 @@ def test_multi_machine(self): self.assertTrue(devices == [5, 6, 7, 10]) self.assertTrue(involved_machine_count == 2) - # Remove unnecessary files - if os.path.exists(cluster_json_path): - os.remove(cluster_json_path) - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py index 0d3f193e8bce8..215385787880c 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py @@ -15,6 +15,7 @@ import unittest import os import json +import tempfile import paddle from paddle.distributed.auto_parallel.cluster import Cluster @@ -32,10 +33,16 @@ class TestCommOpCost(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def test_comm_cost(self): # Build cluster - file_dir = os.path.dirname(os.path.abspath(__file__)) - cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_path = os.path.join(self.temp_dir.name, + "auto_parallel_cluster0.json") cluster_json_object = json.loads(cluster_json) with open(cluster_json_path, "w") as cluster_json_file: json.dump(cluster_json_object, cluster_json_file) @@ -92,14 +99,10 @@ def test_comm_cost(self): comm_context=comm_context) self.assertTrue(identity_op_cost.time >= 0) - # Remove unnecessary files - if os.path.exists(cluster_json_path): - os.remove(cluster_json_path) - def test_cross_machine_comm_cost(self): # Build cluster - file_dir = os.path.dirname(os.path.abspath(__file__)) - cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_path = os.path.join(self.temp_dir.name, + "auto_parallel_cluster1.json") cluster_json_object = json.loads(multi_cluster_json) with open(cluster_json_path, "w") as cluster_json_file: json.dump(cluster_json_object, cluster_json_file) @@ -151,10 +154,6 @@ def test_cross_machine_comm_cost(self): comm_context=comm_context) self.assertTrue(recv_op_cost.time > 0) - # Remove unnecessary files - if os.path.exists(cluster_json_path): - 
os.remove(cluster_json_path) - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py index b8ad54cbb79e1..8d5051a3d48d4 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import tempfile import unittest import os import sys @@ -31,24 +32,17 @@ def test_engine_api(self): else: coverage_args = [] + tmp_dir = tempfile.TemporaryDirectory() cmd = [sys.executable, "-u"] + coverage_args + [ - "-m", "launch", "--gpus", "0,1", launch_model_path + "-m", "launch", "--gpus", "0,1", "--log_dir", tmp_dir.name, + launch_model_path ] process = subprocess.Popen(cmd) process.wait() self.assertEqual(process.returncode, 0) - # Remove unnecessary files - log_path = os.path.join(file_dir, "log") - if os.path.exists(log_path): - shutil.rmtree(log_path) - files_path = [path for path in os.listdir('.') if '.pd' in path] - for path in files_path: - if os.path.exists(path): - os.remove(path) - if os.path.exists('rank_mapping.csv'): - os.remove('rank_mapping.csv') + tmp_dir.cleanup() if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py index 911f20f114912..fe46131225759 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py @@ -15,6 +15,7 @@ import unittest import os import json +import tempfile import paddle import paddle.distributed.auto_parallel.cost as cost_model @@ -36,6 +37,12 @@ def check_cost(cost): class TestCost(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def test_base_cost(self): cost = cost_model.Cost(memory=100, flops=200, time=0.5) self.assertTrue(check_cost(cost)) @@ -65,8 +72,8 @@ def test_comp_cost(self): def test_comm_cost(self): # Build cluster - file_dir = os.path.dirname(os.path.abspath(__file__)) - cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_path = os.path.join(self.temp_dir.name, + "auto_parallel_cluster.json") cluster_json_object = json.loads(cluster_json) with open(cluster_json_path, "w") as cluster_json_file: json.dump(cluster_json_object, cluster_json_file) @@ -85,10 +92,6 @@ def test_comm_cost(self): op_desc=desc, comm_context=CommContext(cluster)) self.assertTrue(check_cost(allreduce_cost.cost)) - # Remove unnecessary files - if os.path.exists(cluster_json_path): - os.remove(cluster_json_path) - def test_cost_estimator(self): train_program = paddle.static.Program() cost_estimator = cost_model.CostEstimator(train_program) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_gpt_planner.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_gpt_planner.py index 88ad5f98bf7d2..bc1ebd6688edb 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_gpt_planner.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_gpt_planner.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import tempfile import unittest import os import sys @@ -23,14 +24,29 @@ class TestPlannerReLaunch(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def test_relaunch_with_planner(self): - from test_auto_parallel_relaunch import cluster_json - file_dir = os.path.dirname(os.path.abspath(__file__)) - cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + from test_auto_parallel_relaunch import cluster_json, mapping_josn + + cluster_json_path = os.path.join(self.temp_dir.name, + "auto_parallel_cluster.json") + mapping_json_path = os.path.join(self.temp_dir.name, + "auto_parallel_rank_mapping.json") + cluster_json_object = json.loads(cluster_json) with open(cluster_json_path, "w") as cluster_json_file: json.dump(cluster_json_object, cluster_json_file) + mapping_json_object = json.loads(mapping_josn) + with open(mapping_json_path, "w") as mapping_json_file: + json.dump(mapping_json_object, mapping_json_file) + + file_dir = os.path.dirname(os.path.abspath(__file__)) launch_model_path = os.path.join( file_dir, "auto_parallel_relaunch_with_gpt_planner.py") @@ -40,28 +56,15 @@ def test_relaunch_with_planner(self): coverage_args = [] cmd = [sys.executable, "-u"] + coverage_args + [ - "-m", "launch", "--cluster_topo_path", cluster_json_path, - "--enable_auto_mapping", "True", launch_model_path + "-m", "launch", "--log_dir", self.temp_dir.name, + "--cluster_topo_path", cluster_json_path, "--rank_mapping_path", + mapping_json_path, "--enable_auto_mapping", "True", + launch_model_path ] process = subprocess.Popen(cmd) process.wait() self.assertEqual(process.returncode, 0) - # Remove unnecessary files - if os.path.exists(cluster_json_path): - os.remove(cluster_json_path) - rank_mapping_json_path = os.path.join( - file_dir, "auto_parallel_rank_mapping.json") - if os.path.exists(rank_mapping_json_path): - os.remove(rank_mapping_json_path) - files_path = [path for path in os.listdir('.') if '.pkl' in path] - for path in files_path: - if os.path.exists(path): - os.remove(path) - log_path = os.path.join(file_dir, "log") - if os.path.exists(log_path): - shutil.rmtree(log_path) - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_planner.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_planner.py index b6fc0d7a1fa41..efcc313a2a4ca 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_planner.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_relaunch_with_planner.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
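The relaunch tests in this patch now dump the rank-mapping json next to the cluster topology and point the launcher's --log_dir at the temporary directory, so no artifacts need to be deleted afterwards. A condensed sketch of how such a command line is assembled (paths are placeholders; the flags mirror the ones used in these tests):

import subprocess
import sys


def run_launch(log_dir, cluster_json_path, mapping_json_path, model_script):
    # Mirrors the command built in the relaunch tests; returns the exit code.
    cmd = [
        sys.executable, "-u", "-m", "launch",
        "--log_dir", log_dir,
        "--cluster_topo_path", cluster_json_path,
        "--rank_mapping_path", mapping_json_path,
        "--enable_auto_mapping", "True",
        model_script,
    ]
    process = subprocess.Popen(cmd)
    process.wait()
    return process.returncode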
+import tempfile import unittest import os import sys @@ -23,14 +24,29 @@ class TestPlannerReLaunch(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def test_relaunch_with_planner(self): - from test_auto_parallel_relaunch import cluster_json - file_dir = os.path.dirname(os.path.abspath(__file__)) - cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + from test_auto_parallel_relaunch import cluster_json, mapping_josn + + cluster_json_path = os.path.join(self.temp_dir.name, + "auto_parallel_cluster.json") + mapping_json_path = os.path.join(self.temp_dir.name, + "auto_parallel_rank_mapping.json") + cluster_json_object = json.loads(cluster_json) with open(cluster_json_path, "w") as cluster_json_file: json.dump(cluster_json_object, cluster_json_file) + mapping_json_object = json.loads(mapping_josn) + with open(mapping_json_path, "w") as mapping_json_file: + json.dump(mapping_json_object, mapping_json_file) + + file_dir = os.path.dirname(os.path.abspath(__file__)) launch_model_path = os.path.join( file_dir, "auto_parallel_relaunch_with_planner.py") @@ -40,24 +56,15 @@ def test_relaunch_with_planner(self): coverage_args = [] cmd = [sys.executable, "-u"] + coverage_args + [ - "-m", "launch", "--cluster_topo_path", cluster_json_path, - "--enable_auto_mapping", "True", launch_model_path + "-m", "launch", "--log_dir", self.temp_dir.name, + "--cluster_topo_path", cluster_json_path, "--rank_mapping_path", + mapping_json_path, "--enable_auto_mapping", "True", + launch_model_path ] process = subprocess.Popen(cmd) process.wait() self.assertEqual(process.returncode, 0) - # Remove unnecessary files - if os.path.exists(cluster_json_path): - os.remove(cluster_json_path) - rank_mapping_json_path = os.path.join( - file_dir, "auto_parallel_rank_mapping.json") - if os.path.exists(rank_mapping_json_path): - os.remove(rank_mapping_json_path) - log_path = os.path.join(file_dir, "log") - if os.path.exists(log_path): - shutil.rmtree(log_path) - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_deepcopy.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_deepcopy.py new file mode 100644 index 0000000000000..dcc12e120d689 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_deepcopy.py @@ -0,0 +1,55 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
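The new test_deepcopy.py below exercises the StaticFunction.__deepcopy__ hook added earlier in this patch, which hands back the underlying dygraph function instead of copying the StaticFunction wrapper. The Python mechanism it builds on is the __deepcopy__(memo) protocol; a tiny, Paddle-independent illustration of an object that opts out of being duplicated:

import copy


class Handle:
    """Wrapper that refuses to be deep-copied and unwraps itself instead."""

    def __init__(self, target):
        self.target = target

    def __deepcopy__(self, memo):
        # copy.deepcopy(handle) yields a copy of the target, not a new Handle.
        return copy.deepcopy(self.target, memo)


original = Handle(target=[1, 2, 3])
clone = copy.deepcopy(original)
assert isinstance(clone, list)            # the Handle wrapper disappeared
assert clone == [1, 2, 3] and clone is not original.target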
+ +import unittest +import paddle +import numpy as np +from paddle.fluid.dygraph.dygraph_to_static.program_translator import StaticFunction + +from test_rollback import Net, foo +from copy import deepcopy + + +class TestDeepCopy(unittest.TestCase): + + def test_net(self): + net = Net() + net = paddle.jit.to_static(net) + + x = paddle.randn([3, 4]) + src_out = net(x) + self.assertTrue(isinstance(net.forward, StaticFunction)) + + copy_net = deepcopy(net) + copy_out = copy_net(x) + + self.assertFalse(isinstance(net.forward, StaticFunction)) + self.assertTrue(id(copy_net), id(copy_net.forward.__self__)) + self.assertTrue(np.array_equal(src_out.numpy(), copy_out.numpy())) + + def test_func(self): + st_foo = paddle.jit.to_static(foo) + x = paddle.randn([3, 4]) + st_out = st_foo(x) + + self.assertTrue(isinstance(st_foo, StaticFunction)) + + new_foo = deepcopy(st_foo) + self.assertFalse(isinstance(new_foo, StaticFunction)) + new_out = new_foo(x) + self.assertTrue(np.array_equal(st_out.numpy(), new_out.numpy())) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py index 56e9cabbef485..78d97a3884aed 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py @@ -122,10 +122,12 @@ def for_loop_dyfunc_not_support(max_len): def for_break_single_return(max_len): + x = 0 for i in range(3): if i == 2: break - return i + x += 1 + return x def while_loop_bool_op(x): @@ -324,6 +326,7 @@ def _run(self, to_static): def test_ast_to_func(self): static_numpy = self._run_static() dygraph_numpy = self._run_dygraph() + print(static_numpy, dygraph_numpy) self.assertTrue(np.allclose(dygraph_numpy, static_numpy)) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py index b656a4dc5950e..cbc6e3c540f9f 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py @@ -66,8 +66,10 @@ def get_source_code(func): class StaticCode1(): def dyfunc_with_if_else(x_v, label=None): - __return_value_init_0 = paddle.fluid.layers.fill_constant( - shape=[1], dtype='float64', value=0.0, name='__return_value_init_0') + __return_value_init_0 = paddle.full(shape=[1], + dtype='float64', + fill_value=0.0, + name='__return_value_init_0') __return_value_0 = __return_value_init_0 def true_fn_0(x_v): @@ -80,7 +82,7 @@ def false_fn_0(x_v): x_v = _jst.convert_ifelse( fluid.layers.mean(x_v)[0] > 5, true_fn_0, false_fn_0, (x_v, ), - (x_v, ), (x_v, )) + (x_v, )) __return_0 = _jst.create_bool_as_type(label is not None, False) def true_fn_1(__return_0, __return_value_0, label, x_v): @@ -95,7 +97,7 @@ def false_fn_1(__return_0, __return_value_0): __return_0, __return_value_0 = _jst.convert_ifelse( label is not None, true_fn_1, false_fn_1, (__return_0, __return_value_0, label, x_v), - (__return_0, __return_value_0), (__return_0, __return_value_0)) + (__return_0, __return_value_0)) def true_fn_2(__return_0, __return_value_0, x_v): __return_1 = _jst.create_bool_as_type( @@ -108,16 +110,17 @@ def false_fn_2(__return_value_0): __return_value_0 = _jst.convert_ifelse( _jst.convert_logical_not(__return_0), true_fn_2, false_fn_2, - (__return_0, __return_value_0, x_v), (__return_value_0, ), - 
(__return_value_0, )) + (__return_0, __return_value_0, x_v), (__return_value_0, )) return __return_value_0 class StaticCode2(): # TODO: Transform return statement def dyfunc_with_if_else(x_v, label=None): - __return_value_init_1 = paddle.fluid.layers.fill_constant( - shape=[1], dtype='float64', value=0.0, name='__return_value_init_1') + __return_value_init_1 = paddle.full(shape=[1], + dtype='float64', + fill_value=0.0, + name='__return_value_init_1') __return_value_1 = __return_value_init_1 def true_fn_3(x_v): @@ -130,7 +133,7 @@ def false_fn_3(x_v): x_v = _jst.convert_ifelse( fluid.layers.mean(x_v)[0] > 5, true_fn_3, false_fn_3, (x_v, ), - (x_v, ), (x_v, )) + (x_v, )) __return_2 = _jst.create_bool_as_type(label is not None, False) def true_fn_4(__return_2, __return_value_1, label, x_v): @@ -145,7 +148,7 @@ def false_fn_4(__return_2, __return_value_1): __return_2, __return_value_1 = _jst.convert_ifelse( label is not None, true_fn_4, false_fn_4, (__return_2, __return_value_1, label, x_v), - (__return_2, __return_value_1), (__return_2, __return_value_1)) + (__return_2, __return_value_1)) def true_fn_5(__return_2, __return_value_1, x_v): __return_3 = _jst.create_bool_as_type( @@ -158,8 +161,7 @@ def false_fn_5(__return_value_1): __return_value_1 = _jst.convert_ifelse( _jst.convert_logical_not(__return_2), true_fn_5, false_fn_5, - (__return_2, __return_value_1, x_v), (__return_value_1, ), - (__return_value_1, )) + (__return_2, __return_value_1, x_v), (__return_value_1, )) return __return_value_1 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py index 3e30eb84ed671..70ff91eff5f16 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py @@ -409,9 +409,9 @@ def init_test_func(self): self.dygraph_func = dyfunc_with_if_2 def _set_expected_op_num(self): - self.expected_op_num = 14 - self.expected_shape_op_num = 2 - self.expected_slice_op_num = 1 + self.expected_op_num = 2 + self.expected_shape_op_num = 0 + self.expected_slice_op_num = 0 # 3. Tests with control flow for loop @@ -421,9 +421,9 @@ def init_test_func(self): self.dygraph_func = dyfunc_with_for_1 def _set_expected_op_num(self): - self.expected_op_num = 22 - self.expected_shape_op_num = 3 - self.expected_slice_op_num = 3 + self.expected_op_num = 7 + self.expected_shape_op_num = 0 + self.expected_slice_op_num = 0 class TestTensorShapeInFor2(TestTensorShapeInFor1): @@ -443,9 +443,9 @@ def init_test_func(self): self.dygraph_func = dyfunc_with_for_3 def _set_expected_op_num(self): - self.expected_op_num = 25 - self.expected_shape_op_num = 6 - self.expected_slice_op_num = 3 + self.expected_op_num = 3 + self.expected_shape_op_num = 0 + self.expected_slice_op_num = 0 # 4. 
Tests with control flow while loop @@ -454,6 +454,11 @@ class TestTensorShapeInWhile1(TestTensorShapeInFor1): def init_test_func(self): self.dygraph_func = dyfunc_with_while_1 + def _set_expected_op_num(self): + self.expected_op_num = 4 + self.expected_shape_op_num = 0 + self.expected_slice_op_num = 0 + class TestTensorShapeInWhile2(TestTensorShapeInFor1): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py index 377353c0ab65b..87379669714ba 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py @@ -52,19 +52,19 @@ class TestVariableTransFunc(unittest.TestCase): def test_create_fill_constant_node(self): node = create_fill_constant_node("a", 1.0) - source = "a = paddle.fluid.layers.fill_constant(shape=[1], dtype='float64', value=1.0, name='a')" + source = "a = paddle.full(shape=[1], dtype='float64', fill_value=1.0, name='a')" self.assertEqual( ast_to_source_code(node).replace('\n', '').replace(' ', ''), source.replace(' ', '')) node = create_fill_constant_node("b", True) - source = "b = paddle.fluid.layers.fill_constant(shape=[1], dtype='bool', value=True, name='b')" + source = "b = paddle.full(shape=[1], dtype='bool', fill_value=True, name='b')" self.assertEqual( ast_to_source_code(node).replace('\n', '').replace(' ', ''), source.replace(' ', '')) node = create_fill_constant_node("c", 4293) - source = "c = paddle.fluid.layers.fill_constant(shape=[1], dtype='int64', value=4293, name='c')" + source = "c = paddle.full(shape=[1], dtype='int64', fill_value=4293, name='c')" self.assertEqual( ast_to_source_code(node).replace('\n', '').replace(' ', ''), source.replace(' ', '')) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py deleted file mode 100644 index 19abf74a55683..0000000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
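The updated expectations in test_variable_trans_func.py above reflect that dy2static code generation now emits paddle.full instead of paddle.fluid.layers.fill_constant, with the value argument renamed to fill_value. Assuming a working Paddle install, the generated calls behave as follows (a quick illustrative check, not part of the test suite):

import paddle

# Same keyword spelling that create_fill_constant_node now generates.
a = paddle.full(shape=[1], dtype='float64', fill_value=1.0, name='a')
b = paddle.full(shape=[1], dtype='bool', fill_value=True, name='b')
c = paddle.full(shape=[1], dtype='int64', fill_value=4293, name='c')

print(a.numpy(), b.numpy(), c.numpy())  # [1.] [ True] [4293]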
- -import unittest - -import numpy as np -import paddle -import paddle.nn.functional as F -import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestRelu(IPUOpTest): - - def setUp(self): - self.set_atol() - self.set_test_op() - self.set_training() - self.set_data_feed() - self.set_feed_attr() - - def set_test_op(self): - self.op = paddle.fluid.layers.relu - self.op_attrs = {} - - def set_data_feed(self): - data = np.random.uniform(size=[1, 3, 10, 10]) - self.feed_fp32 = {'in_0': data.astype(np.float32)} - self.feed_fp16 = {'in_0': data.astype(np.float16)} - - def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed_fp32.values()] - self.feed_list = list(self.feed_fp32.keys()) - - @IPUOpTest.static_graph - def build_model(self): - x = paddle.static.data(name=self.feed_list[0], - shape=self.feed_shape[0], - dtype='float32') - out = self.op(x, **self.op_attrs) - self.fetch_list = [out.name] - - def run_model(self, exec_mode): - self.run_op_test(exec_mode) - - def test(self): - for m in IPUOpTest.ExecutionMode: - if not self.skip_mode(m): - self.build_model() - self.run_model(m) - self.check() - - -class TestTanh(TestRelu): - - def set_test_op(self): - self.op = F.tanh - self.op_attrs = {} - - -class TestLog(TestRelu): - - def set_test_op(self): - self.op = paddle.fluid.layers.log - self.op_attrs = {} - - -class TestSigmoid(TestRelu): - - def set_test_op(self): - self.op = F.sigmoid - self.op_attrs = {} - - -class TestSqrt(TestRelu): - - def set_test_op(self): - self.op = paddle.fluid.layers.sqrt - self.op_attrs = {} - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_unary_ops_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_unary_ops_ipu.py new file mode 100644 index 0000000000000..eac32819f8232 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_unary_ops_ipu.py @@ -0,0 +1,233 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
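The new test_unary_ops_ipu.py below folds the deleted per-op activation tests into a single TestBase whose subclasses override only set_test_op. The same parameterize-by-subclass idea, stripped of the IPU harness and using NumPy stand-ins for the ops, looks like this:

import unittest

import numpy as np


class UnaryOpTestBase(unittest.TestCase):
    # Subclasses swap in a different elementwise op; nothing else changes.
    op = staticmethod(np.abs)

    def test_preserves_shape_and_dtype(self):
        x = np.random.uniform(0.1, 1.0, size=(1, 3, 10, 10)).astype(np.float32)
        out = type(self).op(x)
        self.assertEqual(out.shape, x.shape)
        self.assertEqual(out.dtype, x.dtype)


class TestExp(UnaryOpTestBase):
    op = staticmethod(np.exp)


class TestSqrt(UnaryOpTestBase):
    op = staticmethod(np.sqrt)


if __name__ == "__main__":
    unittest.main()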
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_test_op() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + + def set_test_op(self): + self.op = paddle.fluid.layers.abs + self.op_attrs = {} + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + out = self.op(x, **self.op_attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestAcos(TestBase): + + @property + def fp16_enabled(self): + return False + + def set_atol(self): + super().set_atol() + self.atol = 1e-6 + + def set_test_op(self): + self.op = paddle.fluid.layers.acos + self.op_attrs = {} + + +class TestAsin(TestAcos): + + def set_test_op(self): + self.op = paddle.fluid.layers.asin + self.op_attrs = {} + + +class TestSinh(TestAcos): + + def set_test_op(self): + self.op = paddle.fluid.layers.sinh + self.op_attrs = {} + + +class TestAtan(TestBase): + + def set_test_op(self): + self.op = paddle.fluid.layers.atan + self.op_attrs = {} + + +class TestCeil(TestBase): + + def set_test_op(self): + self.op = paddle.fluid.layers.ceil + self.op_attrs = {} + + +class TestCos(TestBase): + + def set_test_op(self): + self.op = paddle.fluid.layers.cos + self.op_attrs = {} + + +class TestCosh(TestBase): + + def set_test_op(self): + self.op = paddle.fluid.layers.cosh + self.op_attrs = {} + + +class TestErf(TestBase): + + def set_test_op(self): + self.op = paddle.fluid.layers.erf + self.op_attrs = {} + + +class TestExp(TestBase): + + def set_test_op(self): + self.op = paddle.fluid.layers.exp + self.op_attrs = {} + + +class TestFloor(TestBase): + + @property + def fp16_enabled(self): + return False + + def set_test_op(self): + self.op = paddle.fluid.layers.floor + self.op_attrs = {} + + +class TestLog(TestBase): + + def set_test_op(self): + self.op = paddle.fluid.layers.log + self.op_attrs = {} + + +class TestReciprocal(TestBase): + + def set_test_op(self): + self.op = paddle.fluid.layers.reciprocal + self.op_attrs = {} + + +class TestRelu(TestBase): + + def set_test_op(self): + self.op = paddle.fluid.layers.relu + self.op_attrs = {} + + +class TestRound(TestBase): + + def set_test_op(self): + self.op = paddle.fluid.layers.round + self.op_attrs = {} + + +class TestSigmoid(TestBase): + + def set_test_op(self): + self.op = paddle.fluid.layers.sigmoid + self.op_attrs = {} + + +class TestSign(TestBase): + + def set_test_op(self): + self.op = paddle.fluid.layers.sign + self.op_attrs = {} + + +class TestSin(TestBase): + + def set_test_op(self): + self.op = paddle.fluid.layers.sin + self.op_attrs = {} + + +class TestSoftplus(TestBase): + + def set_test_op(self): + self.op = paddle.fluid.layers.softplus + self.op_attrs = {} + + +class TestSoftsign(TestBase): + + def 
set_test_op(self): + self.op = paddle.fluid.layers.softsign + self.op_attrs = {} + + +class TestSqrt(TestBase): + + def set_test_op(self): + self.op = paddle.fluid.layers.sqrt + self.op_attrs = {} + + +class TestTan(TestBase): + + def set_test_op(self): + self.op = paddle.fluid.layers.tan + self.op_attrs = {} + + +class TestTanh(TestBase): + + def set_test_op(self): + self.op = paddle.fluid.layers.tanh + self.op_attrs = {} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py index 627a8fedff6aa..c2fa0e672729c 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py @@ -35,6 +35,9 @@ def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() self.model_path = os.path.join(self.temp_dir.name, "weight_decay") + def tearDown(self): + self.temp_dir.cleanup() + def set_atol(self): self.atol = 1e-6 diff --git a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt index 5c680c564f437..cac8e95521d31 100644 --- a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt @@ -12,18 +12,18 @@ string(REPLACE ".py" "" TEST_DIST_OPS "${TEST_DIST_OPS}") if(WITH_MLU) foreach(TEST_OP ${TEST_DIST_OPS}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) - endforeach(TEST_OP) + endforeach() list(REMOVE_ITEM TEST_OPS "test_spawn_mlu") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) - endforeach(TEST_OP) + endforeach() if(WITH_CNCL) list(APPEND TEST_DIST_OPS "test_spawn_mlu") foreach(TEST_OP ${TEST_DIST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) - endforeach(TEST_OP) + endforeach() bash_test_modules(test_launch_async_mlu START_BASH test_launch_async_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_launch_cloud_mlu START_BASH test_launch_cloud_mlu.sh @@ -50,5 +50,5 @@ if(WITH_MLU) set_tests_properties(test_collective_allgather_api_mlu PROPERTIES TIMEOUT 120) set_tests_properties(test_c_comm_init_op_mlu PROPERTIES TIMEOUT 120) - endif(WITH_CNCL) + endif() endif() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_clip_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_clip_op_mlu.py new file mode 100644 index 0000000000000..1a8f617be6de2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_clip_op_mlu.py @@ -0,0 +1,136 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
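test_clip_op_mlu.py below checks the MLU clip kernel against np.clip and, before gradient checking, nudges any sample that lies within max_relative_error of the clip bounds to an interior value, so the numeric gradient never straddles the non-differentiable kink of clip. The trick in isolation, NumPy only:

import numpy as np

max_relative_error = 0.006
min_v, max_v = 0.3, 0.8

x = np.random.random((4, 10, 10)).astype(np.float32)
# Move points that sit (almost) on a clip boundary to a safely interior value.
x[np.abs(x - min_v) < max_relative_error] = 0.5
x[np.abs(x - max_v) < max_relative_error] = 0.5

expected = np.clip(x, min_v, max_v)
# Compare against float32 bounds, since the clipped array is float32.
assert expected.min() >= np.float32(min_v)
assert expected.max() <= np.float32(max_v)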
+ +from __future__ import print_function + +import unittest +import sys + +sys.path.append("..") +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +from op_test import OpTest +from paddle.fluid.framework import _test_eager_guard + +paddle.enable_static() + + +class TestClipOp(OpTest): + + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.max_relative_error = 0.006 + self.python_api = paddle.clip + + self.inputs = {} + self.initTestCase() + + self.op_type = "clip" + self.attrs = {} + self.attrs['min'] = self.min + self.attrs['max'] = self.max + if 'Min' in self.inputs: + min_v = self.inputs['Min'] + else: + min_v = self.attrs['min'] + + if 'Max' in self.inputs: + max_v = self.inputs['Max'] + else: + max_v = self.attrs['max'] + + input = np.random.random(self.shape).astype(self.dtype) + input[np.abs(input - min_v) < self.max_relative_error] = 0.5 + input[np.abs(input - max_v) < self.max_relative_error] = 0.5 + self.inputs['X'] = input + self.outputs = {'Out': np.clip(self.inputs['X'], min_v, max_v)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def initTestCase(self): + self.dtype = np.float32 + self.shape = (4, 10, 10) + self.max = 0.8 + self.min = 0.3 + self.inputs['Max'] = np.array([0.8]).astype(self.dtype) + self.inputs['Min'] = np.array([0.1]).astype(self.dtype) + + +class TestCase1(TestClipOp): + + def initTestCase(self): + self.dtype = np.float32 + self.shape = (8, 16, 8) + self.max = 0.7 + self.min = 0.0 + + +class TestCase2(TestClipOp): + + def initTestCase(self): + self.dtype = np.float32 + self.shape = (8, 16) + self.max = 1.0 + self.min = 0.0 + + +class TestCase3(TestClipOp): + + def initTestCase(self): + self.dtype = np.float32 + self.shape = (4, 8, 16) + self.max = 0.7 + self.min = 0.2 + + +class TestCase4(TestClipOp): + + def initTestCase(self): + self.dtype = np.float32 + self.shape = (4, 8, 8) + self.max = 0.7 + self.min = 0.2 + self.inputs['Max'] = np.array([0.8]).astype(self.dtype) + self.inputs['Min'] = np.array([0.3]).astype(self.dtype) + + +class TestCase5(TestClipOp): + + def initTestCase(self): + self.dtype = np.float32 + self.shape = (4, 8, 16) + self.max = 0.5 + self.min = 0.5 + + +class TestCase6(TestClipOp): + + def initTestCase(self): + self.dtype = np.float16 + self.shape = (4, 8, 8) + self.max = 0.7 + self.min = 0.2 + self.inputs['Max'] = np.array([0.8]).astype(self.dtype) + self.inputs['Min'] = np.array([0.3]).astype(self.dtype) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_conv2d_transposed_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_conv2d_transposed_op_mlu.py new file mode 100644 index 0000000000000..08485978a5f64 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_conv2d_transposed_op_mlu.py @@ -0,0 +1,661 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.nn as nn + +paddle.enable_static() +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid.tests.unittests.op_test import OpTest + + +def conv2dtranspose_forward_naive(input_, filter_, attrs): + padding_algorithm = attrs['padding_algorithm'] + if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: + raise ValueError("Unknown Attr(padding_algorithm): '%s'. " + "It can only be 'SAME' or 'VALID'." % + str(padding_algorithm)) + + if attrs['data_format'] == 'NHWC': + input_ = np.transpose(input_, [0, 3, 1, 2]) + in_n, in_c, in_h, in_w = input_.shape + f_c, f_out_c, f_h, f_w = filter_.shape + groups = attrs['groups'] + assert in_c == f_c + out_c = f_out_c * groups + sub_in_c = in_c // groups + + stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[ + 'dilations'] + + # update pad and dilation + def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride): + padding = [] + for input_size, filter_size, stride_size in zip(input_shape, + kernel_size, + kernel_stride): + out_size = int((input_size + stride_size - 1) / stride_size) + pad_sum = np.max( + ((out_size - 1) * stride_size + filter_size - input_size, 0)) + pad_0 = int(pad_sum / 2) + pad_1 = int(pad_sum - pad_0) + padding.append(pad_0) + padding.append(pad_1) + return padding + + ksize = filter_.shape[2:4] + if padding_algorithm == "VALID": + pad = [0, 0, 0, 0] + elif padding_algorithm == "SAME": + dilations = [1, 1] + input_data_shape = input_.shape[2:4] + pad = _get_padding_with_SAME(input_data_shape, ksize, stride) + + pad_h_0, pad_h_1 = pad[0], pad[0] + pad_w_0, pad_w_1 = pad[1], pad[1] + if len(pad) == 4: + pad_h_0, pad_h_1 = pad[0], pad[1] + pad_w_0, pad_w_1 = pad[2], pad[3] + + d_bolck_h = dilations[0] * (f_h - 1) + 1 + d_bolck_w = dilations[1] * (f_w - 1) + 1 + out_h = (in_h - 1) * stride[0] + d_bolck_h + out_w = (in_w - 1) * stride[1] + d_bolck_w + if 'output_size' in attrs: + output_size = attrs['output_size'] + out_h = output_size[0] + pad_h_0 + pad_h_1 + out_w = output_size[1] + pad_w_0 + pad_w_1 + out_pad_h = 0 + out_pad_w = 0 + if 'output_padding' in attrs: + out_pad_h = attrs['output_padding'][0] + out_pad_w = attrs['output_padding'][1] + out = np.zeros((in_n, out_c, out_h + out_pad_h, out_w + out_pad_w), + dtype=input_.dtype) + + for n in range(in_n): + for i in range(in_h): + for j in range(in_w): + for g in range(groups): + input_masked = input_[n, g * sub_in_c:(g + 1) * sub_in_c, i, + j] # (c) + input_masked = np.reshape(input_masked, (sub_in_c, 1, 1)) + input_masked = np.tile(input_masked, (1, f_h, f_w)) + + for k in range(f_out_c): + tmp_out = np.sum( + input_masked * + filter_[g * sub_in_c:(g + 1) * sub_in_c, k, :, :], + axis=0) + i1, i2 = i * stride[0], i * stride[0] + d_bolck_h + j1, j2 = j * stride[1], j * stride[1] + d_bolck_w + out[n, g * f_out_c + k, i1:i2:dilations[0], + j1:j2:dilations[1]] += tmp_out + + out = out[:, :, pad_h_0:out_h - pad_h_1 + out_pad_h, + pad_w_0:out_w - pad_w_1 + out_pad_w] + if attrs['data_format'] == 'NHWC': + out = np.transpose(out, [0, 2, 3, 1]) + return out + + +class TestConv2DTransposeOp(OpTest): + + def setUp(self): + # init as conv transpose + self.dtype = np.float32 + self.set_mlu() + self.need_check_grad = True + self.is_test = False + self.use_cudnn = False + self.use_mkldnn = False + self.output_size = None + 
self.output_padding = [] + self.data_format = "NCHW" + self.pad = [0, 0] + self.padding_algorithm = "EXPLICIT" + self.init_op_type() + self.init_test_case() + + input_ = np.random.random(self.input_size).astype(self.dtype) + filter_ = np.random.random(self.filter_size).astype(self.dtype) + + self.inputs = {'Input': input_, 'Filter': filter_} + self.attrs = { + 'strides': self.stride, + 'paddings': self.pad, + 'padding_algorithm': self.padding_algorithm, + 'groups': self.groups, + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'is_test': self.is_test, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format + } + if self.output_size is not None: + self.attrs['output_size'] = self.output_size + + if len(self.output_padding) > 0: + self.attrs['output_padding'] = self.output_padding + + output = conv2dtranspose_forward_naive(input_, filter_, + self.attrs).astype(self.dtype) + + self.outputs = {'Output': output} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_no_input(self): + if self.need_check_grad: + self.check_grad_with_place(self.place, ['Filter'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Input'])) + + def test_check_grad_no_filter(self): + if self.need_check_grad: + self.check_grad_with_place(self.place, ['Input'], + 'Output', + no_grad_set=set(['Filter'])) + + def test_check_grad(self): + if self.need_check_grad: + self.check_grad_with_place(self.place, + set(['Input', 'Filter']), + 'Output', + max_relative_error=0.02) + + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + def init_op_type(self): + self.op_type = "conv2d_transpose" + + +class TestWithSymmetricPad(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + +class TestWithAsymmetricPad(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [1, 0, 1, 2] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + +class TestWithSAMEPad(TestConv2DTransposeOp): + + def init_test_case(self): + self.stride = [2, 1] + self.dilations = [1, 2] + self.groups = 1 + self.input_size = [2, 3, 6, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 4, 3] + self.padding_algorithm = 'SAME' + + +class TestWithVALIDPad(TestConv2DTransposeOp): + + def init_test_case(self): + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + self.padding_algorithm = 'VALID' + + +class TestWithGroups(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 2 + self.input_size = [2, 4, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 3, 3, 3] + + +class TestWithStride(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] 
+ self.filter_size = [f_c, 6, 3, 3] + + +class TestWithDilation(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.groups = 1 + self.dilations = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + +class TestWithEvenUpsample(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [2, 2] + self.stride = [2, 2] + self.groups = 1 + self.dilations = [1, 1] + self.output_size = [14, 14] + self.input_size = [2, 3, 7, 7] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 5, 5] + + +class TestWithEvenUpsampleOutputPadding(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [2, 2] + self.stride = [2, 2] + self.groups = 1 + self.dilations = [1, 1] + self.output_padding = [1, 1] + self.input_size = [2, 3, 7, 7] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 5, 5] + + +class Test_NHWC(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 5, 5, 3] # NHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3] + self.data_format = 'NHWC' + + +class TestWithSymmetricPad_NHWC(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 5, 5, 3] # NHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3] + self.data_format = 'NHWC' + + +class TestWithAsymmetricPad_NHWC(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [1, 0, 1, 2] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 5, 5, 3] # NHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3] + self.data_format = 'NHWC' + + +class TestWithGroups_NHWC(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 2 + self.input_size = [2, 5, 5, 4] # NHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 3, 3, 3] + self.data_format = 'NHWC' + + +class TestWithStride_NHWC(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 5, 5, 3] # NCHW + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3] + self.data_format = 'NHWC' + + +class TestWithDilation_NHWC(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.groups = 1 + self.dilations = [2, 2] + self.input_size = [2, 5, 5, 3] # NHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3] + self.data_format = 'NHWC' + + +class TestWithEvenUpsample_NHWC(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [2, 2] + self.stride = [2, 2] + self.groups = 1 + self.dilations = [1, 1] + self.output_size = [14, 14] + self.input_size = [2, 7, 7, 3] # NHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 5, 5] + self.data_format = 'NHWC' + + +class TestWithEvenUpsample_NHWC_output_padding(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [2, 2] + self.stride = [2, 2] + self.groups = 1 + self.dilations = [1, 1] + self.output_padding = [1, 1] + self.input_size = [2, 7, 7, 3] # NHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 5, 5] + self.data_format = 'NHWC' + + +class TestMLU_FP16(TestConv2DTransposeOp): + + def init_test_case(self): + 
self.dtype = np.float16 + self.set_mlu() + self.pad = [1, 1] + self.stride = [1, 1] + self.groups = 1 + self.dilations = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_op_type(self): + self.need_check_grad = False + self.op_type = "conv2d_transpose" + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-2) + + +class TestMLU_NHWC_FP16(TestMLU_FP16): + + def init_test_case(self): + self.dtype = np.float16 + self.pad = [0, 0] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 5, 5, 3] # NHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3] + self.data_format = 'NHWC' + + +class TestMLUWithGroups_NHWC_FP16(TestMLU_FP16): + + def init_test_case(self): + self.dtype = np.float16 + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 2 + self.input_size = [2, 5, 5, 4] # NCHW + f_c = self.input_size[-1] + self.filter_size = [f_c, 3, 3, 3] + self.data_format = 'NHWC' + + +class TestMLUWithEvenUpsample_NHWC_FP16(TestMLU_FP16): + + def init_test_case(self): + self.dtype = np.float16 + self.pad = [2, 2] + self.stride = [2, 2] + self.groups = 1 + self.dilations = [1, 1] + self.output_size = [14, 14] + self.input_size = [2, 7, 7, 3] # NHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 5, 5] + self.data_format = 'NHWC' + + +class TestConv2DTransposeAPI(unittest.TestCase): + + def setUp(self): + self.set_mlu() + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_case1(self): + data1 = fluid.layers.data(name='data1', + shape=[3, 5, 5], + dtype='float32') + data2 = fluid.layers.data(name='data2', + shape=[5, 5, 3], + dtype='float32') + out1 = fluid.layers.conv2d_transpose(input=data1, + groups=1, + num_filters=6, + filter_size=3, + data_format='NCHW') + out2 = fluid.layers.conv2d_transpose(input=data2, + groups=1, + num_filters=6, + filter_size=3, + data_format='NHWC') + out3 = fluid.layers.conv2d_transpose(input=data1, + groups=1, + num_filters=6, + filter_size=3, + padding=[[0, 0], [1, 1], [1, 1], + [0, 0]], + data_format='NHWC') + out4 = fluid.layers.conv2d_transpose(input=data1, + groups=3, + num_filters=6, + filter_size=3, + padding=[[0, 0], [0, 0], [2, 1], + [0, 0]], + data_format='NCHW') + out5 = fluid.layers.conv2d_transpose(input=data2, + groups=1, + num_filters=6, + filter_size=3, + padding='SAME', + data_format='NCHW') + out6 = fluid.layers.conv2d_transpose(input=data1, + groups=1, + num_filters=6, + filter_size=3, + padding='VALID', + data_format='NHWC') + out7 = fluid.layers.conv2d_transpose(input=data1, + groups=1, + num_filters=6, + output_size=[7, 7], + padding=[0, 0], + data_format='NHWC') + + data1_np = np.random.random((2, 3, 5, 5)).astype("float32") + data2_np = np.random.random((2, 5, 5, 3)).astype("float32") + + exe = fluid.Executor(self.place) + exe.run(fluid.default_startup_program()) + results = exe.run(fluid.default_main_program(), + feed={ + "data1": data1_np, + "data2": data2_np + }, + fetch_list=[out1, out2, out3, out4, out5, out6, out7], + return_numpy=True) + self.assertIsNotNone(results[0]) + self.assertIsNotNone(results[1]) + self.assertIsNotNone(results[2]) + self.assertIsNotNone(results[3]) + self.assertIsNotNone(results[4]) + self.assertIsNotNone(results[5]) + self.assertIsNotNone(results[6]) + + +class 
TestConv2DTransposeOpException(unittest.TestCase): + + def setUp(self): + self.set_mlu() + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_exception(self): + data = fluid.layers.data(name='data', shape=[3, 5, 5], dtype="float32") + + def attr_data_format(): + out = fluid.layers.conv2d_transpose(input=data, + groups=1, + num_filters=6, + filter_size=3, + data_format="NCDHW") + + self.assertRaises(ValueError, attr_data_format) + + def attr_padding_str(): + out = fluid.layers.conv2d_transpose(input=data, + groups=1, + num_filters=6, + filter_size=3, + padding='Vald') + + self.assertRaises(ValueError, attr_padding_str) + + def attr_padding_list(): + out = fluid.layers.conv2d_transpose(input=data, + groups=1, + num_filters=6, + filter_size=3, + padding=[[1, 1], [1, 1], [0, 0], + [0, 0]]) + + self.assertRaises(ValueError, attr_padding_list) + + def attr_padding_with_data_format(): + out = fluid.layers.conv2d_transpose(input=data, + groups=1, + num_filters=6, + filter_size=3, + padding=[[1, 1], [0, 0], [0, 0], + [1, 1]], + data_format='NHWC') + + self.assertRaises(ValueError, attr_padding_with_data_format) + + error_input = fluid.layers.data(name='error_data', + shape=[1], + dtype="float32") + + def error_input_size(): + out = fluid.layers.conv2d_transpose(input=error_input, + groups=1, + num_filters=6, + filter_size=3) + + self.assertRaises(ValueError, error_input_size) + + def error_groups(): + out = fluid.layers.conv2d_transpose(input=data, + groups=0, + num_filters=6, + filter_size=3, + data_format='NHWC') + + self.assertRaises(ValueError, error_groups) + + +class TestConv2DTransposeRepr(unittest.TestCase): + + def setUp(self): + self.set_mlu() + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_case(self): + paddle.disable_static() + x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) + conv = nn.Conv2DTranspose(4, 6, (3, 3), output_padding=1, stride=2) + print(conv) + y_var = conv(x_var) + y_np = y_var.numpy() + self.assertIsNotNone(y_np) + paddle.enable_static() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_gather_nd_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_gather_nd_op_mlu.py new file mode 100644 index 0000000000000..deee1a38b3101 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_gather_nd_op_mlu.py @@ -0,0 +1,309 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import sys + +sys.path.append('..') +import numpy as np +from op_test import OpTest +import paddle.fluid as fluid +import paddle + +paddle.enable_static() + + +def gather_nd_grad(x, index): + # for TestGatherNdOpWithLowIndex + dout_shape = index.shape[:-1] + x.shape[index.shape[-1]:] + numel = 1 + for i in dout_shape: + numel = numel * i + dout = np.full(dout_shape, 1. 
/ numel) + dx = np.full_like(x, 0) + + index = tuple(index.reshape(-1, index.shape[-1]).T) + np.add.at(dx, index, dout) + + return dx + + +def test_class1(op_type, typename): + + class TestGatherNdOpWithEmptyIndex(OpTest): + # Index has empty element, which means copy entire tensor + + def setUp(self): + self.set_mlu() + self.op_type = "gather_nd" + self.python_api = paddle.gather_nd + xnp = np.random.random((5, 20)).astype(typename) + self.inputs = { + 'X': xnp, + 'Index': np.array([[], []]).astype("int32") + } + self.outputs = { + 'Out': np.vstack((xnp[np.newaxis, :], xnp[np.newaxis, :])) + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + if typename == "float16": + self.__class__.no_need_check_grad = True + else: + self.check_grad_with_place(self.place, ['X'], 'Out') + + cls_name = "{0}_{1}_1".format(op_type, typename) + TestGatherNdOpWithEmptyIndex.__name__ = cls_name + globals()[cls_name] = TestGatherNdOpWithEmptyIndex + + +def test_class2(op_type, typename): + + class TestGatherNdOpWithIndex1(OpTest): + + def setUp(self): + self.set_mlu() + self.op_type = "gather_nd" + self.python_api = paddle.gather_nd + xnp = np.random.random((5, 20)).astype(typename) + self.inputs = {'X': xnp, 'Index': np.array([1]).astype("int32")} + self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + if typename == "float16": + self.__class__.no_need_check_grad = True + else: + self.check_grad_with_place(self.place, ['X'], 'Out') + + cls_name = "{0}_{1}_2".format(op_type, typename) + TestGatherNdOpWithIndex1.__name__ = cls_name + globals()[cls_name] = TestGatherNdOpWithIndex1 + + +def test_class3(op_type, typename): + + class TestGatherNdOpWithLowIndex(OpTest): + #Index has low rank, X has high rank + + def setUp(self): + self.set_mlu() + self.op_type = "gather_nd" + self.python_api = paddle.gather_nd + xnp = np.random.uniform(0, 100, (10, 10)).astype(typename) + index = np.array([[1], [2]]).astype("int64") + + self.inputs = {'X': xnp, 'Index': index} + self.outputs = {'Out': xnp[tuple(index.T)]} + self.x_grad = gather_nd_grad(xnp, index) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + if typename == "float16": + self.__class__.no_need_check_grad = True + else: + self.check_grad_with_place(self.place, ['X'], + 'Out', + user_defined_grads=[self.x_grad]) + + cls_name = "{0}_{1}_3".format(op_type, typename) + TestGatherNdOpWithLowIndex.__name__ = cls_name + globals()[cls_name] = TestGatherNdOpWithLowIndex + + +def test_class4(op_type, typename): + + class TestGatherNdOpIndex1(OpTest): + #Index has low rank, X has high rank + + def setUp(self): + self.set_mlu() + self.op_type = "gather_nd" + self.python_api = paddle.gather_nd + xnp = np.random.uniform(0, 100, (10, 10)).astype(typename) + index = np.array([1, 2]).astype("int32") + + self.inputs = {'X': xnp, 'Index': index} + + self.outputs = {'Out': xnp[tuple(index.T)]} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def 
test_check_grad(self): + if typename == "float16": + self.__class__.no_need_check_grad = True + else: + self.check_grad_with_place(self.place, ['X'], 'Out') + + cls_name = "{0}_{1}_4".format(op_type, typename) + TestGatherNdOpIndex1.__name__ = cls_name + globals()[cls_name] = TestGatherNdOpIndex1 + + +def test_class5(op_type, typename): + + class TestGatherNdOpWithSameIndexAsX(OpTest): + #Index has same rank as X's rank + + def setUp(self): + self.set_mlu() + self.op_type = "gather_nd" + self.python_api = paddle.gather_nd + xnp = np.random.uniform(0, 100, (10, 10)).astype(typename) + index = np.array([[1, 1], [2, 1]]).astype("int64") + + self.inputs = {'X': xnp, 'Index': index} + self.outputs = {'Out': xnp[tuple(index.T)]} #[25, 22] + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + if typename == "float16": + self.__class__.no_need_check_grad = True + else: + self.check_grad_with_place(self.place, ['X'], 'Out') + + cls_name = "{0}_{1}_5".format(op_type, typename) + TestGatherNdOpWithSameIndexAsX.__name__ = cls_name + globals()[cls_name] = TestGatherNdOpWithSameIndexAsX + + +def test_class6(op_type, typename): + + class TestGatherNdOpWithHighRankSame(OpTest): + #Both Index and X have high rank, and Rank(Index) = Rank(X) + + def setUp(self): + self.set_mlu() + self.op_type = "gather_nd" + self.python_api = paddle.gather_nd + shape = (5, 2, 3, 1, 10) + xnp = np.random.rand(*shape).astype(typename) + index = np.vstack([np.random.randint(0, s, size=2) + for s in shape]).T + + self.inputs = {'X': xnp, 'Index': index.astype("int32")} + self.outputs = {'Out': xnp[tuple(index.T)]} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + if typename == "float16": + self.__class__.no_need_check_grad = True + else: + self.check_grad_with_place(self.place, ['X'], 'Out') + + cls_name = "{0}_{1}_6".format(op_type, typename) + TestGatherNdOpWithHighRankSame.__name__ = cls_name + globals()[cls_name] = TestGatherNdOpWithHighRankSame + + +def test_class7(op_type, typename): + + class TestGatherNdOpWithHighRankDiff(OpTest): + #Both Index and X have high rank, and Rank(Index) < Rank(X) + + def setUp(self): + self.set_mlu() + self.op_type = "gather_nd" + self.python_api = paddle.gather_nd + shape = (2, 3, 4, 1, 10) + xnp = np.random.rand(*shape).astype(typename) + index = np.vstack( + [np.random.randint(0, s, size=200) for s in shape]).T + index_re = index.reshape([20, 5, 2, 5]) + + self.inputs = {'X': xnp, 'Index': index_re.astype("int32")} + self.outputs = {'Out': xnp[tuple(index.T)].reshape([20, 5, 2])} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + if typename == "float16": + self.__class__.no_need_check_grad = True + else: + self.check_grad_with_place(self.place, ['X'], 'Out') + + cls_name = "{0}_{1}_7".format(op_type, typename) + TestGatherNdOpWithHighRankDiff.__name__ = cls_name + globals()[cls_name] = TestGatherNdOpWithHighRankDiff + + +#Test Python API +class TestGatherNdAPI2(unittest.TestCase): + + def test_imperative(self): + paddle.disable_static() + input_1 = np.array([[1, 2], [3, 4], [5, 6]]).astype("float32") + index_1 = 
np.array([[1]]).astype("int32") + input = fluid.dygraph.to_variable(input_1) + index = fluid.dygraph.to_variable(index_1) + output = paddle.fluid.layers.gather(input, index) + output_np = output.numpy() + expected_output = np.array([3, 4]) + self.assertTrue(np.allclose(output_np, expected_output)) + paddle.enable_static() + + +for _typename in {'float16', 'float32'}: + test_class1('gather_nd', _typename) + test_class2('gather_nd', _typename) + test_class3('gather_nd', _typename) + test_class4('gather_nd', _typename) + test_class5('gather_nd', _typename) + test_class6('gather_nd', _typename) + test_class7('gather_nd', _typename) + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_meshgrid_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_meshgrid_op_mlu.py new file mode 100644 index 0000000000000..5a5a60087290d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_meshgrid_op_mlu.py @@ -0,0 +1,205 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys + +sys.path.append('..') +from op_test import OpTest, skip_check_grad_ci +import paddle.fluid as fluid +import paddle +from paddle.fluid import compiler, Program, program_guard, core +from paddle.fluid.framework import _test_eager_guard + +paddle.enable_static() + + +class TestMeshgridOp(OpTest): + + def setUp(self): + self.op_type = "meshgrid" + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.dtype = self.get_dtype() + ins, outs = self.init_test_data() + self.inputs = {'X': [('x%d' % i, ins[i]) for i in range(len(ins))]} + self.outputs = { + 'Out': [('out%d' % i, outs[i]) for i in range(len(outs))] + } + + def get_dtype(self): + return "float32" + + def test_check_output(self): + self.check_output_with_place(self.place) + + def init_test_data(self): + self.shape = self.get_x_shape() + ins = [] + outs = [] + for i in range(len(self.shape)): + ins.append(np.random.random((self.shape[i], )).astype(self.dtype)) + + for i in range(len(self.shape)): + out_reshape = [1] * len(self.shape) + out_reshape[i] = self.shape[i] + out_temp = np.reshape(ins[i], out_reshape) + outs.append(np.broadcast_to(out_temp, self.shape)) + return ins, outs + + def get_x_shape(self): + return [100, 200] + + +class TestMeshgridOp2(TestMeshgridOp): + + def get_x_shape(self): + return [100, 300] + + +class TestMeshgridOp3(unittest.TestCase): + + def test_api(self): + x = fluid.data(shape=[100], dtype='int32', name='x') + y = fluid.data(shape=[200], dtype='int32', name='y') + + input_1 = np.random.randint(0, 100, [ + 100, + ]).astype('int32') + input_2 = np.random.randint(0, 100, [ + 200, + ]).astype('int32') + + out_1 = np.reshape(input_1, [100, 1]) + out_1 = np.broadcast_to(out_1, [100, 200]) + out_2 = np.reshape(input_2, [1, 200]) + out_2 = np.broadcast_to(out_2, [100, 200]) + + exe = fluid.Executor(place=fluid.MLUPlace(0)) + 
grid_x, grid_y = paddle.tensor.meshgrid(x, y) + res_1, res_2 = exe.run(fluid.default_main_program(), + feed={ + 'x': input_1, + 'y': input_2 + }, + fetch_list=[grid_x, grid_y]) + assert np.array_equal(res_1, out_1) + assert np.array_equal(res_2, out_2) + + +class TestMeshgridOp4(unittest.TestCase): + + def test_list_input(self): + x = fluid.data(shape=[100], dtype='int32', name='x') + y = fluid.data(shape=[200], dtype='int32', name='y') + + input_1 = np.random.randint(0, 100, [ + 100, + ]).astype('int32') + input_2 = np.random.randint(0, 100, [ + 200, + ]).astype('int32') + + out_1 = np.reshape(input_1, [100, 1]) + out_1 = np.broadcast_to(out_1, [100, 200]) + out_2 = np.reshape(input_2, [1, 200]) + out_2 = np.broadcast_to(out_2, [100, 200]) + + exe = fluid.Executor(place=fluid.MLUPlace(0)) + grid_x, grid_y = paddle.tensor.meshgrid([x, y]) + res_1, res_2 = exe.run(fluid.default_main_program(), + feed={ + 'x': input_1, + 'y': input_2 + }, + fetch_list=[grid_x, grid_y]) + + assert np.array_equal(res_1, out_1) + assert np.array_equal(res_2, out_2) + + +class TestMeshgridOp5(unittest.TestCase): + + def test_tuple_input(self): + x = fluid.data(shape=[100], dtype='int32', name='x') + y = fluid.data(shape=[200], dtype='int32', name='y') + + input_1 = np.random.randint(0, 100, [ + 100, + ]).astype('int32') + input_2 = np.random.randint(0, 100, [ + 200, + ]).astype('int32') + + out_1 = np.reshape(input_1, [100, 1]) + out_1 = np.broadcast_to(out_1, [100, 200]) + out_2 = np.reshape(input_2, [1, 200]) + out_2 = np.broadcast_to(out_2, [100, 200]) + + exe = fluid.Executor(place=fluid.MLUPlace(0)) + grid_x, grid_y = paddle.tensor.meshgrid((x, y)) + res_1, res_2 = exe.run(fluid.default_main_program(), + feed={ + 'x': input_1, + 'y': input_2 + }, + fetch_list=[grid_x, grid_y]) + + assert np.array_equal(res_1, out_1) + assert np.array_equal(res_2, out_2) + + +class TestMeshgridOp7(unittest.TestCase): + + def test_api_with_dygraph_list_input(self): + input_3 = np.random.randint(0, 100, [ + 100, + ]).astype('int32') + input_4 = np.random.randint(0, 100, [ + 200, + ]).astype('int32') + + with fluid.dygraph.guard(): + tensor_3 = fluid.dygraph.to_variable(input_3) + tensor_4 = fluid.dygraph.to_variable(input_4) + res_3, res_4 = paddle.tensor.meshgrid([tensor_3, tensor_4]) + + assert np.array_equal(res_3.shape, [100, 200]) + assert np.array_equal(res_4.shape, [100, 200]) + + +class TestMeshgridOp8(unittest.TestCase): + + def test_api_with_dygraph_tuple_input(self): + input_3 = np.random.randint(0, 100, [ + 100, + ]).astype('int32') + input_4 = np.random.randint(0, 100, [ + 200, + ]).astype('int32') + + with fluid.dygraph.guard(): + tensor_3 = fluid.dygraph.to_variable(input_3) + tensor_4 = fluid.dygraph.to_variable(input_4) + res_3, res_4 = paddle.tensor.meshgrid((tensor_3, tensor_4)) + + assert np.array_equal(res_3.shape, [100, 200]) + assert np.array_equal(res_4.shape, [100, 200]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_randperm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_randperm_op_mlu.py new file mode 100644 index 0000000000000..445dc449236b3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_randperm_op_mlu.py @@ -0,0 +1,222 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import sys + +sys.path.append('..') +from op_test import OpTest +import paddle +import paddle.fluid.core as core +from paddle.static import program_guard, Program +import os + +paddle.enable_static() + + +def check_randperm_out(n, data_np): + assert isinstance(data_np, np.ndarray), \ + "The input data_np should be np.ndarray." + gt_sorted = np.arange(n) + out_sorted = np.sort(data_np) + return list(gt_sorted == out_sorted) + + +def error_msg(data_np): + return "The sorted ground truth and sorted out should " + \ + "be equal, out = " + str(data_np) + + +def convert_dtype(dtype_str): + dtype_str_list = ["int32", "int64", "float32", "float64"] + dtype_num_list = [ + core.VarDesc.VarType.INT32, core.VarDesc.VarType.INT64, + core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP64 + ] + assert dtype_str in dtype_str_list, dtype_str + \ + " should in " + str(dtype_str_list) + return dtype_num_list[dtype_str_list.index(dtype_str)] + + +class TestRandpermOp(OpTest): + """ Test randperm op.""" + + def setUp(self): + self.op_type = "randperm" + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.python_api = paddle.randperm + self.n = 200 + self.dtype = "int64" + + self.inputs = {} + self.outputs = {"Out": np.zeros((self.n)).astype(self.dtype)} + self.init_attrs() + self.attrs = { + "n": self.n, + "dtype": convert_dtype(self.dtype), + } + + def init_attrs(self): + pass + + def test_check_output(self): + self.check_output_with_place_customized(self.verify_output, self.place) + + def verify_output(self, outs): + out_np = np.array(outs[0]) + self.assertTrue(check_randperm_out(self.n, out_np), + msg=error_msg(out_np)) + + +class TestRandpermOpN(TestRandpermOp): + + def init_attrs(self): + self.n = 10000 + + +class TestRandpermOpInt32(TestRandpermOp): + + def init_attrs(self): + self.dtype = "int32" + + +class TestRandpermOpFloat32(TestRandpermOp): + + def init_attrs(self): + self.dtype = "float32" + + +class TestRandpermOpFloat64(TestRandpermOp): + + def init_attrs(self): + self.dtype = "float64" + + +class TestRandpermOpError(unittest.TestCase): + + def test_errors(self): + with program_guard(Program(), Program()): + self.assertRaises(ValueError, paddle.randperm, -3) + self.assertRaises(TypeError, paddle.randperm, 10, 'int8') + + +class TestRandpermAPI(unittest.TestCase): + + def test_out(self): + n = 10 + place = paddle.MLUPlace(0) + with program_guard(Program(), Program()): + x1 = paddle.randperm(n) + x2 = paddle.randperm(n, 'float32') + + exe = paddle.static.Executor(place) + res = exe.run(fetch_list=[x1, x2]) + + self.assertEqual(res[0].dtype, np.int64) + self.assertEqual(res[1].dtype, np.float32) + self.assertTrue(check_randperm_out(n, res[0])) + self.assertTrue(check_randperm_out(n, res[1])) + + +class TestRandpermImperative(unittest.TestCase): + + def test_out(self): + paddle.disable_static() + n = 10 + for dtype in ['int32', np.int64, 'float32', 'float64']: + data_p = paddle.randperm(n, dtype) + data_np = data_p.numpy() + self.assertTrue(check_randperm_out(n, data_np), + msg=error_msg(data_np)) + paddle.enable_static() + + 
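+
+# The fixed-random-number check below exercises the CUDA Philox sequence
+# ('curandStatePhilox4_32_10_t') and therefore only runs on CUDA builds; on
+# MLU-only builds it returns early because paddle.is_compiled_with_cuda() is False.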
+class TestRandomValue(unittest.TestCase): + + def test_fixed_random_number(self): + # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' + if not paddle.is_compiled_with_cuda(): + return + + print("Test Fixed Random number on GPU------>") + paddle.disable_static() + paddle.set_device('gpu') + paddle.seed(2021) + + x = paddle.randperm(30000, dtype='int32').numpy() + expect = [ + 24562, 8409, 9379, 10328, 20503, 18059, 9681, 21883, 11783, 27413 + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 29477, 27100, 9643, 16637, 8605, 16892, 27767, 2724, 1612, 13096 + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [ + 298, 4104, 16479, 22714, 28684, 7510, 14667, 9950, 15940, 28343 + ] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + + x = paddle.randperm(30000, dtype='int64').numpy() + expect = [ + 6587, 1909, 5525, 23001, 6488, 14981, 14355, 3083, 29561, 8171 + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 23460, 12394, 22501, 5427, 20185, 9100, 5127, 1651, 25806, 4818 + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [5829, 4508, 16193, 24836, 8526, 242, 9984, 9243, 1977, 11839] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + + x = paddle.randperm(30000, dtype='float32').numpy() + expect = [ + 5154., 10537., 14362., 29843., 27185., 28399., 27561., 4144., + 22906., 10705. + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 1958., 18414., 20090., 21910., 22746., 27346., 22347., 3002., 4564., + 26991. + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [ + 25580., 12606., 553., 16387., 29536., 4241., 20946., 16899., 16339., + 4662. + ] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + + x = paddle.randperm(30000, dtype='float64').numpy() + expect = [ + 19051., 2449., 21940., 11121., 282., 7330., 13747., 24321., 21147., + 9163. + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 15483., 1315., 5723., 20954., 13251., 25539., 5074., 1823., 14945., + 17624. + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [ + 10516., 2552., 29970., 5941., 986., 8007., 24805., 26753., 12202., + 21404. + ] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + paddle.enable_static() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_range_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_range_op_mlu.py new file mode 100644 index 0000000000000..f87bd2e85da69 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_range_op_mlu.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
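+
+# A minimal sketch of the Python API backed by the `range` op (illustrative):
+#   paddle.arange(0, 5, 1, dtype='float32')  -> [0., 1., 2., 3., 4.]
+# Only forward outputs are checked below, against np.arange.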
+ +from __future__ import print_function +import sys + +sys.path.append("..") +import paddle +import unittest +import numpy as np +from op_test import OpTest +from functools import partial + +paddle.enable_static() + + +def arange_wrapper(start, end, step, dtype=None): + return paddle.arange(start, end, step, dtype) + + +class TestRangeOp(OpTest): + + def setUp(self): + self.op_type = "range" + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.init_config() + self.inputs = { + 'Start': np.array([self.case[0]]).astype(self.dtype), + 'End': np.array([self.case[1]]).astype(self.dtype), + 'Step': np.array([self.case[2]]).astype(self.dtype) + } + + self.outputs = { + 'Out': + np.arange(self.case[0], self.case[1], + self.case[2]).astype(self.dtype) + } + + def init_config(self): + self.dtype = np.float32 + self.python_api = partial(arange_wrapper, dtype=self.dtype) + self.case = (0, 1, 0.2) + + def test_check_output(self): + self.check_output_with_place(self.place, check_eager=False) + + +class TestFloatRangeOpCase0(TestRangeOp): + + def init_config(self): + self.dtype = np.float32 + self.python_api = partial(arange_wrapper, dtype=self.dtype) + self.case = (0, 5, 1) + + +class TestInt32RangeOpCase0(TestRangeOp): + + def init_config(self): + self.dtype = np.int32 + self.python_api = partial(arange_wrapper, dtype=self.dtype) + self.case = (0, 5, 2) + + +class TestInt32RangeOpCase1(TestRangeOp): + + def init_config(self): + self.dtype = np.int32 + self.python_api = partial(arange_wrapper, dtype=self.dtype) + self.case = (10, 1, -2) + + +class TestInt32RangeOpCase2(TestRangeOp): + + def init_config(self): + self.dtype = np.int32 + self.python_api = partial(arange_wrapper, dtype=self.dtype) + self.case = (-1, -10, -2) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_reduce_prod_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_reduce_prod_op_mlu.py new file mode 100644 index 0000000000000..b1efc5e7bb84a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_reduce_prod_op_mlu.py @@ -0,0 +1,99 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
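+
+# A minimal sketch of the API backed by `reduce_prod` (illustrative values):
+#   paddle.prod(paddle.to_tensor([[1., 2.], [3., 4.]]), axis=0)  -> [3., 8.]
+# Reference outputs below come from numpy's ndarray.prod over the same axes,
+# and only forward checks are registered in this file.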
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys + +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + + +def raw_reduce_prod(x, dim=[0], keep_dim=False): + return paddle.prod(x, dim, keep_dim) + + +class TestProdOp(OpTest): + + def setUp(self): + self.op_type = "reduce_prod" + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.python_api = raw_reduce_prod + self.init_data_type() + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.data_type)} + self.outputs = {'Out': self.inputs['X'].prod(axis=0)} + + def init_data_type(self): + self.data_type = "float32" + + def test_check_output(self): + self.check_output_with_place(self.place, check_eager=False) + + +class TestProd6DOp(OpTest): + + def setUp(self): + self.op_type = "reduce_prod" + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.python_api = raw_reduce_prod + self.init_data_type() + self.inputs = { + 'X': np.random.random((5, 6, 2, 3, 4, 2)).astype(self.data_type) + } + self.attrs = {'dim': [2, 3, 4]} + self.outputs = { + 'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim'])) + } + + def init_data_type(self): + self.data_type = "float32" + + def test_check_output(self): + self.check_output_with_place(self.place, check_eager=False) + + +class TestProd8DOp(OpTest): + + def setUp(self): + self.op_type = "reduce_prod" + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.python_api = raw_reduce_prod + self.init_data_type() + self.inputs = { + 'X': np.random.random( + (2, 5, 3, 2, 2, 3, 4, 2)).astype(self.data_type) + } + self.attrs = {'dim': [2, 3, 4]} + self.outputs = { + 'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim'])) + } + + def init_data_type(self): + self.data_type = "float32" + + def test_check_output(self): + self.check_output_with_place(self.place, check_eager=False) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_scatter_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_scatter_op_mlu.py new file mode 100644 index 0000000000000..0725a27e5125a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_scatter_op_mlu.py @@ -0,0 +1,247 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
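+
+# A minimal sketch of the API under test (mirrors the data used in
+# TestScatterAPI below):
+#   x = paddle.to_tensor([[1., 1.], [2., 2.], [3., 3.]])
+#   index = paddle.to_tensor([2, 1, 0, 1], dtype='int64')
+#   updates = paddle.to_tensor([[1., 1.], [2., 2.], [3., 3.], [4., 4.]])
+#   paddle.scatter(x, index, updates, overwrite=False)
+#   -> [[3., 3.], [6., 6.], [1., 1.]]
+# With overwrite=False, rows addressed by duplicate indices accumulate.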
+ +from __future__ import print_function +import sys + +sys.path.append("..") +import unittest +import numpy as np +import os +import paddle +import paddle.fluid as fluid +from op_test import OpTest +import paddle.fluid.core as core +from paddle.fluid.dygraph.base import switch_to_static_graph + + +class TestScatterOp(OpTest): + + def setUp(self): + self.op_type = "scatter" + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.python_api = paddle.scatter + ref_np = np.ones((3, 50)).astype("float32") + index_np = np.array([1, 2]).astype("int32") + updates_np = np.random.random((2, 50)).astype("float32") + output_np = np.copy(ref_np) + output_np[index_np] = updates_np + self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(self.place, check_eager=False) + + def test_check_grad(self): + self.check_grad(["X", "Updates"], "Out", check_eager=False) + + +class TestScatterOp0(OpTest): + + def setUp(self): + self.op_type = "scatter" + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.python_api = paddle.scatter + ref_np = np.ones((3, 3)).astype("float32") + index_np = np.array([1, 2]).astype("int32") + updates_np = np.random.random((2, 3)).astype("float32") + output_np = np.copy(ref_np) + output_np[index_np] = updates_np + self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} + self.attrs = {'overwrite': True} + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(self.place, check_eager=False) + + def test_check_grad(self): + self.check_grad(["X", "Updates"], "Out", check_eager=False) + + +class TestScatterOp1(OpTest): + + def setUp(self): + self.op_type = "scatter" + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.python_api = paddle.scatter + ref_np = np.ones((3, 3)).astype("float32") + zeros_np = np.zeros([2, 3]).astype('float32') + index_np = np.array([1, 1]).astype("int32") + updates_np = np.random.random((2, 3)).astype("float32") + output_np = np.copy(ref_np) + output_np[index_np] = zeros_np + for i in range(0, len(index_np)): + output_np[index_np[i]] += updates_np[i] + self.attrs = {'overwrite': False} + self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(self.place, check_eager=False) + + def test_check_grad(self): + self.check_grad(["X", "Updates"], "Out", check_eager=False) + + +class TestScatterOp2(OpTest): + + def setUp(self): + self.op_type = "scatter" + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.python_api = paddle.scatter + ref_np = np.ones((3, 3)).astype("float32") + index_np = np.array([1, 2]).astype("int64") + updates_np = np.random.random((2, 3)).astype("float32") + output_np = np.copy(ref_np) + output_np[index_np] = updates_np + self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(self.place, check_eager=False) + + def test_check_grad(self): + self.check_grad(['X', 'Updates'], 'Out', check_eager=False) + + +class TestScatterAPI(unittest.TestCase): + + def setUp(self): + self.places = [paddle.device.MLUPlace(0)] + self.__class__.use_mlu = True + self.executed_api() + + def executed_api(self): + self.scatter = paddle.scatter + + def check_static_result(self, place): 
+ with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data(name="input", shape=[3, 2], dtype="float32") + index = fluid.data(name="index", shape=[4], dtype="int64") + updates = fluid.data(name="updates", shape=[4, 2], dtype="float32") + result = self.scatter(input, index, updates, False) + + input_data = np.array([[1, 1], [2, 2], [3, 3]]).astype(np.float32) + index_data = np.array([2, 1, 0, 1]).astype(np.int64) + updates_data = np.array([[1, 1], [2, 2], [3, 3], + [4, 4]]).astype(np.float32) + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={ + "input": input_data, + "index": index_data, + "updates": updates_data + }, + fetch_list=[result]) + self.assertEqual((fetches[0] == \ + np.array([[3., 3.],[6., 6.],[1., 1.]])).all(), True) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + x_data = np.array([[1, 1], [2, 2], [3, 3]]).astype(np.float32) + index_data = np.array([2, 1, 0, 1]).astype(np.int64) + updates_data = np.array([[1, 1], [2, 2], [3, 3], + [4, 4]]).astype(np.float32) + + x = fluid.dygraph.to_variable(x_data) + index = fluid.dygraph.to_variable(index_data) + updates = fluid.dygraph.to_variable(updates_data) + + output1 = self.scatter(x, index, updates, overwrite=False) + self.assertEqual((output1.numpy() == \ + np.array([[3., 3.],[6., 6.],[1., 1.]])).all(), True) + + def test_large_data(self): + if os.name == "nt": + return + + x = np.random.rand(183826, 256).astype("float32") + index = np.ones(8388608, dtype="int64") + updates = np.ones(shape=[8388608, 256], dtype="float32") + + def test_dygraph(): + with fluid.dygraph.guard(): + mlu_out = paddle.scatter(paddle.to_tensor(x), + paddle.to_tensor(index), + paddle.to_tensor(updates)) + return mlu_out.numpy() + + @switch_to_static_graph + def test_static_graph(): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + x_t = paddle.static.data(name="x", dtype=x.dtype, shape=x.shape) + index_t = paddle.static.data(name="index", + dtype=index.dtype, + shape=index.shape) + updates_t = paddle.static.data(name="updates", + dtype=updates.dtype, + shape=updates.shape) + out_t = paddle.scatter(x_t, index_t, updates_t) + feed = { + x_t.name: x, + index_t.name: index, + updates_t.name: updates + } + fetch = [out_t] + + mlu_exe = paddle.static.Executor(paddle.device.MLUPlace(0)) + mlu_value = mlu_exe.run(feed=feed, fetch_list=fetch)[0] + return mlu_value + + self.assertTrue(np.array_equal(test_dygraph(), test_static_graph())) + + +class TestScatterOpFp16(OpTest): + + def setUp(self): + self.op_type = "scatter" + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.python_api = paddle.scatter + ref_np = np.ones((3, 3)).astype("float16") + index_np = np.array([1, 2]).astype("int32") + updates_np = np.random.random((2, 3)).astype("float16") + output_np = np.copy(ref_np) + output_np[index_np] = updates_np + self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} + self.attrs = {'overwrite': True} + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(self.place, check_eager=False) + + def test_check_grad(self): + self.check_grad(["X", "Updates"], "Out", check_eager=False) + + +class TestScatterInplaceAPI(TestScatterAPI): + + def executed_api(self): + self.scatter = paddle.scatter_ + + +if __name__ == "__main__": + paddle.enable_static() + 
unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_sqrt_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_sqrt_op_mlu.py new file mode 100644 index 0000000000000..a7bdc162acdb1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_sqrt_op_mlu.py @@ -0,0 +1,89 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys + +sys.path.append('..') +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +import paddle +import paddle.nn.functional as F + +paddle.enable_static() +np.random.seed(10) + + +class TestSqrt(OpTest): + + def setUp(self): + self.op_type = "sqrt" + self.dtype = 'float32' + self.set_mlu() + self.python_api = paddle.sqrt + + np.random.seed(1023) + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.sqrt(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out', check_eager=False) + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestSqrtHalf(OpTest): + + def setUp(self): + self.op_type = "sqrt" + self.dtype = 'float16' + self.set_mlu() + self.python_api = paddle.sqrt + + np.random.seed(1023) + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.sqrt(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], + 'Out', + check_eager=False, + max_relative_error=0.85) + + def test_check_output(self): + self.check_output_with_place(self.place) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py index 7ef5516bc047e..176f419341606 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_cluster.py @@ -14,6 +14,7 @@ from __future__ import print_function +import tempfile import unittest import os import json @@ -201,15 +202,21 @@ class TestAutoParallelCluster(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def test_cluster(self): - cluster_json_file = "" + cluster_json_path = os.path.join(self.temp_dir.name, + "auto_parallel_cluster.json") cluster_json_object = json.loads(cluster_json) - with open("./auto_parallel_cluster.json", "w") as cluster_json_file: + with open(cluster_json_path, "w") as 
cluster_json_file: json.dump(cluster_json_object, cluster_json_file) cluster = Cluster() - cluster.build_from_file("./auto_parallel_cluster.json") - os.remove("./auto_parallel_cluster.json") + cluster.build_from_file(cluster_json_path) self.assertEqual(len(cluster.get_all_devices("GPU")), 4) self.assertEqual(len(cluster.get_all_devices("CPU")), 2) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py index a147b0f1f376a..36923212fdfa1 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py @@ -14,6 +14,7 @@ from __future__ import print_function +import tempfile import unittest import os import json @@ -527,14 +528,20 @@ def get_device_local_ids(machine): class TestAutoParallelMapper(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def test_mapper_dp_mp_pp(self): - cluster_json_file = "" + cluster_json_path = os.path.join(self.temp_dir.name, + "auto_parallel_cluster.json") cluster_json_object = json.loads(cluster_json) - with open("./auto_parallel_cluster.json", "w") as cluster_json_file: + with open(cluster_json_path, "w") as cluster_json_file: json.dump(cluster_json_object, cluster_json_file) cluster = Cluster() - cluster.build_from_file("./auto_parallel_cluster.json") - os.remove("./auto_parallel_cluster.json") + cluster.build_from_file(cluster_json_path) global _global_parallel_strategy _global_parallel_strategy = "dp_mp_pp" diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py index 96738a466626e..00ba2151fcba5 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py @@ -892,25 +892,6 @@ def test_gpt_dp_mp(self): auto_parallel_main_prog, auto_parallel_startup_prog, params_grads = partitioner.partition( complete_train_program, startup_program, params_grads) - with open("./test_auto_parallel_partitioner_serial_main_new.txt", - "w") as fw: - fw.write(str(train_program)) - with open("./test_auto_parallel_partitioner_serial_startup_new.txt", - "w") as fw: - fw.write(str(startup_program)) - - from paddle.distributed.auto_parallel.dist_context import set_default_distributed_context - set_default_distributed_context(dist_context) - with open("./test_auto_parallel_partitioner_main_new.txt1", "w") as fw: - fw.write(str(auto_parallel_main_prog)) - with open("./test_auto_parallel_partitioner_startup_new.txt1", - "w") as fw: - fw.write(str(auto_parallel_startup_prog)) - # with open("./test_auto_parallel_partitioner_main_completed.txt", "w") as fw: - # from paddle.distributed.auto_parallel.completion import Completer - # completer = Completer() - # completer.complete_forward_annotation(auto_parallel_main_prog) - # fw.write(str(auto_parallel_main_prog)) nrank = 4 # col parallel weights = [ diff --git a/python/paddle/fluid/tests/unittests/test_dropout_nd_op.py b/python/paddle/fluid/tests/unittests/test_dropout_nd_op.py new file mode 100644 index 0000000000000..c696863c612b0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dropout_nd_op.py @@ -0,0 +1,130 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest, convert_float_to_uint16 +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.framework import _non_static_mode +from paddle import _C_ops +from paddle.static import default_main_program + + +def dropout_nd(x, + p=0.5, + axis=None, + training=True, + mode="upscale_in_train", + name=None): + drop_axes = [axis] if isinstance(axis, int) else list(axis) + seed = None + mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer + if _non_static_mode(): + if default_main_program().random_seed != 0: + seed = default_main_program().random_seed + + out, mask = _C_ops.dropout_nd(x, 'dropout_prob', p, 'is_test', + not training, 'fix_seed', seed + is not None, 'seed', + seed if seed is not None else 0, + 'dropout_implementation', mode, 'axis', + drop_axes) + return out + + helper = LayerHelper('dropout_nd', **locals()) + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'dropout') + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + mask = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.UINT8, stop_gradient=True) + + def get_attrs(prog, dropout_prob, is_test, seed): + if (seed is None or seed == 0) and prog.random_seed != 0: + seed = prog.random_seed + attrs = { + 'dropout_prob': dropout_prob, + 'is_test': is_test, + 'fix_seed': seed is not None, + 'seed': seed if seed is not None else 0, + 'dropout_implementation': mode, + 'axis': drop_axes + } + return attrs + + attrs = get_attrs(helper.main_program, p, not training, seed) + + helper.append_op(type='dropout_nd', + inputs={'X': [x]}, + outputs={ + 'Out': [out], + 'Mask': [mask] + }, + attrs=attrs) + return out + + +paddle.enable_static() + + +class TestDropoutNdOp(OpTest): + + def setUp(self): + self.op_type = "dropout_nd" + self.inputs = {'X': np.random.random((4, 32, 16)).astype("float64")} + self.attrs = { + 'dropout_prob': 0.0, + 'fix_seed': True, + 'is_test': False, + 'axis': [1] + } + self.outputs = { + 'Out': self.inputs['X'], + 'Mask': np.ones((1, 32, 1)).astype('uint8') + } + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X'], 'Out') + + +class TestDropoutNdAPI(unittest.TestCase): + + def setUp(self): + np.random.seed(123) + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def test_dygraph(self): + paddle.disable_static() + for place in self.places: + with fluid.dygraph.guard(place): + in_np = np.random.random([4, 32, 16]).astype("float32") + input = paddle.to_tensor(in_np) + res1 = dropout_nd(x=input, p=0., axis=[0, 1]) + res2 = dropout_nd(x=input, p=0.5, axis=[0, 1]) + self.assertTrue(np.allclose(res1.numpy(), in_np)) + paddle.enable_static() + + +if __name__ == '__main__': + 
unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py b/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py index d6ccec25a43f8..b707640923adb 100755 --- a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py +++ b/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py @@ -27,6 +27,7 @@ from paddle import _C_ops import sys +import tempfile sys.path.append("./tokenizer") from tokenizer.bert_tokenizer import BertTokenizer @@ -157,11 +158,15 @@ def predict(self, data): class TestBertTokenizerOp(unittest.TestCase): def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() self.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese") - self.save_path = os.path.join(DATA_HOME, "fast_tokenizer") + self.save_path = os.path.join(self.temp_dir.name, "fast_tokenizer") self.param_path = os.path.join(self.save_path, "model.pdparams") self.inference_path = os.path.join(self.save_path, "inference") + def tearDown(self): + self.temp_dir.cleanup() + def init_data(self): self.faster_tokenizer = FasterTokenizer(self.bert_tokenizer.vocab) self.text = [ diff --git a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py index 2c860a0a6243b..36bec7fb0301f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py @@ -17,12 +17,15 @@ import paddle.fluid.framework as framework from paddle.fluid.dygraph.nn import * import numpy as np +import os +import tempfile class TestDygraphLoadStatic(unittest.TestCase): def testLoadStaticModel(self): # static mode + temp_dir = tempfile.TemporaryDirectory() a = fluid.data(name="a", shape=[10, 10]) conv_in = fluid.data(name="conv_in", shape=[None, 10, 10, 10]) @@ -144,9 +147,11 @@ def testLoadStaticModel(self): ) if not fluid.is_compiled_with_cuda() else fluid.CUDAPlace(0)) out = exe.run(framework.default_startup_program()) - fluid.save(framework.default_main_program(), "./test_1") + fluid.save(framework.default_main_program(), + os.path.join(temp_dir.name, "test_1")) - para_dict = fluid.load_program_state("./test_1") + para_dict = fluid.load_program_state( + os.path.join(temp_dir.name, "test_1")) new_dict = {} for k, v in para_dict.items(): @@ -214,6 +219,7 @@ def __init__(self): my_test.set_dict(new_dict, use_structured_name=False) for k, v in my_test.state_dict().items(): self.assertTrue(np.array_equal(v.numpy(), new_dict[v.name])) + temp_dir.cleanup() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index 91bb1b7e94fda..f0026f8ef3307 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -29,6 +29,8 @@ import paddle from paddle.fluid.framework import _test_eager_guard +import tempfile + class SimpleLSTMRNN(fluid.Layer): @@ -219,6 +221,12 @@ def forward(self, input, label, init_hidden, init_cell): class TestDygraphPtbRnn(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def func_setUp(self): seed = 90 hidden_size = 10 @@ -295,7 +303,8 @@ def func_setUp(self): else: self.base_opti[k] = v - paddle.save(self.opti_dict, "./test_dy_v2.pdopt") + paddle.save(self.opti_dict, + 
os.path.join(self.temp_dir.name, "test_dy_v2.pdopt")) self.state_dict = ptb_model.state_dict() @@ -304,10 +313,10 @@ def func_setUp(self): np_t = v.numpy() self.model_base[k] = np_t - paddle.save(self.state_dict, "./test_dy_v2.pdparams") + paddle.save(self.state_dict, + os.path.join(self.temp_dir.name, "test_dy_v2.pdparams")) def func_testLoadAndSetVarBase(self): - self.setUp() seed = 90 hidden_size = 10 vocab_size = 1000 @@ -383,8 +392,10 @@ def func_testLoadAndSetVarBase(self): self.assertTrue(np.sum(np.abs(v.numpy())) == 0) - para_state_dict = paddle.load("./test_dy_v2.pdparams") - opti_state_dict = paddle.load("./test_dy_v2.pdopt") + para_state_dict = paddle.load( + os.path.join(self.temp_dir.name, "test_dy_v2.pdparams")) + opti_state_dict = paddle.load( + os.path.join(self.temp_dir.name, "test_dy_v2.pdopt")) adam.set_state_dict(opti_state_dict) opti_dict = adam.state_dict() @@ -752,7 +763,8 @@ def func_testLoadAndSetVarBaseBeforeTrain(self): last_hidden = None last_cell = None - state_dict, opti_dict = fluid.load_dygraph("./test_dy_v2") + state_dict, opti_dict = fluid.load_dygraph( + os.path.join(self.temp_dir.name, "test_dy_v2")) adam.set_state_dict(opti_dict) ptb_model.set_dict(state_dict) @@ -907,19 +919,23 @@ def func_testOnlyLoadParams(self): with fluid.dygraph.guard(): emb = fluid.dygraph.Embedding([10, 10]) state_dict = emb.state_dict() - paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy.pdparams')) + paddle.save( + state_dict, + os.path.join(self.temp_dir.name, 'saved_dy', 'emb_dy.pdparams')) para_state_dict = paddle.load( - os.path.join('saved_dy', 'emb_dy.pdparams')) + os.path.join(self.temp_dir.name, 'saved_dy', 'emb_dy.pdparams')) def func_test_no_state_in_input_dict(self): with fluid.dygraph.guard(): emb = fluid.dygraph.Embedding([10, 10]) state_dict = emb.state_dict() - paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy.pdparams')) + paddle.save( + state_dict, + os.path.join(self.temp_dir.name, 'saved_dy', 'emb_dy.pdparams')) para_state_dict = paddle.load( - os.path.join('saved_dy', 'emb_dy.pdparams')) + os.path.join(self.temp_dir.name, 'saved_dy', 'emb_dy.pdparams')) para_state_dict.pop('weight') emb.set_state_dict(para_state_dict) @@ -928,9 +944,12 @@ def func_test_state_shape_mismatch(self): with fluid.dygraph.guard(): emb = fluid.dygraph.Embedding([10, 10]) state_dict = emb.state_dict() - paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy.pdparams')) + paddle.save( + state_dict, + os.path.join(self.temp_dir.name, 'saved_dy', 'emb_dy.pdparams')) - para_state_dict = paddle.load(os.path.join('saved_dy', + para_state_dict = paddle.load(os.path.join(self.temp_dir.name, + 'saved_dy', 'emb_dy.pdparams'), return_numpy=True) para_state_dict['weight'] = np.expand_dims( diff --git a/python/paddle/fluid/tests/unittests/test_io_save_load.py b/python/paddle/fluid/tests/unittests/test_io_save_load.py index 0d5573ae7021a..995adcca73266 100644 --- a/python/paddle/fluid/tests/unittests/test_io_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_io_save_load.py @@ -19,10 +19,19 @@ import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.framework import _test_eager_guard, _in_legacy_dygraph +import tempfile +import os class TestSaveLoadAPIError(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.save_dir = os.path.join(self.temp_dir.name, "fake_dir") + + def tearDown(self): + self.temp_dir.cleanup() + def func_test_get_valid_program_error(self): # case 1: CompiledProgram no program graph = 
core.Graph(core.ProgramDesc()) @@ -45,13 +54,13 @@ def func_test_load_vars_error(self): # case 1: main_program type error when vars None with self.assertRaises(TypeError): fluid.io.load_vars(executor=exe, - dirname="./fake_dir", + dirname=self.save_dir, main_program="program") # case 2: main_program type error when vars not None with self.assertRaises(TypeError): fluid.io.load_vars(executor=exe, - dirname="./fake_dir", + dirname=self.save_dir, main_program="program", vars="vars") @@ -63,6 +72,12 @@ def test_load_vars_error(self): class TestSaveInferenceModelAPIError(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def func_test_useless_feeded_var_names(self): start_prog = fluid.Program() main_prog = fluid.Program() @@ -75,7 +90,8 @@ def func_test_useless_feeded_var_names(self): exe.run(start_prog) with self.assertRaisesRegexp( ValueError, "not involved in the target_vars calculation"): - fluid.io.save_inference_model(dirname='./model', + fluid.io.save_inference_model(dirname=os.path.join( + self.temp_dir.name, 'model'), feeded_var_names=['x', 'y'], target_vars=[z], executor=exe, @@ -89,13 +105,20 @@ def test_useless_feeded_var_names(self): class TestWhenTrainWithNoGrad(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def func_test_when_train_with_no_grad(self): paddle.disable_static() net = paddle.nn.Linear(1024, 1) net = paddle.jit.to_static(net) x = paddle.rand([1024], 'float32') net(x) - save_path = './train_with_no_grad' + save_path = os.path.join(self.temp_dir.name, 'train_with_no_grad') + paddle.jit.save(net, save_path) net = paddle.jit.load(save_path) net.train() diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index 6c1bbdac68995..bf5ccf1a854ff 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -18,6 +18,7 @@ import pickle import shutil import unittest +import tempfile import numpy as np import paddle from paddle.static import InputSpec @@ -347,13 +348,18 @@ def train_with_label(layer, input_size=784, label_size=1): class TestJitSaveLoad(unittest.TestCase): def setUp(self): - self.model_path = "test_jit_save_load/model" + self.temp_dir = tempfile.TemporaryDirectory() + self.model_path = os.path.join(self.temp_dir.name, + "test_jit_save_load/model") # enable dygraph mode fluid.enable_dygraph() # config seed paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) + def tearDown(self): + self.temp_dir.cleanup() + def train_and_save_model(self, model_path=None): layer = LinearNet(784, 1) example_inputs, layer, _ = train(layer) @@ -412,12 +418,14 @@ def load_dygraph_state_dict(self, train_layer): new_layer(x).numpy())) def test_load_dygraph_no_path(self): - model_path = "test_jit_save_load.no_path/model_path" + model_path = os.path.join(self.temp_dir.name, + "test_jit_save_load.no_path/model_path") with self.assertRaises(ValueError): model_dict, _ = fluid.dygraph.load_dygraph(model_path) def test_jit_load_no_path(self): - path = "test_jit_save_load.no_path/model_path" + path = os.path.join(self.temp_dir.name, + "test_jit_save_load.no_path/model_path") with self.assertRaises(ValueError): loaded_layer = paddle.jit.load(path) @@ -427,6 +435,10 @@ class TestSaveLoadWithNestOut(unittest.TestCase): def setUp(self): # enable dygraph mode 
fluid.enable_dygraph() + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() def test_nest_output(self): x = fluid.dygraph.to_variable( @@ -436,7 +448,7 @@ def test_nest_output(self): dy_outs = flatten(net(x)) net = declarative(net, input_spec=[InputSpec([None, 8], name='x')]) - model_path = "net_with_nest_out/model" + model_path = os.path.join(self.temp_dir.name, "net_with_nest_out/model") paddle.jit.save(net, model_path) load_net = paddle.jit.load(model_path) @@ -459,8 +471,9 @@ def test_dict_input(self): # {'img': var img : fluid.VarType.LOD_TENSOR.shape(-1, 8).astype(VarType.FP32)}, # {'label': var label : fluid.VarType.LOD_TENSOR.shape(-1, 1).astype(VarType.INT64)}) self.assertEqual(len(net.forward.concrete_program.inputs), 3) - - path = "test_jit_save_load_with_dict_input/model" + temp_dir = tempfile.TemporaryDirectory() + path = os.path.join(temp_dir.name, + "test_jit_save_load_with_dict_input/model") # prune inputs paddle.jit.save(layer=net, path=path, @@ -478,14 +491,16 @@ def test_dict_input(self): # loaded_net._input_spec(): # [InputSpec(shape=(-1, 8), dtype=VarType.FP32, name=img)] self.assertEqual(len(loaded_net._input_spec()), 1) + temp_dir.cleanup() class TestSaveLoadWithDictInputNoPrune(unittest.TestCase): def test_dict_input(self): net = LinearNetWithDictInputNoPrune(8, 8) - - path = "test_jit_save_load_with_dict_input_no_prune/model" + temp_dir = tempfile.TemporaryDirectory() + path = os.path.join( + temp_dir.name, "test_jit_save_load_with_dict_input_no_prune/model") # prune inputs paddle.jit.save(layer=net, path=path, @@ -506,6 +521,7 @@ def test_dict_input(self): loaded_out = loaded_net(img, img2) self.assertEqual(len(loaded_net._input_spec()), 2) + temp_dir.cleanup() class TestSaveLoadWithInputSpec(unittest.TestCase): @@ -513,6 +529,10 @@ class TestSaveLoadWithInputSpec(unittest.TestCase): def setUp(self): # enable dygraph mode fluid.enable_dygraph() + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() def test_with_input_spec(self): net = LinearNetReturnLoss(8, 8) @@ -520,7 +540,8 @@ def test_with_input_spec(self): net.forward = declarative(net.forward, input_spec=[InputSpec([None, 8], name='x')]) - model_path = "input_spec.output_spec/model" + model_path = os.path.join(self.temp_dir.name, + "input_spec.output_spec/model") # check inputs and outputs self.assertTrue(len(net.forward.inputs) == 1) input_x = net.forward.inputs[0] @@ -540,7 +561,8 @@ def test_with_input_spec(self): def test_multi_in_out(self): net = LinearNetMultiInput(8, 8) - model_path = "multi_inout.output_spec1/model" + model_path = os.path.join(self.temp_dir.name, + "multi_inout.output_spec1/model") # 1. check inputs and outputs self.assertTrue(len(net.forward.inputs) == 2) input_x = net.forward.inputs[0] @@ -562,7 +584,8 @@ def test_multi_in_out(self): pred_x, pred_y = infer_layer(x, y) # 1. prune y and loss - model_path = "multi_inout.output_spec2/model" + model_path = os.path.join(self.temp_dir.name, + "multi_inout.output_spec2/model") output_spec = net.forward.outputs[:1] paddle.jit.save(net, model_path, [input_x], output_spec=output_spec) # 2. load again @@ -576,7 +599,8 @@ def test_multi_in_out(self): def test_multi_in_out1(self): net = LinearNetMultiInput1(8, 8) - model_path = "multi_inout1.output_spec1/model" + model_path = os.path.join(self.temp_dir.name, + "multi_inout1.output_spec1/model") # 1. 
check inputs and outputs self.assertTrue(len(net.forward.inputs) == 2) input_x = net.forward.inputs[0] @@ -598,7 +622,8 @@ def test_multi_in_out1(self): pred_x, pred_y = infer_layer(x, y) # 1. prune y and loss - model_path = "multi_inout1.output_spec2/model" + model_path = os.path.join(self.temp_dir.name, + "multi_inout1.output_spec2/model") output_spec = net.forward.outputs[:1] paddle.jit.save(net, model_path, (input_x, ), output_spec=output_spec) # 2. load again @@ -618,6 +643,10 @@ def setUp(self): # config seed paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() def test_output_spec(self): train_layer = LinearNetReturnLoss(8, 8) @@ -631,7 +660,8 @@ def test_output_spec(self): adam.minimize(loss) train_layer.clear_gradients() - model_path = "save_load_config.output_spec" + model_path = os.path.join(self.temp_dir.name, + "save_load_config.output_spec") output_spec = [out] paddle.jit.save(layer=train_layer, path=model_path, @@ -648,22 +678,22 @@ def test_output_spec(self): def test_save_no_support_config_error(self): layer = LinearNet(784, 1) - path = "no_support_config_test" + path = os.path.join(self.temp_dir.name, "no_support_config_test") with self.assertRaises(ValueError): paddle.jit.save(layer=layer, path=path, model_filename="") def test_load_empty_model_filename_error(self): - path = "error_model_filename_test" + path = os.path.join(self.temp_dir.name, "error_model_filename_test") with self.assertRaises(ValueError): paddle.jit.load(path, model_filename="") def test_load_empty_params_filename_error(self): - path = "error_params_filename_test" + path = os.path.join(self.temp_dir.name, "error_params_filename_test") with self.assertRaises(ValueError): paddle.jit.load(path, params_filename="") def test_load_with_no_support_config(self): - path = "no_support_config_test" + path = os.path.join(self.temp_dir.name, "no_support_config_test") with self.assertRaises(ValueError): paddle.jit.load(path, separate_params=True) @@ -672,7 +702,9 @@ class TestJitMultipleLoading(unittest.TestCase): def setUp(self): self.linear_size = 4 - self.model_path = "jit_multi_load/model" + self.temp_dir = tempfile.TemporaryDirectory() + self.model_path = os.path.join(self.temp_dir.name, + "jit_multi_load/model") # enable dygraph mode fluid.enable_dygraph() # config seed @@ -681,6 +713,9 @@ def setUp(self): # train and save base model self.train_and_save_orig_model() + def tearDown(self): + self.temp_dir.cleanup() + def train_and_save_orig_model(self): layer = LinearNet(self.linear_size, self.linear_size) example_inputs, layer, _ = train(layer, self.linear_size, 1) @@ -702,13 +737,18 @@ class TestJitPruneModelAndLoad(unittest.TestCase): def setUp(self): self.linear_size = 4 - self.model_path = "jit_prune_model_and_load/model" + self.temp_dir = tempfile.TemporaryDirectory() + self.model_path = os.path.join(self.temp_dir.name, + "jit_prune_model_and_load/model") # enable dygraph mode fluid.enable_dygraph() # config seed paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) + def tearDown(self): + self.temp_dir.cleanup() + def train_and_save(self): train_layer = LinearNetReturnHidden(8, 8) adam = fluid.optimizer.AdamOptimizer( @@ -764,6 +804,10 @@ def setUp(self): # config seed paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() def verify_inference_correctness(self, layer, 
@@ -796,7 +840,8 @@ def test_no_prune_to_static_after_train(self): train(layer) - model_path = "test_no_prune_to_static_after_train/model" + model_path = os.path.join(self.temp_dir.name, + "test_no_prune_to_static_after_train/model") paddle.jit.save(layer, model_path) self.verify_inference_correctness(layer, model_path) @@ -804,7 +849,8 @@ def test_no_prune_to_static_after_train(self): def test_no_prune_to_static_no_train(self): layer = LinearNetWithInputSpec(784, 1) - model_path = "test_no_prune_to_static_no_train/model" + model_path = os.path.join(self.temp_dir.name, + "test_no_prune_to_static_no_train/model") paddle.jit.save(layer, model_path) self.verify_inference_correctness(layer, model_path) @@ -814,7 +860,8 @@ def test_no_prune_no_to_static_after_train(self): train(layer) - model_path = "test_no_prune_no_to_static_after_train/model" + model_path = os.path.join( + self.temp_dir.name, "test_no_prune_no_to_static_after_train/model") paddle.jit.save( layer, model_path, @@ -827,7 +874,9 @@ def test_no_prune_no_to_static_after_train_with_examples(self): example_inputs, _, _ = train(layer) - model_path = "test_no_prune_no_to_static_after_train_with_examples/model" + model_path = os.path.join( + self.temp_dir.name, + "test_no_prune_no_to_static_after_train_with_examples/model") paddle.jit.save(layer=layer, path=model_path, input_spec=example_inputs) self.verify_inference_correctness(layer, model_path) @@ -835,7 +884,8 @@ def test_no_prune_no_to_static_after_train_with_examples(self): def test_no_prune_no_to_static_no_train(self): layer = LinearNetNotDeclarative(784, 1) - model_path = "test_no_prune_no_to_static_no_train/model" + model_path = os.path.join(self.temp_dir.name, + "test_no_prune_no_to_static_no_train/model") paddle.jit.save( layer, model_path, @@ -848,7 +898,8 @@ def test_prune_to_static_after_train(self): out = train_with_label(layer) - model_path = "test_prune_to_static_after_train/model" + model_path = os.path.join(self.temp_dir.name, + "test_prune_to_static_after_train/model") paddle.jit.save(layer, model_path, input_spec=[ @@ -865,7 +916,8 @@ def test_prune_to_static_after_train(self): def test_prune_to_static_no_train(self): layer = LinerNetWithLabel(784, 1) - model_path = "test_prune_to_static_no_train/model" + model_path = os.path.join(self.temp_dir.name, + "test_prune_to_static_no_train/model") # TODO: no train, cannot get output_spec var here # now only can use index output_spec = layer.forward.outputs[:1] @@ -885,7 +937,8 @@ def test_prune_to_static_no_train(self): def test_prune_input_to_static_no_train(self): layer = LinerNetWithPruneInput(784, 1) - model_path = "test_prune_input_to_static_no_train/model" + model_path = os.path.join(self.temp_dir.name, + "test_prune_input_to_static_no_train/model") paddle.jit.save(layer, model_path, input_spec=[ @@ -899,7 +952,9 @@ def test_prune_input_to_static_no_train(self): def test_prune_useless_input_to_static_no_train(self): layer = LinerNetWithUselessInput(784, 1) - model_path = "test_prune_useless_input_to_static_no_train/model" + model_path = os.path.join( + self.temp_dir.name, + "test_prune_useless_input_to_static_no_train/model") paddle.jit.save(layer, model_path, input_spec=[ @@ -915,7 +970,8 @@ def test_no_prune_input_spec_name_warning(self): train(layer) - model_path = "test_no_prune_input_spec_name_warning/model" + model_path = os.path.join( + self.temp_dir.name, "test_no_prune_input_spec_name_warning/model") paddle.jit.save( layer, model_path, @@ -935,7 +991,8 @@ def test_not_prune_output_spec_name_warning(self): 
train(layer) - model_path = "test_not_prune_output_spec_name_warning/model" + model_path = os.path.join( + self.temp_dir.name, "test_not_prune_output_spec_name_warning/model") out = paddle.to_tensor(np.random.random((1, 1)).astype('float')) paddle.jit.save(layer, model_path, output_spec=[out]) @@ -944,7 +1001,8 @@ def test_not_prune_output_spec_name_warning(self): def test_prune_input_spec_name_error(self): layer = LinerNetWithLabel(784, 1) - model_path = "test_prune_input_spec_name_error/model" + model_path = os.path.join(self.temp_dir.name, + "test_prune_input_spec_name_error/model") with self.assertRaises(ValueError): paddle.jit.save( layer, @@ -964,7 +1022,8 @@ def test_prune_output_spec_name_error(self): train_with_label(layer) - model_path = "test_prune_to_static_after_train/model" + model_path = os.path.join(self.temp_dir.name, + "test_prune_to_static_after_train/model") out = paddle.to_tensor(np.random.random((1, 1)).astype('float')) with self.assertRaises(ValueError): paddle.jit.save(layer, @@ -980,10 +1039,15 @@ def test_prune_output_spec_name_error(self): class TestJitSaveLoadEmptyLayer(unittest.TestCase): def setUp(self): - self.model_path = "jit_save_load_empty_layer/model" + self.temp_dir = tempfile.TemporaryDirectory() + self.model_path = os.path.join(self.temp_dir.name, + "jit_save_load_empty_layer/model") # enable dygraph mode paddle.disable_static() + def tearDown(self): + self.temp_dir.cleanup() + def test_save_load_empty_layer(self): layer = EmptyLayer() x = paddle.to_tensor(np.random.random((10)).astype('float32')) @@ -997,10 +1061,15 @@ def test_save_load_empty_layer(self): class TestJitSaveLoadNoParamLayer(unittest.TestCase): def setUp(self): - self.model_path = "jit_save_load_no_param_layer/model" + self.temp_dir = tempfile.TemporaryDirectory() + self.model_path = os.path.join(self.temp_dir.name, + "jit_save_load_no_param_layer/model") # enable dygraph mode paddle.disable_static() + def tearDown(self): + self.temp_dir.cleanup() + def test_save_load_no_param_layer(self): layer = NoParamLayer() x = paddle.to_tensor(np.random.random((5)).astype('float32')) @@ -1017,9 +1086,14 @@ class TestJitSaveLoadMultiMethods(unittest.TestCase): def setUp(self): # enable dygraph mode paddle.disable_static() + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() def test_jit_save_load_inference(self): - model_path_inference = "jit_save_load_multi_methods/model" + model_path_inference = os.path.join( + self.temp_dir.name, "jit_save_load_multi_methods/model") IMAGE_SIZE = 224 layer = LinearNetWithMultiStaticFunc(IMAGE_SIZE, 10) inps = paddle.randn([1, IMAGE_SIZE]) @@ -1035,7 +1109,8 @@ def test_jit_save_load_inference(self): getattr(load_net, func, None)(inps)).abs().max()) < 1e-5) def test_jit_save_load_multi_methods_inputspec(self): - model_path = 'jit_save_load_multi_methods/model' + model_path = os.path.join(self.temp_dir.name, + 'jit_save_load_multi_methods/model') layer = LinearNetWithMultiStaticFunc(784, 1) with self.assertRaises(ValueError): paddle.jit.save(layer, @@ -1043,7 +1118,8 @@ def test_jit_save_load_multi_methods_inputspec(self): input_spec=[InputSpec(shape=[None, 784])]) def test_parse_name(self): - model_path_inference = "jit_save_load_parse_name/model" + model_path_inference = os.path.join(self.temp_dir.name, + "jit_save_load_parse_name/model") IMAGE_SIZE = 224 layer = LinearNet(IMAGE_SIZE, 1) inps = paddle.randn([1, IMAGE_SIZE]) @@ -1070,7 +1146,7 @@ def __init__(self, in_size, out_size): def forward(self, x): y = 
self._linear_0(x) # Multiple blocks - if x.shape[0] == 1: + if paddle.shape(x)[0] == 1: y = self._linear_1_0(y) else: y += self._linear_1_1(y + self._scale) @@ -1097,7 +1173,7 @@ def forward(self, x): y = self._linear_0(x) y = self._load_l1(y) # Multiple blocks - if x.shape[0] == 1: + if paddle.shape(x)[0] == 1: y = self._linear_1_0(y) y = self._load_l1(y) else: @@ -1116,9 +1192,14 @@ class TestJitSaveLoadSaveWithoutRunning(unittest.TestCase): def setUp(self): # enable dygraph mode paddle.disable_static() + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() def test_save_load_finetune_load(self): - model_path = "test_jit_save_load_save_without_running/model" + model_path = os.path.join( + self.temp_dir.name, "test_jit_save_load_save_without_running/model") IMAGE_SIZE = 224 inps0 = paddle.randn([1, IMAGE_SIZE]) inps1 = paddle.randn([2, IMAGE_SIZE]) @@ -1157,9 +1238,14 @@ class TestJitSaveLoadFinetuneLoad(unittest.TestCase): def setUp(self): # enable dygraph mode paddle.disable_static() + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() def test_save_load_finetune_load(self): - model_path = "test_jit_save_load_finetune_load/model" + model_path = os.path.join(self.temp_dir.name, + "test_jit_save_load_finetune_load/model") IMAGE_SIZE = 224 inps0 = paddle.randn([1, IMAGE_SIZE]) inps1 = paddle.randn([2, IMAGE_SIZE]) @@ -1196,6 +1282,10 @@ class TestJitSaveLoadFunctionCase1(unittest.TestCase): def setUp(self): paddle.disable_static() + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() def test_jit_save_load_static_function(self): @@ -1203,7 +1293,8 @@ def test_jit_save_load_static_function(self): def fun(inputs): return paddle.tanh(inputs) - path = 'test_jit_save_load_function_1/func' + path = os.path.join(self.temp_dir.name, + 'test_jit_save_load_function_1/func') inps = paddle.rand([3, 6]) origin = fun(inps) @@ -1218,6 +1309,10 @@ class TestJitSaveLoadFunctionCase2(unittest.TestCase): def setUp(self): paddle.disable_static() + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() def test_jit_save_load_function_input_spec(self): @@ -1227,7 +1322,8 @@ def test_jit_save_load_function_input_spec(self): def fun(inputs): return paddle.nn.functional.relu(inputs) - path = 'test_jit_save_load_function_2/func' + path = os.path.join(self.temp_dir.name, + 'test_jit_save_load_function_2/func') inps = paddle.rand([3, 6]) origin = fun(inps) @@ -1241,13 +1337,18 @@ class TestJitSaveLoadFunctionCase3(unittest.TestCase): def setUp(self): paddle.disable_static() + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() def test_jit_save_load_function_function(self): def fun(inputs): return paddle.tanh(inputs) - path = 'test_jit_save_load_function_3/func' + path = os.path.join(self.temp_dir.name, + 'test_jit_save_load_function_3/func') inps = paddle.rand([3, 6]) origin = fun(inps) @@ -1268,6 +1369,10 @@ class TestJitSaveLoadFunctionWithParamCase1(unittest.TestCase): def setUp(self): paddle.disable_static() + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() def test_jit_save_load_function(self): @@ -1290,7 +1395,9 @@ def anothor_forward(self, x): func = paddle.jit.to_static(layer.anothor_forward, [paddle.static.InputSpec(shape=[-1, 5])]) - path = 'test_jit_save_load_function_with_params_case1/func' + path = os.path.join( + self.temp_dir.name, + 
'test_jit_save_load_function_with_params_case1/func') paddle.jit.save(func, path) load_func = paddle.jit.load(path) @@ -1302,6 +1409,10 @@ class TestJitSaveLoadFunctionWithParamCase2(unittest.TestCase): def setUp(self): paddle.disable_static() + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() def test_jit_save_load_function(self): @@ -1322,7 +1433,9 @@ def anothor_forward(self, x): inps = paddle.rand([3, 5]) - path = 'test_jit_save_load_function_with_params_case2/func' + path = os.path.join( + self.temp_dir.name, + 'test_jit_save_load_function_with_params_case2/func') paddle.jit.save(layer.anothor_forward, path) origin_result = layer.anothor_forward(inps) load_func = paddle.jit.load(path) @@ -1337,6 +1450,10 @@ class TestJitSaveLoadFunctionWithParamCase3(unittest.TestCase): def setUp(self): paddle.disable_static() + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() def test_jit_save_load_function(self): @@ -1358,7 +1475,9 @@ def anothor_forward(self, x): inps = paddle.rand([3, 5]) origin = layer.anothor_forward(inps) - path = 'test_jit_save_load_function_with_params_case3/func' + path = os.path.join( + self.temp_dir.name, + 'test_jit_save_load_function_with_params_case3/func') paddle.jit.save(layer.anothor_forward, path) load_func = paddle.jit.load(path) @@ -1368,6 +1487,12 @@ def anothor_forward(self, x): class TestJitSaveLoadDataParallel(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def verify_inference_correctness(self, layer, path): layer.eval() loaded_layer = paddle.jit.load(path) @@ -1384,8 +1509,8 @@ def verify_inference_correctness(self, layer, path): def test_jit_save_data_parallel_with_inputspec(self): layer = LinearNetNotDeclarative(784, 1) layer = paddle.DataParallel(layer) - - path = "jit_save_data_parallel_with_inputspec/model" + path = os.path.join(self.temp_dir.name, + "jit_save_data_parallel_with_inputspec/model") paddle.jit.save(layer=layer, path=path, input_spec=[InputSpec(shape=[None, 784])]) @@ -1396,7 +1521,8 @@ def test_jit_save_data_parallel_with_to_static(self): layer = LinearNetWithInputSpec(784, 1) layer = paddle.DataParallel(layer) - path = "jit_save_data_parallel_with_to_static/model" + path = os.path.join(self.temp_dir.name, + "jit_save_data_parallel_with_to_static/model") paddle.jit.save(layer, path) self.verify_inference_correctness(layer, path) @@ -1417,6 +1543,12 @@ def forward(self, x, y): class TestInputSpecCompatibility(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def _assert_input_spec_layer_return(self, expect_layer, test_layer): input_x = paddle.uniform([8, 8], dtype='float32') input_y = paddle.uniform([8, 1], dtype='float64') @@ -1429,7 +1561,8 @@ def _assert_input_spec_layer_return(self, expect_layer, test_layer): def test_jit_save_compatible_input_sepc(self): layer = InputSepcLayer() - save_dir = "jit_save_compatible_input_spec" + save_dir = os.path.join(self.temp_dir.name, + "jit_save_compatible_input_spec") path = save_dir + "/model" paddle.jit.save(layer=layer, path=path) @@ -1463,7 +1596,8 @@ def test_jit_save_compatible_input_sepc(self): def test_jit_save_incompatible_input_sepc(self): layer = InputSepcLayer() - save_dir = "jit_save_compatible_input_spec" + save_dir = os.path.join(self.temp_dir.name, + "jit_save_compatible_input_spec") path = save_dir + "/model" with 
self.assertRaises(ValueError): diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py index 1cc2906731bd8..2ee1a1ba76f7b 100644 --- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py @@ -247,7 +247,6 @@ def test_with_place(place, def test_check_forward_backward_with_scale_and_bias(self): self.check_forward_backward(shape=[1, 3, 4, 5], begin_norm_axis=1) - self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1) self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1, @@ -288,6 +287,14 @@ def test_check_forward_backward_with_scale_and_bias(self): begin_norm_axis=1, has_scale=True, has_bias=True) + self.check_forward_backward(shape=[1, 128, 256, 256], + begin_norm_axis=3, + has_scale=True, + has_bias=True) + self.check_forward_backward(shape=[1, 256, 384], + begin_norm_axis=2, + has_scale=True, + has_bias=True) class TestLayerNormAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_load_op.py b/python/paddle/fluid/tests/unittests/test_load_op.py index 2896ff218c7a4..a9865251355b9 100644 --- a/python/paddle/fluid/tests/unittests/test_load_op.py +++ b/python/paddle/fluid/tests/unittests/test_load_op.py @@ -19,6 +19,8 @@ from op_test import OpTest, randomize_probability import paddle.fluid as fluid import paddle.fluid.layers as layers +import os +import tempfile class TestLoadOp(unittest.TestCase): @@ -26,6 +28,7 @@ class TestLoadOp(unittest.TestCase): """ def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() self.ones = np.ones((4, 4)).astype('float32') main_prog = fluid.Program() start_prog = fluid.Program() @@ -41,15 +44,20 @@ def setUp(self): exe = fluid.Executor(fluid.CPUPlace()) exe.run(start_prog) fluid.io.save_persistables(exe, - dirname="./model", + dirname=os.path.join(self.temp_dir.name, + "./model"), main_program=main_prog) + def tearDown(self): + self.temp_dir.cleanup() + def test_load(self): main_prog = fluid.Program() start_prog = fluid.Program() with fluid.program_guard(main_prog, start_prog): var = layers.create_tensor(dtype='float32') - layers.load(var, file_path='./model/w') + layers.load(var, + file_path=os.path.join(self.temp_dir.name, './model/w')) exe = fluid.Executor(fluid.CPUPlace()) exe.run(start_prog) diff --git a/python/paddle/fluid/tests/unittests/test_load_op_xpu.py b/python/paddle/fluid/tests/unittests/test_load_op_xpu.py index f11c39b7b65df..8d7f65116b63f 100644 --- a/python/paddle/fluid/tests/unittests/test_load_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/test_load_op_xpu.py @@ -51,6 +51,9 @@ def setUp(self): dirname=self.model_path, main_program=main_prog) + def tearDown(self): + self.temp_dir.cleanup() + def test_load_xpu(self): main_prog = fluid.Program() start_prog = fluid.Program() diff --git a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py index ac88b7960541c..32029e561d0ba 100644 --- a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py +++ b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py @@ -23,6 +23,7 @@ import paddle.fluid as fluid from paddle.fluid import core from test_imperative_base import new_program_scope +import tempfile def convolutional_neural_network(img): @@ -58,6 +59,7 @@ def static_train_net(img, label): class TestLoadStateDictFromSaveInferenceModel(unittest.TestCase): def 
setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() self.seed = 90 self.epoch_num = 1 self.batch_size = 128 @@ -65,6 +67,9 @@ def setUp(self): # enable static mode paddle.enable_static() + def tearDown(self): + self.temp_dir.cleanup() + def train_and_save_model(self, only_params=False): with new_program_scope(): startup_program = fluid.default_startup_program() @@ -121,7 +126,8 @@ def check_load_state_dict(self, orig_dict, load_dict): self.assertTrue(np.array_equal(value, load_dict[var_name])) def test_load_default(self): - self.save_dirname = "static_mnist.load_state_dict.default" + self.save_dirname = os.path.join( + self.temp_dir.name, "static_mnist.load_state_dict.default") self.model_filename = None self.params_filename = None orig_param_dict = self.train_and_save_model() @@ -133,7 +139,8 @@ def test_load_default(self): self.check_load_state_dict(orig_param_dict, new_load_param_dict) def test_load_with_model_filename(self): - self.save_dirname = "static_mnist.load_state_dict.model_filename" + self.save_dirname = os.path.join( + self.temp_dir.name, "static_mnist.load_state_dict.model_filename") self.model_filename = "static_mnist.model" self.params_filename = None orig_param_dict = self.train_and_save_model() @@ -147,7 +154,8 @@ def test_load_with_model_filename(self): self.check_load_state_dict(orig_param_dict, new_load_param_dict) def test_load_with_param_filename(self): - self.save_dirname = "static_mnist.load_state_dict.param_filename" + self.save_dirname = os.path.join( + self.temp_dir.name, "static_mnist.load_state_dict.param_filename") self.model_filename = None self.params_filename = "static_mnist.params" orig_param_dict = self.train_and_save_model() @@ -161,7 +169,9 @@ def test_load_with_param_filename(self): self.check_load_state_dict(orig_param_dict, new_load_param_dict) def test_load_with_model_and_param_filename(self): - self.save_dirname = "static_mnist.load_state_dict.model_and_param_filename" + self.save_dirname = os.path.join( + self.temp_dir.name, + "static_mnist.load_state_dict.model_and_param_filename") self.model_filename = "static_mnist.model" self.params_filename = "static_mnist.params" orig_param_dict = self.train_and_save_model() @@ -178,7 +188,8 @@ def test_load_with_model_and_param_filename(self): self.check_load_state_dict(orig_param_dict, new_load_param_dict) def test_load_state_dict_from_save_params(self): - self.save_dirname = "static_mnist.load_state_dict.save_params" + self.save_dirname = os.path.join( + self.temp_dir.name, "static_mnist.load_state_dict.save_params") self.params_filename = None orig_param_dict = self.train_and_save_model(True) diff --git a/python/paddle/fluid/tests/unittests/test_logcumsumexp_op.py b/python/paddle/fluid/tests/unittests/test_logcumsumexp_op.py new file mode 100644 index 0000000000000..ebc350d13c673 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_logcumsumexp_op.py @@ -0,0 +1,272 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
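The recurring change in the tests above is one pattern: allocate a tempfile.TemporaryDirectory in setUp, build every artifact path with os.path.join(temp_dir.name, ...), and release it in tearDown, instead of writing into the working directory and deleting files by hand. A minimal, standalone sketch of that pattern follows; the file name and JSON payload are placeholders, not values taken from the patch.

import json
import os
import tempfile
import unittest


class TestWithTempDir(unittest.TestCase):

    def setUp(self):
        # One scratch directory per test case; removed in tearDown.
        self.temp_dir = tempfile.TemporaryDirectory()

    def tearDown(self):
        self.temp_dir.cleanup()

    def test_json_round_trip(self):
        # Placeholder payload standing in for e.g. a cluster description.
        payload = {"machines": 2, "devices_per_machine": 4}
        path = os.path.join(self.temp_dir.name, "cluster.json")
        with open(path, "w") as f:
            json.dump(payload, f)
        with open(path) as f:
            self.assertEqual(json.load(f), payload)


if __name__ == "__main__":
    unittest.main()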
+ +from __future__ import print_function + +from typing import Optional +import unittest +import itertools +import numpy as np +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.framework import _test_eager_guard +from op_test import OpTest + + +def np_naive_logcumsumexp(x: np.ndarray, axis: Optional[int] = None): + return np.log(np.cumsum(np.exp(x), axis=axis)) + + +def np_logcumsumexp(x: np.ndarray, + axis: Optional[int] = None, + flatten: Optional[bool] = None, + reverse: bool = False, + exclusive: bool = False): + # `flatten` aligns with c++ op + if flatten: + assert axis in [0, None] + axis = None + + x = np.copy(x) + + if axis is None: + x = x.flatten() + axis = 0 + + if reverse: + x = np.flip(x, axis) + + dimensions = [range(dim) for dim in x.shape[:axis]] + + if exclusive: + x = np.roll(x, 1, axis) + for prefix_dim in itertools.product(*dimensions): + x[prefix_dim][0] = np.finfo(x.dtype).min + + for prefix_dim in itertools.product(*dimensions): + arr = x[prefix_dim] + for dim in range(1, arr.shape[0]): + arr[dim] = np.logaddexp(arr[dim - 1], arr[dim]) + + if reverse: + x = np.flip(x, axis) + + return x + + +def np_logcumsumexp_grad( + x: np.ndarray, + dout: np.ndarray, + axis: Optional[int] = None, + flatten: Optional[bool] = None, + reverse: bool = False, + exclusive: bool = False, +): + out = np_logcumsumexp(x, axis, flatten, reverse, exclusive) + log_grad_positive = np.where(dout > 0, np.log(dout), np.finfo(x.dtype).min) + log_grad_negative = np.where(dout < 0, np.log(-dout), np.finfo(x.dtype).min) + + output_pos = np.exp( + np_logcumsumexp(log_grad_positive - out, + axis=axis, + flatten=flatten, + reverse=not reverse, + exclusive=exclusive).reshape(x.shape) + x) + output_neg = np.exp( + np_logcumsumexp(log_grad_negative - out, + axis=axis, + flatten=flatten, + reverse=not reverse, + exclusive=exclusive).reshape(x.shape) + x) + + return output_pos - output_neg + + +class TestLogcumsumexp(unittest.TestCase): + + def run_imperative(self): + data_np = np.arange(12, dtype=np.float32).reshape(3, 4) + data = paddle.to_tensor(data_np) + + y = paddle.logcumsumexp(data) + z = np_logcumsumexp(data_np) + self.assertTrue(np.allclose(z, y.numpy())) + + y = paddle.logcumsumexp(data, axis=0) + z = np_logcumsumexp(data_np, axis=0) + self.assertTrue(np.allclose(z, y.numpy())) + + y = paddle.logcumsumexp(data, axis=-1) + z = np_logcumsumexp(data_np, axis=-1) + self.assertTrue(np.allclose(z, y.numpy())) + + y = paddle.logcumsumexp(data, dtype='float32') + self.assertTrue(y.dtype == core.VarDesc.VarType.FP32) + + y = paddle.logcumsumexp(data, axis=-2) + z = np_logcumsumexp(data_np, axis=-2) + self.assertTrue(np.allclose(z, y.numpy())) + + with self.assertRaises(IndexError): + y = paddle.logcumsumexp(data, axis=-3) + + with self.assertRaises(IndexError): + y = paddle.logcumsumexp(data, axis=2) + + data_np = np.arange(10000, 10024, dtype=np.float32) + data = paddle.to_tensor(data_np) + y = paddle.logcumsumexp(data) + z = np_naive_logcumsumexp(data_np) + # check that naive algorithm overflows + self.assertTrue(all(z == np.inf)) + z = np_logcumsumexp(data_np) + # check that our algorithm doesn't overflow + self.assertTrue(all(z != np.inf)) + self.assertTrue(np.allclose(z, y.numpy())) + + def run_static(self, use_gpu=False): + with fluid.program_guard(fluid.Program()): + data_np = np.random.random((5, 4)).astype(np.float32) + x = paddle.static.data('X', [5, 4]) + y = paddle.logcumsumexp(x) + y2 = 
paddle.logcumsumexp(x, axis=0) + y3 = paddle.logcumsumexp(x, axis=-1) + y4 = paddle.logcumsumexp(x, dtype='float64') + y5 = paddle.logcumsumexp(x, axis=-2) + + place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + out = exe.run(feed={'X': data_np}, + fetch_list=[ + y.name, + y2.name, + y3.name, + y4.name, + y5.name, + ]) + + z = np_logcumsumexp(data_np) + self.assertTrue(np.allclose(z, out[0])) + z = np_logcumsumexp(data_np, axis=0) + self.assertTrue(np.allclose(z, out[1])) + z = np_logcumsumexp(data_np, axis=-1) + self.assertTrue(np.allclose(z, out[2])) + self.assertTrue(out[3].dtype == np.float64) + z = np_logcumsumexp(data_np, axis=-2) + self.assertTrue(np.allclose(z, out[4])) + + def test_cpu(self): + paddle.disable_static(paddle.fluid.CPUPlace()) + self.run_imperative() + paddle.enable_static() + + self.run_static() + + def test_gpu(self): + if not fluid.core.is_compiled_with_cuda(): + return + paddle.disable_static(paddle.fluid.CUDAPlace(0)) + self.run_imperative() + paddle.enable_static() + + self.run_static(use_gpu=True) + + def test_name(self): + with fluid.program_guard(fluid.Program()): + x = paddle.static.data('x', [3, 4]) + y = paddle.logcumsumexp(x, name='out') + self.assertTrue('out' in y.name) + + def test_type_error(self): + with fluid.program_guard(fluid.Program()): + + with self.assertRaises(TypeError): + data_np = np.random.random((100, 100), dtype=np.int32) + x = paddle.static.data('X', [100, 100], dtype='int32') + y = paddle.logcumsumexp(x) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + out = exe.run(feed={'X': data_np}, fetch_list=[y.name]) + + +class BaseTestCases: + + class BaseOpTest(OpTest): + + def setUp(self): + self.op_type = "logcumsumexp" + input, attrs = self.input_and_attrs() + self.inputs = {'X': input} + self.attrs = attrs + self.outputs = {'Out': np_logcumsumexp(input, **attrs)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], + 'Out', + user_defined_grads=[ + np_logcumsumexp_grad(self.inputs['X'], + 1 / self.inputs['X'].size, + **self.attrs) + ]) + + def input_and_attrs(self): + raise NotImplementedError() + + +class TestLogcumsumexpOp1(BaseTestCases.BaseOpTest): + + def input_and_attrs(self): + return np.arange(100, dtype=np.float64).reshape(10, 10), { + 'axis': 0, + 'flatten': True, + 'reverse': True + } + + +class TestLogcumsumexpOp2(BaseTestCases.BaseOpTest): + + def input_and_attrs(self): + return np.arange(100, dtype=np.float64).reshape(10, 10), { + 'axis': 1, + 'reverse': True + } + + +class TestLogcumsumexpOp3(BaseTestCases.BaseOpTest): + + def input_and_attrs(self): + return np.arange(100, dtype=np.float64).reshape(10, 10), {'axis': 1} + + +class TestLogcumsumexpOp4(BaseTestCases.BaseOpTest): + + def input_and_attrs(self): + return np.arange(100, dtype=np.float64).reshape(10, 10), { + 'axis': 0, + 'flatten': True, + 'reverse': True, + 'exclusive': True + } + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py index 4da22817be296..08e7f8502dccf 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py @@ -22,6 +22,7 @@ import multiprocessing import numpy as np +import paddle 
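The overflow check in the logcumsumexp test above rests on the standard log-sum-exp rewrite: np.log(np.cumsum(np.exp(x))) saturates to inf as soon as exp(x) overflows, while accumulating with logaddexp keeps the running value in log space and stays finite. A small numpy-only sketch of that comparison, independent of the operator under test:

import numpy as np

x = np.arange(10000, 10024, dtype=np.float64)

# Naive form: exp(10000) overflows float64, so every prefix becomes inf.
with np.errstate(over="ignore"):
    naive = np.log(np.cumsum(np.exp(x)))

# Stable form: logaddexp(a, b) = log(exp(a) + exp(b)) evaluated without
# materializing exp(a) or exp(b), so the cumulative result stays finite.
stable = np.logaddexp.accumulate(x)

assert np.all(np.isinf(naive))
assert np.all(np.isfinite(stable))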
import paddle.fluid as fluid from paddle.io import Dataset, BatchSampler, DataLoader @@ -182,7 +183,7 @@ def test_main(self): class TestStaticDataLoaderReturnList(unittest.TestCase): - def test_single_place(self): + def run_single_place(self, num_workers): scope = fluid.Scope() image = fluid.data(name='image', shape=[None, IMAGE_SIZE], @@ -192,7 +193,7 @@ def test_single_place(self): dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM) dataloader = DataLoader(dataset, feed_list=[image, label], - num_workers=0, + num_workers=num_workers, batch_size=BATCH_SIZE, drop_last=True, return_list=True) @@ -203,7 +204,7 @@ def test_single_place(self): assert not isinstance(d[0], list) assert not isinstance(d[1], list) - def test_multi_place(self): + def run_multi_place(self, num_workers): scope = fluid.Scope() image = fluid.data(name='image', shape=[None, IMAGE_SIZE], @@ -213,7 +214,7 @@ def test_multi_place(self): dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM) dataloader = DataLoader(dataset, feed_list=[image, label], - num_workers=0, + num_workers=num_workers, batch_size=BATCH_SIZE, places=[fluid.CPUPlace()] * 2, drop_last=True, @@ -225,6 +226,12 @@ def test_multi_place(self): assert isinstance(d[0], list) assert isinstance(d[1], list) + def test_main(self): + paddle.enable_static() + for num_workers in [0, 2]: + self.run_single_place(num_workers) + self.run_multi_place(num_workers) + class RandomBatchedDataset(Dataset): diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py index 3cf35550c5819..d3052b719ae2d 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py @@ -19,6 +19,7 @@ import os import sys from io import BytesIO +import tempfile import paddle import paddle.nn as nn @@ -95,7 +96,10 @@ def train(layer, loader, loss_fn, opt): class TestSaveLoadLargeParameters(unittest.TestCase): def setUp(self): - pass + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() def test_large_parameters_paddle_save(self): # enable dygraph mode @@ -105,7 +109,8 @@ def test_large_parameters_paddle_save(self): layer = LayerWithLargeParameters() save_dict = layer.state_dict() - path = os.path.join("test_paddle_save_load_large_param_save", + path = os.path.join(self.temp_dir.name, + "test_paddle_save_load_large_param_save", "layer.pdparams") protocol = 4 paddle.save(save_dict, path, protocol=protocol) @@ -117,6 +122,12 @@ def test_large_parameters_paddle_save(self): class TestSaveLoadPickle(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def test_pickle_protocol(self): # enable dygraph mode paddle.disable_static() @@ -124,7 +135,8 @@ def test_pickle_protocol(self): layer = LinearNet() save_dict = layer.state_dict() - path = os.path.join("test_paddle_save_load_pickle_protocol", + path = os.path.join(self.temp_dir.name, + "test_paddle_save_load_pickle_protocol", "layer.pdparams") with self.assertRaises(ValueError): @@ -152,6 +164,12 @@ def test_pickle_protocol(self): class TestSaveLoadAny(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def set_zero(self, prog, place, scope=None): if scope is None: scope = fluid.global_scope() @@ -213,7 +231,8 @@ def test_replace_static_save_load(self): t = np.array(fluid.global_scope().find_var( 
var.name).get_tensor()) base_map[var.name] = t - path = os.path.join("test_replace_static_save_load", "model") + path = os.path.join(self.temp_dir.name, + "test_replace_static_save_load", "model") # paddle.save, legacy paddle.fluid.load self.replace_static_save(prog, path) self.set_zero(prog, place) @@ -239,7 +258,9 @@ def test_replace_static_save_load(self): for var in prog.list_vars(): if var.persistable: tensor = var.get_value(fluid.global_scope()) - paddle.save(tensor, os.path.join(path_vars, var.name)) + paddle.save( + tensor, + os.path.join(self.temp_dir.name, path_vars, var.name)) with self.assertRaises(TypeError): var.get_value('fluid.global_scope()') with self.assertRaises(ValueError): @@ -257,7 +278,8 @@ def test_replace_static_save_load(self): self.set_zero(prog, place) for var in prog.list_vars(): if var.persistable: - tensor = paddle.load(os.path.join(path_vars, var.name), + tensor = paddle.load(os.path.join(self.temp_dir.name, + path_vars, var.name), return_numpy=False) var.set_value(tensor) new_t = np.array(fluid.global_scope().find_var( @@ -293,7 +315,8 @@ def get_lr(self): y.mean().backward() adam.step() state_dict = adam.state_dict() - path = 'paddle_save_load_v2/model.pdparams' + path = os.path.join(self.temp_dir.name, + 'paddle_save_load_v2/model.pdparams') with self.assertRaises(TypeError): paddle.save(state_dict, path, use_binary_format='False') # legacy paddle.save, paddle.load @@ -317,7 +340,8 @@ def test_single_pickle_var_dygraph(self): # enable dygraph mode paddle.disable_static() layer = LinearNet() - path = 'paddle_save_load_v2/var_dygraph' + path = os.path.join(self.temp_dir.name, + 'paddle_save_load_v2/var_dygraph') tensor = layer._linear.weight with self.assertRaises(ValueError): paddle.save(tensor, path, pickle_protocol='3') @@ -361,7 +385,8 @@ def test_single_pickle_var_static(self): break scope = fluid.global_scope() origin_tensor = np.array(tensor) - path = 'test_single_pickle_var_static/var' + path = os.path.join(self.temp_dir.name, + 'test_single_pickle_var_static/var') paddle.save(tensor, path) self.set_zero(prog, place, scope) # static load @@ -383,7 +408,8 @@ def test_single_pickle_var_static(self): def test_dygraph_save_static_load(self): inps = np.random.randn(1, IMAGE_SIZE).astype('float32') - path = 'test_dygraph_save_static_load/dy-static.pdparams' + path = os.path.join(self.temp_dir.name, + 'test_dygraph_save_static_load/dy-static.pdparams') paddle.disable_static() with paddle.utils.unique_name.guard(): layer = LinearNet() @@ -427,10 +453,14 @@ def test_save_load_complex_object_dygraph_save(self): }) obj4 = (np.random.randn(5, 6), (123, )) - path1 = "test_save_load_any_complex_object_dygraph/obj1" - path2 = "test_save_load_any_complex_object_dygraph/obj2" - path3 = "test_save_load_any_complex_object_dygraph/obj3" - path4 = "test_save_load_any_complex_object_dygraph/obj4" + path1 = os.path.join(self.temp_dir.name, + "test_save_load_any_complex_object_dygraph/obj1") + path2 = os.path.join(self.temp_dir.name, + "test_save_load_any_complex_object_dygraph/obj2") + path3 = os.path.join(self.temp_dir.name, + "test_save_load_any_complex_object_dygraph/obj3") + path4 = os.path.join(self.temp_dir.name, + "test_save_load_any_complex_object_dygraph/obj4") paddle.save(obj1, path1) paddle.save(obj2, path2) paddle.save(obj3, path3) @@ -597,10 +627,18 @@ def test_save_load_complex_object_static_save(self): }) obj4 = (np.ndarray([3, 4], dtype="float32"), ) - path1 = "test_save_load_any_complex_object_static/obj1" - path2 = 
"test_save_load_any_complex_object_static/obj2" - path3 = "test_save_load_any_complex_object_static/obj3" - path4 = "test_save_load_any_complex_object_static/obj4" + path1 = os.path.join( + self.temp_dir.name, + "test_save_load_any_complex_object_static/obj1") + path2 = os.path.join( + self.temp_dir.name, + "test_save_load_any_complex_object_static/obj2") + path3 = os.path.join( + self.temp_dir.name, + "test_save_load_any_complex_object_static/obj3") + path4 = os.path.join( + self.temp_dir.name, + "test_save_load_any_complex_object_static/obj4") paddle.save(obj1, path1) paddle.save(obj2, path2) paddle.save(obj3, path3) @@ -763,7 +801,8 @@ def test_save_load_complex_object_static_save(self): def test_varbase_binary_var(self): paddle.disable_static() varbase = paddle.randn([3, 2], dtype='float32') - path = 'test_paddle_save_load_varbase_binary_var/varbase' + path = os.path.join(self.temp_dir.name, + 'test_paddle_save_load_varbase_binary_var/varbase') paddle.save(varbase, path, use_binary_format=True) load_array = paddle.load(path, return_numpy=True) load_tensor = paddle.load(path, return_numpy=False) @@ -851,6 +890,10 @@ def setUp(self): # config seed paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() def build_and_train_model(self): # create network @@ -878,8 +921,10 @@ def test_save_load(self): layer, opt = self.build_and_train_model() # save - layer_save_path = "test_paddle_save_load.linear.pdparams" - opt_save_path = "test_paddle_save_load.linear.pdopt" + layer_save_path = os.path.join(self.temp_dir.name, + "test_paddle_save_load.linear.pdparams") + opt_save_path = os.path.join(self.temp_dir.name, + "test_paddle_save_load.linear.pdopt") layer_state_dict = layer.state_dict() opt_state_dict = opt.state_dict() @@ -895,7 +940,9 @@ def test_save_load(self): # test save load in static mode paddle.enable_static() - static_save_path = "static_mode_test/test_paddle_save_load.linear.pdparams" + static_save_path = os.path.join( + self.temp_dir.name, + "static_mode_test/test_paddle_save_load.linear.pdparams") paddle.save(layer_state_dict, static_save_path) load_static_state_dict = paddle.load(static_save_path) self.check_load_state_dict(layer_state_dict, load_static_state_dict) @@ -906,21 +953,30 @@ def test_save_load(self): # 2. test save path format error with self.assertRaises(ValueError): - paddle.save(layer_state_dict, "test_paddle_save_load.linear.model/") + paddle.save( + layer_state_dict, + os.path.join(self.temp_dir.name, + "test_paddle_save_load.linear.model/")) # 3. test load path not exist error with self.assertRaises(ValueError): - paddle.load("test_paddle_save_load.linear.params") + paddle.load( + os.path.join(self.temp_dir.name, + "test_paddle_save_load.linear.params")) # 4. 
test load old save path error with self.assertRaises(ValueError): - paddle.load("test_paddle_save_load.linear") + paddle.load( + os.path.join(self.temp_dir.name, + "test_paddle_save_load.linear")) class TestSaveLoadProgram(unittest.TestCase): def test_save_load_program(self): paddle.enable_static() + temp_dir = tempfile.TemporaryDirectory() + with new_program_scope(): layer = LinearNet() data = paddle.static.data(name='x_static_save', @@ -931,8 +987,12 @@ def test_save_load_program(self): startup_program = paddle.static.default_startup_program() origin_main = main_program.desc.serialize_to_string() origin_startup = startup_program.desc.serialize_to_string() - path1 = "test_paddle_save_load_program/main_program.pdmodel" - path2 = "test_paddle_save_load_program/startup_program.pdmodel" + path1 = os.path.join( + temp_dir.name, + "test_paddle_save_load_program/main_program.pdmodel") + path2 = os.path.join( + temp_dir.name, + "test_paddle_save_load_program/startup_program.pdmodel") paddle.save(main_program, path1) paddle.save(startup_program, path2) @@ -941,12 +1001,14 @@ def test_save_load_program(self): load_startup = paddle.load(path2).desc.serialize_to_string() self.assertTrue(origin_main == load_main) self.assertTrue(origin_startup == load_startup) + temp_dir.cleanup() class TestSaveLoadLayer(unittest.TestCase): def test_save_load_layer(self): paddle.disable_static() + temp_dir = tempfile.TemporaryDirectory() inps = paddle.randn([1, IMAGE_SIZE], dtype='float32') layer1 = LinearNet() layer2 = LinearNet() @@ -954,9 +1016,11 @@ def test_save_load_layer(self): layer2.eval() origin_layer = (layer1, layer2) origin = (layer1(inps), layer2(inps)) - path = "test_save_load_layer_/layer.pdmodel" + path = os.path.join(temp_dir.name, + "test_save_load_layer_/layer.pdmodel") with self.assertRaises(ValueError): paddle.save(origin_layer, path) + temp_dir.cleanup() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py index bba65e469abe1..c7ac11546e12e 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py @@ -21,6 +21,7 @@ import sys import six import platform +import tempfile import paddle import paddle.nn as nn @@ -38,6 +39,10 @@ class TestSaveLoadBinaryFormat(unittest.TestCase): def setUp(self): # enable static graph mode paddle.enable_static() + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() def set_zero(self, prog, place, scope=None): if scope is None: @@ -97,7 +102,8 @@ def test_replace_save_load_vars(self): self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t # test for replace_save_vars/io.load_vars - path_vars1 = 'test_replace_save_load_vars_binary1/model' + path_vars1 = os.path.join( + self.temp_dir.name, 'test_replace_save_load_vars_binary1/model') self.replace_save_vars(prog, path_vars1) # set var to zero self.set_zero(prog, place) @@ -116,7 +122,9 @@ def test_replace_save_load_vars(self): self.assertTrue(np.array_equal(new_t, base_t)) # test for io.save_vars/replace_load_vars - path_vars2 = 'test_replace_save_load_vars_binary2/model/' + path_vars2 = os.path.join( + self.temp_dir.name, + 'test_replace_save_load_vars_binary2/model/') fluid.io.save_vars(exe, path_vars2, main_program=prog, @@ -149,7 +157,8 @@ def test_save_load_lod_tensor(self): prog = paddle.static.default_main_program() 
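The hunks above and below all apply the same fix: each test case creates a `tempfile.TemporaryDirectory()` in `setUp`, joins every save path onto `self.temp_dir.name`, and cleans the directory up in `tearDown`, so the suites stop writing into the working directory. A minimal sketch of that pattern, with an illustrative test class and file name that are not part of the diff:

    import os
    import tempfile
    import unittest

    import paddle


    class TestSaveLoadInTempDir(unittest.TestCase):
        """Hypothetical test case showing the temp-dir pattern used above."""

        def setUp(self):
            # Everything the test writes goes under this directory.
            self.temp_dir = tempfile.TemporaryDirectory()

        def tearDown(self):
            # Removes the directory and every file saved into it.
            self.temp_dir.cleanup()

        def test_state_dict_round_trip(self):
            path = os.path.join(self.temp_dir.name, "linear", "model.pdparams")
            state_dict = paddle.nn.Linear(4, 4).state_dict()
            # The tests above rely on paddle.save creating missing parent dirs.
            paddle.save(state_dict, path)
            loaded = paddle.load(path)
            self.assertEqual(sorted(loaded.keys()), sorted(state_dict.keys()))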
exe.run(fluid.default_startup_program()) - dirname = 'test_save_load_lod_tensor1/tensor_' + dirname = os.path.join(self.temp_dir.name, + 'test_save_load_lod_tensor1/tensor_') for var in prog.list_vars(): if var.persistable and list( var.shape) == [IMAGE_SIZE, OUTPUT_NUM]: @@ -171,12 +180,13 @@ def test_save_load_lod_tensor(self): self.assertTrue(np.array_equal(origin, to_array)) with self.assertRaises(NotImplementedError): - path = 'test_save_load_error/temp' + path = os.path.join(self.temp_dir.name, 'test_save_load_error/temp') paddle.save({}, path, use_binary_format=True) # On the Windows platform, when parsing a string that can't be parsed as a `Program`, `desc_.ParseFromString` has a timeout risk. if 'Windows' != platform.system(): with self.assertRaises(ValueError): - path = 'test_save_load_error/temp' + path = os.path.join(self.temp_dir.name, + 'test_save_load_error/temp') with open(path, "w") as f: f.write('\0') paddle.load(path) @@ -187,11 +197,17 @@ def test_save_load_lod_tensor(self): with self.assertRaises(RuntimeError): fluid.core.save_lod_tensor( - temp_lod, 'test_save_load_error_not_exist_file/not_exist_file') + temp_lod, + os.path.join( + self.temp_dir.name, + 'test_save_load_error_not_exist_file/not_exist_file')) with self.assertRaises(RuntimeError): fluid.core.load_lod_tensor( - temp_lod, 'test_save_load_error_not_exist_file/not_exist_file') + temp_lod, + os.path.join( + self.temp_dir.name, + 'test_save_load_error_not_exist_file/not_exist_file')) # save to memory byio = BytesIO() @@ -215,7 +231,8 @@ def test_save_load_selected_rows(self): rows = [0, 4, 7] row_numel = 12 selected_rows = fluid.core.SelectedRows(rows, height) - path = 'test_paddle_save_load_selected_rows/sr.pdsr' + path = os.path.join(self.temp_dir.name, + 'test_paddle_save_load_selected_rows/sr.pdsr') with self.assertRaises(ValueError): paddle.save(selected_rows, path, use_binary_format=True) @@ -236,11 +253,15 @@ def test_save_load_selected_rows(self): with self.assertRaises(RuntimeError): fluid.core.save_selected_rows( selected_rows, - 'test_paddle_save_load_selected_rows_not_exist_file/temp') + os.path.join( + self.temp_dir.name, + 'test_paddle_save_load_selected_rows_not_exist_file/temp')) with self.assertRaises(RuntimeError): fluid.core.load_selected_rows( selected_rows, - 'test_paddle_save_load_selected_rows_not_exist_file/temp') + os.path.join( + self.temp_dir.name, + 'test_paddle_save_load_selected_rows_not_exist_file/temp')) # save to memory byio = BytesIO() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_model.py b/python/paddle/fluid/tests/unittests/test_sparse_model.py new file mode 100644 index 0000000000000..90f30e383174c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_model.py @@ -0,0 +1,64 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import unittest
+import numpy as np
+import paddle
+from paddle.incubate import sparse
+from paddle.incubate.sparse import nn
+from paddle.fluid.framework import _test_eager_guard
+
+
+class TestGradientAdd(unittest.TestCase):
+
+    def sparse(self, sp_x):
+        identity = sp_x
+        out = nn.functional.relu(sp_x)
+        values = out.values() + identity.values()
+        out = sparse.sparse_coo_tensor(out.indices(),
+                                       values,
+                                       shape=out.shape,
+                                       stop_gradient=out.stop_gradient)
+        return out
+
+    def dense(self, x):
+        identity = x
+        out = paddle.nn.functional.relu(x)
+        out = out + identity
+        return out
+
+    def test(self):
+        with _test_eager_guard():
+            x = paddle.randn((3, 3))
+            sparse_x = x.to_sparse_coo(sparse_dim=2)
+
+            x.stop_gradient = False
+            sparse_x.stop_gradient = False
+
+            dense_out = self.dense(x)
+            loss = dense_out.mean()
+            loss.backward(retain_graph=True)
+
+            sparse_out = self.sparse(sparse_x)
+            sparse_loss = sparse_out.values().mean()
+            sparse_loss.backward(retain_graph=True)
+
+            assert np.allclose(dense_out.numpy(), sparse_out.to_dense().numpy())
+            assert np.allclose(loss.numpy(), sparse_loss.numpy())
+            assert np.allclose(x.grad.numpy(), sparse_x.grad.to_dense().numpy())
+
+            loss.backward()
+            sparse_loss.backward()
+
+            assert np.allclose(x.grad.numpy(), sparse_x.grad.to_dense().numpy())
diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py
index 5705763e0af5f..a72757d5005bd 100644
--- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py
@@ -168,31 +168,33 @@ def test_coo_to_dense(self):
         with _test_eager_guard():
             indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]]
             values = [1.0, 2.0, 3.0, 4.0, 5.0]
-            sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
-                paddle.to_tensor(indices),
-                paddle.to_tensor(values),
-                shape=[3, 4],
-                stop_gradient=False)
-            dense_tensor = sparse_x.to_dense()
-            #test to_dense_grad backward
-            out_grad = [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0],
-                        [9.0, 10.0, 11.0, 12.0]]
-            dense_tensor.backward(paddle.to_tensor(out_grad))
-            #mask the out_grad by sparse_x.indices()
-            correct_x_grad = [2.0, 4.0, 7.0, 9.0, 10.0]
-            assert np.array_equal(correct_x_grad,
-                                  sparse_x.grad.values().numpy())
-
-            paddle.device.set_device("cpu")
-            sparse_x_cpu = paddle.incubate.sparse.sparse_coo_tensor(
-                paddle.to_tensor(indices),
-                paddle.to_tensor(values),
-                shape=[3, 4],
-                stop_gradient=False)
-            dense_tensor_cpu = sparse_x_cpu.to_dense()
-            dense_tensor_cpu.backward(paddle.to_tensor(out_grad))
-            assert np.array_equal(correct_x_grad,
-                                  sparse_x_cpu.grad.values().numpy())
+            indices_dtypes = ['int32', 'int64']
+            for indices_dtype in indices_dtypes:
+                sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
+                    paddle.to_tensor(indices, dtype=indices_dtype),
+                    paddle.to_tensor(values),
+                    shape=[3, 4],
+                    stop_gradient=False)
+                dense_tensor = sparse_x.to_dense()
+                #test to_dense_grad backward
+                out_grad = [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0],
+                            [9.0, 10.0, 11.0, 12.0]]
+                dense_tensor.backward(paddle.to_tensor(out_grad))
+                #mask the out_grad by sparse_x.indices()
+                correct_x_grad = [2.0, 4.0, 7.0, 9.0, 10.0]
+                assert np.array_equal(correct_x_grad,
+                                      sparse_x.grad.values().numpy())
+
+                paddle.device.set_device("cpu")
+                sparse_x_cpu = paddle.incubate.sparse.sparse_coo_tensor(
+                    paddle.to_tensor(indices, dtype=indices_dtype),
+                    paddle.to_tensor(values),
+                    shape=[3, 4],
+                    stop_gradient=False)
+                dense_tensor_cpu = sparse_x_cpu.to_dense()
+
dense_tensor_cpu.backward(paddle.to_tensor(out_grad)) + assert np.array_equal(correct_x_grad, + sparse_x_cpu.grad.values().numpy()) fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_to_sparse_csr(self): diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load.py b/python/paddle/fluid/tests/unittests/test_static_save_load.py index 9c44785d1c469..8a4f8f9201317 100644 --- a/python/paddle/fluid/tests/unittests/test_static_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_static_save_load.py @@ -30,6 +30,7 @@ import pickle import os import errno +import tempfile paddle.enable_static() @@ -240,6 +241,7 @@ def test_ptb_rnn_cpu_float32(self): init_scale = 0.1 batch_size = 4 batch_num = 200 + temp_dir = tempfile.TemporaryDirectory() with new_program_scope(): fluid.default_startup_program().random_seed = seed @@ -309,7 +311,7 @@ def test_ptb_rnn_cpu_float32(self): self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t - fluid.save(main_program, "./test_1") + fluid.save(main_program, os.path.join(temp_dir.name, "test_1")) # set var to zero for var in main_program.list_vars(): @@ -322,7 +324,8 @@ def test_ptb_rnn_cpu_float32(self): # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) - fluid.load(main_program, "./test_1.pdparams", exe) + fluid.load(main_program, + os.path.join(temp_dir.name, "test_1.pdparams"), exe) for var in main_program.list_vars(): if isinstance(var, framework.Parameter) or var.persistable: @@ -330,6 +333,7 @@ def test_ptb_rnn_cpu_float32(self): var.name).get_tensor()) base_t = base_map[var.name] self.assertTrue(np.array_equal(new_t, base_t)) + temp_dir.cleanup() class TestSaveLoadPartial(unittest.TestCase): @@ -347,6 +351,7 @@ def test_ptb_rnn_cpu_float32(self): init_scale = 0.1 batch_size = 4 batch_num = 200 + temp_dir = tempfile.TemporaryDirectory() with new_program_scope(): fluid.default_startup_program().random_seed = seed @@ -424,7 +429,7 @@ def test_ptb_rnn_cpu_float32(self): self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t - fluid.save(main_program, "./test_1") + fluid.save(main_program, os.path.join(temp_dir.name, "test_1")) # set var to zero for var in main_program.list_vars(): @@ -437,7 +442,8 @@ def test_ptb_rnn_cpu_float32(self): # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) - fluid.load(test_program, "./test_1.pdopt", None) + fluid.load(test_program, os.path.join(temp_dir.name, + "test_1.pdopt"), None) for var in test_program.list_vars(): if isinstance(var, framework.Parameter) or var.persistable: @@ -445,7 +451,9 @@ def test_ptb_rnn_cpu_float32(self): var.name).get_tensor()) base_t = base_map[var.name] self.assertTrue(np.array_equal(new_t, base_t)) - fluid.load(test_program, "./test_1.pdmodel", None) + fluid.load(test_program, + os.path.join(temp_dir.name, "test_1.pdmodel"), None) + temp_dir.cleanup() class TestSaveLoadSetStateDict(unittest.TestCase): @@ -463,6 +471,7 @@ def test_ptb_rnn_cpu_float32(self): init_scale = 0.1 batch_size = 4 batch_num = 200 + temp_dir = tempfile.TemporaryDirectory() with new_program_scope(): fluid.default_startup_program().random_seed = seed @@ -532,7 +541,7 @@ def test_ptb_rnn_cpu_float32(self): self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t - fluid.save(main_program, "./test_1") + fluid.save(main_program, os.path.join(temp_dir.name, "test_1")) # set var to zero for var in main_program.list_vars(): @@ -545,7 +554,7 @@ def 
test_ptb_rnn_cpu_float32(self): # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) - fluid.load(main_program, "./test_1", exe) + fluid.load(main_program, os.path.join(temp_dir.name, "test_1"), exe) for var in main_program.list_vars(): if isinstance(var, framework.Parameter) or var.persistable: @@ -553,6 +562,7 @@ def test_ptb_rnn_cpu_float32(self): var.name).get_tensor()) base_t = base_map[var.name] self.assertTrue(np.array_equal(new_t, base_t)) + temp_dir.cleanup() class TestProgramStatePartial(unittest.TestCase): @@ -570,6 +580,7 @@ def test_ptb_rnn_cpu_float32(self): init_scale = 0.1 batch_size = 4 batch_num = 200 + temp_dir = tempfile.TemporaryDirectory() with new_program_scope(): fluid.default_startup_program().random_seed = seed @@ -647,7 +658,7 @@ def test_ptb_rnn_cpu_float32(self): self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t - fluid.save(main_program, os.path.join('some_dir', 'test_1')) + fluid.save(main_program, os.path.join(temp_dir.name, 'test_1')) # set var to zero for var in main_program.list_vars(): @@ -662,16 +673,16 @@ def test_ptb_rnn_cpu_float32(self): #fluid.load(test_program, "./test_1", None ) program_state = fluid.load_program_state( - os.path.join('some_dir', 'test_1')) + os.path.join(temp_dir.name, 'test_1')) program_state_1 = fluid.load_program_state( - os.path.join('some_dir', 'test_1.pdparams')) + os.path.join(temp_dir.name, 'test_1.pdparams')) program_state_2 = fluid.load_program_state( - os.path.join('some_dir', 'test_1.pdopt')) + os.path.join(temp_dir.name, 'test_1.pdopt')) program_state_3 = fluid.load_program_state( - os.path.join('some_dir', 'test_1.pdmodel')) + os.path.join(temp_dir.name, 'test_1.pdmodel')) fluid.set_program_state(test_program, program_state) @@ -741,6 +752,7 @@ def test_ptb_rnn_cpu_float32(self): var.name).get_tensor()) base_t = base_map[var.name] self.assertTrue(np.array_equal(new_t, base_t)) + temp_dir.cleanup() class TestVariableInit(unittest.TestCase): @@ -759,7 +771,9 @@ def test_variable_init(self): exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - fluid.save(fluid.default_main_program(), "./test_path") + temp_dir = tempfile.TemporaryDirectory() + fluid.save(fluid.default_main_program(), + os.path.join(temp_dir.name, "test_path")) def set_var(var, ndarray): t = var.get_tensor() @@ -785,7 +799,7 @@ def set_var(var, ndarray): fluid.core._create_loaded_parameter(parameter_list, new_scope, exe._default_executor) - parameter_file_name = "./test_path.pdparams" + parameter_file_name = os.path.join(temp_dir.name, "test_path.pdparams") with open(parameter_file_name, 'rb') as f: load_dict = pickle.load(f) @@ -801,7 +815,7 @@ def set_var(var, ndarray): fluid.core._create_loaded_parameter(opt_list, new_scope, exe._default_executor) - opt_file_name = "./test_path.pdopt" + opt_file_name = os.path.join(temp_dir.name, "test_path.pdopt") with open(opt_file_name, 'rb') as f: load_dict = pickle.load(f) @@ -827,6 +841,7 @@ def set_var(var, ndarray): base_t = base_map[var.name] self.assertTrue(np.array_equal(new_t, base_t)) + temp_dir.cleanup() class TestLoadFromOldInterface(unittest.TestCase): @@ -838,10 +853,15 @@ def setUp(self): if os.path.exists("test_static_load_var_list.pdparams"): os.remove("test_static_load_var_list.pdparams") + self.temp_dir = tempfile.TemporaryDirectory() + def set_place(self): return fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0) + def tearDown(self): + self.temp_dir.cleanup() + def 
test_load_from_old_interface(self): seed = 90 hidden_size = 10 @@ -923,7 +943,9 @@ def test_load_from_old_interface(self): base_map[var.name] = t #fluid.save(main_program, "./test_1") - fluid.io.save_persistables(exe, "test_path", main_program) + fluid.io.save_persistables( + exe, os.path.join(self.temp_dir.name, "test_path"), + main_program) # set var to zero for var in main_program.list_vars(): @@ -936,7 +958,8 @@ def test_load_from_old_interface(self): # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) - fluid.load(main_program, "test_path", exe) + fluid.load(main_program, + os.path.join(self.temp_dir.name, "test_path"), exe) for var in main_program.list_vars(): if isinstance(var, framework.Parameter) or var.persistable: @@ -953,11 +976,13 @@ def test_load_from_old_interface(self): var.desc.set_shape(new_shape) with self.assertRaises(RuntimeError): - fluid.load(main_program, "test_path", exe) + fluid.load(main_program, + os.path.join(self.temp_dir.name, "test_path"), exe) # check unused parameter - fluid.load(test_clone_program, "test_path", exe) + fluid.load(test_clone_program, + os.path.join(self.temp_dir.name, "test_path"), exe) def test_load_from_old_interface_var_list(self): seed = 90 @@ -1040,8 +1065,10 @@ def test_load_from_old_interface_var_list(self): base_map[var.name] = t #fluid.save(main_program, "./test_1") - fluid.io.save_persistables(exe, "test_static_load_var_list", - main_program) + fluid.io.save_persistables( + exe, + os.path.join(self.temp_dir.name, "test_static_load_var_list"), + main_program) # set var to zero var_list = [] @@ -1057,7 +1084,10 @@ def test_load_from_old_interface_var_list(self): # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) - fluid.load(main_program, "test_static_load_var_list", exe, var_list) + fluid.load( + main_program, + os.path.join(self.temp_dir.name, "test_static_load_var_list"), + exe, var_list) var_list_names = [var.name for var in var_list] for var in main_program.list_vars(): if isinstance(var, framework.Parameter) or var.persistable: @@ -1087,6 +1117,7 @@ def test_load_from_old_interface(self): init_scale = 0.1 batch_size = 4 batch_num = 200 + temp_dir = tempfile.TemporaryDirectory() with new_program_scope(): fluid.default_startup_program().random_seed = seed @@ -1155,10 +1186,10 @@ def test_load_from_old_interface(self): # make sure all the paramerter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t - + save_dir = os.path.join(temp_dir.name, "test_path") #fluid.save(main_program, "./test_1") fluid.io.save_persistables(exe, - "test_path", + save_dir, main_program, filename="model_single") @@ -1173,7 +1204,7 @@ def test_load_from_old_interface(self): # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) - file_model_path = os.path.join("test_path", "model_single") + file_model_path = os.path.join(save_dir, "model_single") fluid.load(main_program, file_model_path, exe, fluid.io.get_program_persistable_vars(main_program)) @@ -1223,12 +1254,17 @@ def test_load_from_old_interface(self): all_var_list = list(main_program.list_vars()) fluid.load(main_program, file_model_path, exe, all_var_list + [temp_var]) + temp_dir.cleanup() class TestProgramStateOldSave(unittest.TestCase): def setUp(self): self.test_dygraph = True + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + 
self.temp_dir.cleanup() def set_place(self): return fluid.CPUPlace( @@ -1319,8 +1355,8 @@ def test_ptb_rnn_cpu_float32(self): # make sure all the paramerter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t - - fluid.io.save_persistables(exe, "test_program_1", main_program) + save_dir = os.path.join(self.temp_dir.name, "test_program_1") + fluid.io.save_persistables(exe, save_dir, main_program) # set var to zero for var in main_program.list_vars(): @@ -1334,7 +1370,7 @@ def test_ptb_rnn_cpu_float32(self): self.assertTrue(np.sum(np.abs(new_t)) == 0) # case 1: load basic - program_state = fluid.load_program_state("test_program_1") + program_state = fluid.load_program_state(save_dir) fluid.set_program_state(main_program, program_state) self.check_in_static(main_program, base_map) @@ -1349,24 +1385,20 @@ def symlink_force(target, link_name): else: raise e - orig_filepath = './test_program_1/fc_0.w_0' - symlink_filepath = './test_program_1/link_fc_0.w_0' - # create a needless link file for coverage - symlink_force(orig_filepath, symlink_filepath) - program_state = fluid.load_program_state("test_program_1") + program_state = fluid.load_program_state(save_dir) fluid.set_program_state(main_program, program_state) self.check_in_static(main_program, base_map) # case 3: load with var_list program_state = fluid.load_program_state( - "test_program_1", main_program.all_parameters()) + save_dir, main_program.all_parameters()) fluid.set_program_state(main_program, program_state) self.check_in_static(main_program, base_map) if self.test_dygraph: # make sure `load_program_state` can be used in dynamic graph mode with fluid.dygraph.guard(place): - load_state = fluid.load_program_state("test_program_1") + load_state = fluid.load_program_state(save_dir) for k, v in load_state.items(): self.assertTrue(np.array_equal(base_map[k], v)) @@ -1402,6 +1434,7 @@ def test_ptb_rnn_cpu_float32(self): init_scale = 0.1 batch_size = 4 batch_num = 200 + temp_dir = tempfile.TemporaryDirectory() with new_program_scope(): fluid.default_startup_program().random_seed = seed @@ -1479,8 +1512,9 @@ def test_ptb_rnn_cpu_float32(self): self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t + save_dir = os.path.join(temp_dir.name, "test_program_2") fluid.io.save_persistables(exe, - "test_program_2", + save_dir, main_program, filename="model_1") @@ -1497,7 +1531,7 @@ def test_ptb_rnn_cpu_float32(self): #fluid.load(test_program, "./test_1", None ) program_state = fluid.load_program_state( - os.path.join("test_program_2", "model_1"), + os.path.join(save_dir, "model_1"), var_list=fluid.io.get_program_persistable_vars(main_program)) fluid.set_program_state(main_program, program_state) @@ -1509,21 +1543,20 @@ def test_ptb_rnn_cpu_float32(self): self.assertTrue(np.array_equal(new_t, base_t)) with self.assertRaises(ValueError): - fluid.load_program_state( - os.path.join("test_program_2", "model_1")) + fluid.load_program_state(os.path.join(save_dir, "model_1")) with self.assertRaises(TypeError): - fluid.load_program_state(os.path.join("test_program_2", - "model_1"), + fluid.load_program_state(os.path.join(save_dir, "model_1"), var_list=["str"]) with self.assertRaises(RuntimeError): fluid.load_program_state( - os.path.join("test_program_2", "model_1"), + os.path.join(save_dir, "model_1"), var_list=[ main_program.global_block().create_var( name="fake_var_name", persistable=True) ]) + temp_dir.cleanup() class TestStaticSaveLoadPickle(unittest.TestCase): @@ -1552,7 +1585,8 @@ def 
test_pickle_protocol(self): self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t - path = os.path.join("test_static_save_load_pickle", + temp_dir = tempfile.TemporaryDirectory() + path = os.path.join(temp_dir.name, "test_static_save_load_pickle", "pickle_protocol") with self.assertRaises(ValueError): diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load_bf16.py b/python/paddle/fluid/tests/unittests/test_static_save_load_bf16.py index 25619aa4a5c04..6da849a44bdf4 100644 --- a/python/paddle/fluid/tests/unittests/test_static_save_load_bf16.py +++ b/python/paddle/fluid/tests/unittests/test_static_save_load_bf16.py @@ -23,12 +23,20 @@ from paddle.fluid.tests.unittests.test_imperative_base import new_program_scope from paddle.fluid.tests.unittests.test_static_save_load import PtbModel import numpy as np +import tempfile +import os @unittest.skipIf(not core.supports_bfloat16(), "place does not support BF16 evaluation") class TestSaveLoadBF16(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def set_place(self): return fluid.CPUPlace() @@ -111,8 +119,8 @@ def test_ptb_rnn_cpu_bfloat16(self): # make sure all the paramerter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t - - fluid.save(main_program, "./test_1") + save_dir = os.path.join(self.temp_dir.name, "test_1") + fluid.save(main_program, save_dir) # set var to zero for var in main_program.list_vars(): @@ -125,7 +133,8 @@ def test_ptb_rnn_cpu_bfloat16(self): # make sure all the paramerter or optimizer var have been set to zero self.assertTrue(np.sum(np.abs(new_t)) == 0) - fluid.load(main_program, "./test_1.pdparams", exe) + fluid.load(main_program, + os.path.join(self.temp_dir.name, "test_1.pdparams"), exe) for var in main_program.list_vars(): if isinstance(var, framework.Parameter) or var.persistable: diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load_large.py b/python/paddle/fluid/tests/unittests/test_static_save_load_large.py index fdb6a1f2f0585..e45cd59b444b9 100644 --- a/python/paddle/fluid/tests/unittests/test_static_save_load_large.py +++ b/python/paddle/fluid/tests/unittests/test_static_save_load_large.py @@ -23,6 +23,7 @@ import numpy as np import pickle import os +import tempfile LARGE_PARAM = 2**26 @@ -51,9 +52,10 @@ def test_large_parameters_static_save(self): # make sure all the paramerter or optimizer var have been update self.assertTrue(np.sum(np.abs(t)) != 0) base_map[var.name] = t - - path = os.path.join("test_static_save_load_large_param", - "static_save") + temp_dir = tempfile.TemporaryDirectory() + path = os.path.join(temp_dir.name, + "test_static_save_load_large_param") + path = os.path.join(path, "static_save") protocol = 4 paddle.fluid.save(prog, path, pickle_protocol=protocol) # set var to zero @@ -93,6 +95,7 @@ def test_large_parameters_static_save(self): var.name).get_tensor()) base_t = base_map[var.name] self.assertTrue(np.array_equal(new_t, base_t)) + temp_dir.cleanup() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_triplet_margin_with_distance_loss.py b/python/paddle/fluid/tests/unittests/test_triplet_margin_with_distance_loss.py new file mode 100644 index 0000000000000..0fb8ae22c26a6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_triplet_margin_with_distance_loss.py @@ -0,0 +1,417 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
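The static-graph hunks above lean on the convention that `fluid.save(program, prefix)` writes `prefix + '.pdparams'`, `prefix + '.pdopt'` and `prefix + '.pdmodel'`, and that `fluid.load` / `fluid.load_program_state` accept either the bare prefix or one of those files; that is why only the prefix needs to be re-rooted under the temporary directory. A rough sketch of the round trip under that assumption (the tiny network and names are illustrative):

    import os
    import tempfile

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()

    main_prog = paddle.static.default_main_program()
    startup_prog = paddle.static.default_startup_program()
    with paddle.static.program_guard(main_prog, startup_prog):
        x = paddle.static.data(name="x", shape=[None, 8], dtype="float32")
        pred = paddle.static.nn.fc(x, size=2)

    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(startup_prog)

    with tempfile.TemporaryDirectory() as tmp:
        prefix = os.path.join(tmp, "test_model")
        # Writes test_model.pdparams / test_model.pdopt / test_model.pdmodel.
        fluid.save(main_prog, prefix)
        # Either the prefix or one of the generated files can be loaded back.
        fluid.load(main_prog, prefix, exe)
        fluid.load(main_prog, prefix + ".pdparams", exe)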
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import numpy as np +import unittest + + +def call_TripletMarginDistanceLoss_layer( + input, + positive, + negative, + distance_function=None, + margin=0.3, + swap=False, + reduction='mean', +): + triplet_margin_with_distance_loss = paddle.nn.TripletMarginWithDistanceLoss( + distance_function=distance_function, + margin=margin, + swap=swap, + reduction=reduction) + res = triplet_margin_with_distance_loss( + input=input, + positive=positive, + negative=negative, + ) + return res + + +def call_TripletMaginDistanceLoss_functional( + input, + positive, + negative, + distance_function=None, + margin=0.3, + swap=False, + reduction='mean', +): + res = paddle.nn.functional.triplet_margin_with_distance_loss( + input=input, + positive=positive, + negative=negative, + distance_function=distance_function, + margin=margin, + swap=swap, + reduction=reduction) + return res + + +def test_static(place, + input_np, + positive_np, + negative_np, + distance_function=None, + margin=0.3, + swap=False, + reduction='mean', + functional=False): + prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(prog, startup_prog): + input = paddle.static.data(name='input', + shape=input_np.shape, + dtype='float64') + positive = paddle.static.data(name='positive', + shape=positive_np.shape, + dtype='float64') + negative = paddle.static.data(name='negative', + shape=negative_np.shape, + dtype='float64') + feed_dict = { + "input": input_np, + "positive": positive_np, + "negative": negative_np + } + + if functional: + res = call_TripletMaginDistanceLoss_functional( + input=input, + positive=positive, + negative=negative, + distance_function=distance_function, + margin=margin, + swap=swap, + reduction=reduction) + else: + res = call_TripletMarginDistanceLoss_layer( + input=input, + positive=positive, + negative=negative, + distance_function=distance_function, + margin=margin, + swap=swap, + reduction=reduction) + + exe = paddle.static.Executor(place) + static_result = exe.run(prog, feed=feed_dict, fetch_list=[res]) + + return static_result + + +def test_dygraph(place, + input, + positive, + negative, + distance_function=None, + margin=0.3, + swap=False, + reduction='mean', + functional=False): + paddle.disable_static() + input = paddle.to_tensor(input) + positive = paddle.to_tensor(positive) + negative = paddle.to_tensor(negative) + + if functional: + dy_res = call_TripletMaginDistanceLoss_functional( + input=input, + positive=positive, + negative=negative, + distance_function=distance_function, + margin=margin, + swap=swap, + reduction=reduction) + else: + dy_res = call_TripletMarginDistanceLoss_layer( + input=input, + positive=positive, + negative=negative, + distance_function=distance_function, + margin=margin, + swap=swap, + reduction=reduction) + dy_result = dy_res.numpy() + paddle.enable_static() + return dy_result + + +def calc_triplet_margin_distance_loss( + input, + positive, + negative, + 
distance_function=None, + margin=0.3, + swap=False, + reduction='mean', +): + distance_function = np.linalg.norm + positive_dist = distance_function((input - positive), 2, axis=1) + negative_dist = distance_function((input - negative), 2, axis=1) + + if swap: + swap_dist = np.linalg.norm((positive - negative), 2, axis=1) + negative_dist = np.minimum(negative_dist, swap_dist) + expected = np.maximum(positive_dist - negative_dist + margin, 0) + + if reduction == 'mean': + expected = np.mean(expected) + elif reduction == 'sum': + expected = np.sum(expected) + else: + expected = expected + + return expected + + +class TestTripletMarginWithDistanceLoss(unittest.TestCase): + + def test_TripletMarginDistanceLoss(self): + shape = (5, 5) + input = np.random.uniform(0.1, 0.8, size=shape).astype(np.float64) + positive = np.random.uniform(0, 2, size=shape).astype(np.float64) + negative = np.random.uniform(0, 2, size=shape).astype(np.float64) + + places = [paddle.CPUPlace()] + if paddle.device.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + reductions = ['sum', 'mean', 'none'] + for place in places: + for reduction in reductions: + expected = calc_triplet_margin_distance_loss( + input=input, + positive=positive, + negative=negative, + reduction=reduction) + + dy_result = test_dygraph( + place=place, + input=input, + positive=positive, + negative=negative, + reduction=reduction, + ) + + static_result = test_static( + place=place, + input_np=input, + positive_np=positive, + negative_np=negative, + reduction=reduction, + ) + self.assertTrue(np.allclose(static_result, expected)) + self.assertTrue(np.allclose(static_result, dy_result)) + self.assertTrue(np.allclose(dy_result, expected)) + static_functional = test_static(place=place, + input_np=input, + positive_np=positive, + negative_np=negative, + reduction=reduction, + functional=True) + dy_functional = test_dygraph(place=place, + input=input, + positive=positive, + negative=negative, + reduction=reduction, + functional=True) + self.assertTrue(np.allclose(static_functional, expected)) + self.assertTrue(np.allclose(static_functional, dy_functional)) + self.assertTrue(np.allclose(dy_functional, expected)) + + def test_TripletMarginDistanceLoss_error(self): + paddle.disable_static() + self.assertRaises(ValueError, + paddle.nn.TripletMarginWithDistanceLoss, + reduction="unsupport reduction") + input = paddle.to_tensor([[0.1, 0.3]], dtype='float32') + positive = paddle.to_tensor([[0.0, 1.0]], dtype='float32') + negative = paddle.to_tensor([[0.2, 0.1]], dtype='float32') + self.assertRaises( + ValueError, + paddle.nn.functional.triplet_margin_with_distance_loss, + input=input, + positive=positive, + negative=negative, + reduction="unsupport reduction") + paddle.enable_static() + + def test_TripletMarginDistanceLoss_distance_function(self): + + def distance_function_1(x1, x2): + return 1.0 - paddle.nn.functional.cosine_similarity(x1, x2) + + def distance_function_2(x1, x2): + return paddle.max(paddle.abs(x1 - x2), axis=1) + + distance_function_list = [distance_function_1, distance_function_2] + shape = (5, 5) + input = np.random.uniform(0.1, 0.8, size=shape).astype(np.float64) + positive = np.random.uniform(0, 2, size=shape).astype(np.float64) + negative = np.random.uniform(0, 2, size=shape).astype(np.float64) + + place = paddle.CPUPlace() + reduction = 'mean' + for distance_function in distance_function_list: + dy_result = test_dygraph( + place=place, + input=input, + positive=positive, + negative=negative, + distance_function=distance_function, 
+ reduction=reduction, + ) + + static_result = test_static( + place=place, + input_np=input, + positive_np=positive, + negative_np=negative, + distance_function=distance_function, + reduction=reduction, + ) + self.assertTrue(np.allclose(static_result, dy_result)) + static_functional = test_static(place=place, + input_np=input, + positive_np=positive, + negative_np=negative, + distance_function=distance_function, + reduction=reduction, + functional=True) + dy_functional = test_dygraph(place=place, + input=input, + positive=positive, + negative=negative, + distance_function=distance_function, + reduction=reduction, + functional=True) + self.assertTrue(np.allclose(static_functional, dy_functional)) + + def test_TripletMarginWithDistanceLoss_distance_funtion_error(self): + paddle.disable_static() + + def distance_function(x1, x2): + return -1.0 - paddle.nn.functional.cosine_similarity(x1, x2) + + func = distance_function + shape = (5, 5) + input = np.random.uniform(0.1, 0.8, size=shape).astype(np.float64) + positive = np.random.uniform(0, 2, size=shape).astype(np.float64) + negative = np.random.uniform(0, 2, size=shape).astype(np.float64) + + self.assertRaises( + ValueError, + paddle.nn.functional.triplet_margin_with_distance_loss, + input=input, + positive=positive, + negative=negative, + distance_function=func, + ) + paddle.enable_static() + + def test_TripletMarginDistanceLoss_dimension(self): + paddle.disable_static() + + input = paddle.to_tensor([[0.1, 0.3], [1, 2]], dtype='float32') + positive = paddle.to_tensor([[0.0, 1.0]], dtype='float32') + negative = paddle.to_tensor([[0.2, 0.1]], dtype='float32') + self.assertRaises( + ValueError, + paddle.nn.functional.triplet_margin_with_distance_loss, + input=input, + positive=positive, + negative=negative, + ) + triplet_margin_with_distance_loss = paddle.nn.loss.TripletMarginWithDistanceLoss( + ) + self.assertRaises( + ValueError, + triplet_margin_with_distance_loss, + input=input, + positive=positive, + negative=negative, + ) + paddle.enable_static() + + def test_TripletMarginWithDistanceLoss_swap(self): + reduction = 'mean' + place = paddle.CPUPlace() + shape = (5, 5) + input = np.random.uniform(0.1, 0.8, size=shape).astype(np.float64) + positive = np.random.uniform(0, 2, size=shape).astype(np.float64) + negative = np.random.uniform(0, 2, size=shape).astype(np.float64) + expected = calc_triplet_margin_distance_loss(input=input, + swap=True, + positive=positive, + negative=negative, + reduction=reduction) + + dy_result = test_dygraph( + place=place, + swap=True, + input=input, + positive=positive, + negative=negative, + reduction=reduction, + ) + + static_result = test_static( + place=place, + swap=True, + input_np=input, + positive_np=positive, + negative_np=negative, + reduction=reduction, + ) + self.assertTrue(np.allclose(static_result, expected)) + self.assertTrue(np.allclose(static_result, dy_result)) + self.assertTrue(np.allclose(dy_result, expected)) + static_functional = test_static(place=place, + swap=True, + input_np=input, + positive_np=positive, + negative_np=negative, + reduction=reduction, + functional=True) + dy_functional = test_dygraph(place=place, + swap=True, + input=input, + positive=positive, + negative=negative, + reduction=reduction, + functional=True) + self.assertTrue(np.allclose(static_functional, expected)) + self.assertTrue(np.allclose(static_functional, dy_functional)) + self.assertTrue(np.allclose(dy_functional, expected)) + + def test_TripletMarginWithDistanceLoss_margin(self): + paddle.disable_static() + + input 
= paddle.to_tensor([[0.1, 0.3]], dtype='float32') + positive = paddle.to_tensor([[0.0, 1.0]], dtype='float32') + negative = paddle.to_tensor([[0.2, 0.1]], dtype='float32') + margin = -0.5 + self.assertRaises( + ValueError, + paddle.nn.functional.triplet_margin_with_distance_loss, + margin=margin, + input=input, + positive=positive, + negative=negative, + ) + paddle.enable_static() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 54c800c875444..e66f310eb977d 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -1776,7 +1776,7 @@ def test_eager_tensor_grad_name_value(self): b = a**2 self.assertEqual(a._grad_value(), None) b.backward() - self.assertEqual('eager_tmp' in a._grad_name(), True) + self.assertEqual('eager_in_tmp' in a._grad_name(), True) self.assertNotEqual(a._grad_value(), None) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_amp_check_finite_and_scale_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_amp_check_finite_and_scale_op_xpu.py index 3ef4701cdf3d0..e6bc61b895abb 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_amp_check_finite_and_scale_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_amp_check_finite_and_scale_op_xpu.py @@ -19,84 +19,126 @@ import unittest import numpy as np from op_test_xpu import XPUOpTest -from op_test import OpTest, skip_check_grad_ci -import paddle.fluid as fluid +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() -class TestCheckFiniteAndUnscaleOp(XPUOpTest): - - def setUp(self): - self.op_type = "check_finite_and_unscale" - self.init_dtype() - x = np.random.random((1024, 1024)).astype(self.dtype) - scale = np.random.random((1)).astype(self.dtype) - # self.attrs = {'stop_gradient': True} - self.inputs = {'X': [('x0', x)], 'Scale': scale} - self.outputs = { - 'FoundInfinite': np.array([0]), - 'Out': [('out0', x / scale)], - } - - def init_dtype(self): - self.dtype = np.float32 - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - -# class TestCheckFiniteAndUnscaleOpWithNan(XPUOpTest): -# def setUp(self): -# self.op_type = "check_finite_and_unscale" -# self.init_dtype() -# x = np.random.random((1024, 1024)).astype(self.dtype) -# x[128][128] = np.nan -# print("x shape = ", x.shape) -# print(x) -# scale = np.random.random((1)).astype(self.dtype) - -# self.inputs = {'X': [('x0', x)], 'Scale': scale} -# self.outputs = { -# 'FoundInfinite': np.array([1]), -# 'Out': [('out0', x)], -# } - -# def init_dtype(self): -# self.dtype = np.float32 - -# def test_check_output(self): -# # When input contains nan, do not check the output, -# # since the output may be nondeterministic and will be discarded. 
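The expected outputs in the rewritten XPU test classes (and in the removed ones above) encode the operator's contract: every input tensor is divided by `Scale` and `FoundInfinite` stays 0 while all values are finite; a single nan/inf flips `FoundInfinite` to 1 and the outputs are no longer checked. A NumPy sketch of that reference behaviour as read from the test fixtures, not of the XPU kernel itself:

    import numpy as np


    def check_finite_and_unscale_ref(xs, scale):
        # Mirrors the 'Out' / 'FoundInfinite' fixtures used in the tests.
        found_infinite = np.array([0])
        outs = []
        for x in xs:
            if np.all(np.isfinite(x)):
                outs.append(x / scale)
            else:
                # Any nan/inf marks the step; 'Out' is then ignored by the test.
                found_infinite = np.array([1])
                outs.append(x)
        return outs, found_infinite


    x = np.random.random((8, 8)).astype(np.float32)
    scale = np.random.random((1, )).astype(np.float32)
    outs, flag = check_finite_and_unscale_ref([x], scale)
    assert flag[0] == 0 and np.allclose(outs[0], x / scale)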
-# if paddle.is_compiled_with_xpu(): -# place = paddle.XPUPlace(0) -# self.check_output_with_place(place, no_check_set=['Out']) - -# class TestCheckFiniteAndUnscaleOpWithInf(XPUOpTest): -# def setUp(self): -# self.op_type = "check_finite_and_unscale" -# self.init_dtype() -# x = np.random.random((1024, 1024)).astype(self.dtype) -# x[128][128] = np.inf -# scale = np.random.random((1)).astype(self.dtype) - -# self.inputs = {'X': [('x0', x)], 'Scale': scale} -# self.outputs = { -# 'FoundInfinite': np.array([1]), -# 'Out': [('out0', x)], -# } - -# def init_dtype(self): -# self.dtype = np.float32 - -# def test_check_output(self): -# # When input contains inf, do not check the output, -# # since the output may be nondeterministic and will be discarded. -# if paddle.is_compiled_with_xpu(): -# place = paddle.XPUPlace(0) -# self.check_output_with_place(place, no_check_set=['Out']) +class XPUTestCheckFiniteAndUnscaleOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'check_finite_and_unscale' + self.use_dynamic_create_class = False + + class TestCheckFiniteAndUnscaleOpNormal(XPUOpTest): + + def setUp(self): + self.op_type = "check_finite_and_unscale" + self.init_dtype() + x = np.random.random((8, 8)).astype(self.dtype) + scale = np.random.random((1)).astype(np.float32) + self.inputs = {'X': [('x0', x)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([0]), + 'Out': [('out0', x / scale)], + } + + def init_dtype(self): + self.dtype = self.in_type + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + class TestCheckFiniteAndUnscaleOpWithNan(XPUOpTest): + + def setUp(self): + self.op_type = "check_finite_and_unscale" + self.init_dtype() + x = np.random.random((256, 256)).astype(self.dtype) + idx1 = np.random.randint(255) + idx2 = np.random.randint(255) + x[idx1][idx2] = np.nan + x[idx2][idx1] = np.nan + scale = np.random.random((1)).astype(np.float32) + + self.inputs = {'X': [('x0', x)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([1]), + 'Out': [('out0', x)], + } + + def init_dtype(self): + self.dtype = self.in_type + + def test_check_output(self): + # When input contains nan, do not check the output, + # since the output may be nondeterministic and will be discarded. + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, no_check_set=['Out']) + + class TestCheckFiniteAndUnscaleOpWithInf(XPUOpTest): + + def setUp(self): + self.op_type = "check_finite_and_unscale" + self.init_dtype() + x = np.random.random((256, 256)).astype(self.dtype) + idx1 = np.random.randint(255) + idx2 = np.random.randint(255) + x[idx1][idx2] = np.nan + x[idx2][idx1] = np.nan + scale = np.random.random((1)).astype(np.float32) + myscale = np.array([0.05]).astype(self.dtype) + self.inputs = {'X': [('x0', x)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([1]), + 'Out': [('out0', x)], + } + + def init_dtype(self): + self.dtype = self.in_type + + def test_check_output(self): + # When input contains inf, do not check the output, + # since the output may be nondeterministic and will be discarded. 
+ if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, no_check_set=['Out']) + + class TestCheckFiniteAndUnscaleOpWithInfAndNan(XPUOpTest): + + def setUp(self): + self.op_type = "check_finite_and_unscale" + self.init_dtype() + x = np.random.random((256, 256)).astype(self.dtype) + idx1 = np.random.randint(255) + idx2 = np.random.randint(255) + x[idx1][idx2] = np.inf + x[idx2][idx1] = np.nan + scale = np.random.random((1)).astype(np.float32) + myscale = np.array([0.05]).astype(self.dtype) + self.inputs = {'X': [('x0', x)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([1]), + 'Out': [('out0', x)], + } + + def init_dtype(self): + self.dtype = self.in_type + + def test_check_output(self): + # When input contains inf, do not check the output, + # since the output may be nondeterministic and will be discarded. + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, no_check_set=['Out']) + + +support_types = get_xpu_op_support_types('check_finite_and_unscale') +for stype in support_types: + create_test_class(globals(), XPUTestCheckFiniteAndUnscaleOp, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py index 3029c3a294a00..d283eae392fb3 100644 --- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py +++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os from paddle.fluid import framework, core, layers, unique_name from paddle.fluid.framework import Variable from paddle.fluid.clip import ClipGradByGlobalNorm @@ -19,11 +20,69 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.optimizer import Optimizer from paddle.distributed import get_rank, get_world_size +from paddle.distributed.collective import new_group from paddle.fluid.executor import global_scope from paddle.fluid.framework import name_scope +from paddle.fluid import core, unique_name import numpy as np +def init_communicator(block, rank, ranks, ring_id): + eps = os.environ['PADDLE_TRAINER_ENDPOINTS'] + eps = [ep.strip() for ep in eps.split(",") if ep.strip()] + cur_ep = eps[rank] + other_eps = [eps[r] for r in ranks if r != rank] + + local_rank = ranks.index(rank) + comm_var_name = unique_name.generate('comm_id') + comm_id_var = block.create_var(name=comm_var_name, + persistable=True, + type=core.VarDesc.VarType.RAW) + block.append_op(type='c_gen_nccl_id', + inputs={}, + outputs={'Out': comm_id_var}, + attrs={ + 'rank': local_rank, + 'endpoint': cur_ep, + 'other_endpoints': other_eps, + 'ring_id': ring_id + }) + block.append_op(type='c_comm_init', + inputs={'X': comm_id_var}, + outputs={}, + attrs={ + 'nranks': len(ranks), + 'rank': local_rank, + 'ring_id': ring_id + }) + tmp_var = block.create_var(name=unique_name.generate('tmp')) + block.append_op(type='fill_constant', + outputs={'Out': tmp_var}, + attrs={'value': 1}) + block.append_op(type='c_allreduce_sum', + inputs={'X': tmp_var}, + outputs={'Out': tmp_var}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True + }) + block.append_op(type='c_sync_calc_stream', + inputs={'X': tmp_var}, + outputs={'Out': tmp_var}) + return ring_id + + +def broadcast_parameters(block, parameters, ring_id): + for p in parameters: + block.append_op(type='c_broadcast', + inputs={'X': p}, + 
outputs={'Out': p}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True + }) + + class DistributedFusedLamb(Optimizer): def __init__(self, @@ -41,6 +100,7 @@ def __init__(self, use_master_param_norm=True, gradient_accumulation_steps=1, use_master_acc_grad=True, + nproc_per_node=None, name=None): assert not framework._non_static_mode( ), "DistributedFusedLamb does not support dygraph mode" @@ -65,10 +125,10 @@ def __init__(self, self._is_grad_scaled_by_nranks = is_grad_scaled_by_nranks self._exclude_from_weight_decay_fn = exclude_from_weight_decay_fn self._scale = None - self._ring_id = 0 self._use_master_param_norm = use_master_param_norm self._gradient_accumulation_steps = gradient_accumulation_steps self._use_master_acc_grad = use_master_acc_grad + self._nproc_per_node = nproc_per_node assert self._gradient_accumulation_steps >= 1 self.helper = LayerHelper('distributed_fused_lamb') @@ -228,6 +288,30 @@ def _apply_gradients_impl(self, params_grads): rank = get_rank() nranks = get_world_size() + if self._nproc_per_node is None: + nproc_per_node = nranks + else: + nproc_per_node = self._nproc_per_node + assert nranks % nproc_per_node == 0, "nranks should be exactly divided by nproc_per_node" + + shard_inside_node = (nranks > nproc_per_node) + local_rank = rank % nproc_per_node + node_id = int(rank / nproc_per_node) + node_num = int(nranks / nproc_per_node) + ring_ids = [] + startup_block = self.helper.startup_program.global_block() + if nranks > 1: + ring_id = init_communicator(startup_block, rank, + list(range(nranks)), 0) + ring_ids.append(ring_id) + + if node_num > 1 and len(ring_ids) <= 1 and shard_inside_node: + local_group_ranks = list( + range(node_id * nproc_per_node, (node_id + 1) * nproc_per_node)) + ring_id = init_communicator(startup_block, rank, local_group_ranks, + 1) + ring_ids.append(ring_id) + scale = self._get_or_create_scale() params = [p for p, _ in params_grads] @@ -238,7 +322,6 @@ def _apply_gradients_impl(self, params_grads): if self._exclude_from_weight_decay_fn(p): apply_weight_decay[i] = 0 - startup_block = self.helper.startup_program.global_block() for g in grads: startup_block.create_var(name=g.name, type=g.type, @@ -246,46 +329,45 @@ def _apply_gradients_impl(self, params_grads): persistable=g.persistable, shape=g.shape) - startup_block.append_op(type='distributed_fused_lamb_init', - inputs={ - 'Param': params, - 'Grad': grads, - }, - outputs={ - 'FP32FusedParam': [fp32_fused_param], - 'FP32FusedGrad': [fp32_fused_grad], - 'FP16FusedParam': [fp16_fused_param], - 'FP16FusedGrad': [fp16_fused_grad], - 'Moment1': [moment1], - 'Moment2': [moment2], - 'Beta1Pow': [beta1pow], - 'Beta2Pow': [beta2pow], - 'GlobalScale': [scale], - 'ParamInfo': [param_info], - 'ParamOut': - params, - 'MasterParamOut': - master_params, - 'GradOut': - grads, - 'FP32ShardFusedParamOffsets': - [fp32_partial_fused_offsets], - 'FP16ShardFusedParamOffsets': - [fp16_partial_fused_offsets], - 'FusedParamOffsets': [fused_offsets], - 'ParamOrder': [param_order], - 'Step': [step], - }, - attrs={ - 'alignment': self._alignment, - 'rank': rank, - 'nranks': nranks, - 'apply_weight_decay': apply_weight_decay, - 'moment1': 0.0, - 'moment2': 0.0, - 'beta1': self._beta1, - 'beta2': self._beta2, - }) + if nranks > 1: + broadcast_parameters(startup_block, params, ring_ids[0]) + + startup_block.append_op( + type='distributed_fused_lamb_init', + inputs={ + 'Param': params, + 'Grad': grads, + }, + outputs={ + 'FP32FusedParam': [fp32_fused_param], + 'FP32FusedGrad': [fp32_fused_grad], + 'FP16FusedParam': 
[fp16_fused_param], + 'FP16FusedGrad': [fp16_fused_grad], + 'Moment1': [moment1], + 'Moment2': [moment2], + 'Beta1Pow': [beta1pow], + 'Beta2Pow': [beta2pow], + 'GlobalScale': [scale], + 'ParamInfo': [param_info], + 'ParamOut': params, + 'MasterParamOut': master_params, + 'GradOut': grads, + 'FP32ShardFusedParamOffsets': [fp32_partial_fused_offsets], + 'FP16ShardFusedParamOffsets': [fp16_partial_fused_offsets], + 'FusedParamOffsets': [fused_offsets], + 'ParamOrder': [param_order], + 'Step': [step], + }, + attrs={ + 'alignment': self._alignment, + 'rank': local_rank if shard_inside_node else rank, + 'nranks': nproc_per_node if shard_inside_node else nranks, + 'apply_weight_decay': apply_weight_decay, + 'moment1': 0.0, + 'moment2': 0.0, + 'beta1': self._beta1, + 'beta2': self._beta2, + }) main_block = self.helper.main_program.global_block() self._create_global_learning_rate() @@ -351,7 +433,8 @@ def _apply_gradients_impl(self, params_grads): 'max_global_grad_norm': self._max_global_grad_norm, 'clip_after_allreduce': self._clip_after_allreduce, 'rank': rank, - 'ring_id': self._ring_id, + 'nranks': nranks, + 'ring_id': ring_ids, 'use_master_param_norm': self._use_master_param_norm, 'is_grad_scaled_by_nranks': self._is_grad_scaled_by_nranks, 'acc_steps': self._gradient_accumulation_steps, diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 3e71a39e7a694..8b29659a1f400 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -108,6 +108,7 @@ from .layer.loss import SmoothL1Loss # noqa: F401 from .layer.loss import HingeEmbeddingLoss # noqa: F401 from .layer.loss import CosineEmbeddingLoss # noqa: F401 +from .layer.loss import TripletMarginWithDistanceLoss from .layer.loss import TripletMarginLoss from .layer.norm import BatchNorm # noqa: F401 from .layer.norm import SyncBatchNorm # noqa: F401 @@ -315,5 +316,6 @@ def weight_norm(*args): 'Identity', 'CosineEmbeddingLoss', 'RReLU', + 'TripletMarginWithDistanceLoss', 'TripletMarginLoss', ] diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 826537a3d7bdc..cdb1135eba800 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -91,6 +91,7 @@ from .loss import ctc_loss # noqa: F401 from .loss import hinge_embedding_loss # noqa: F401 from .loss import cosine_embedding_loss # noqa: F401 +from .loss import triplet_margin_with_distance_loss from .loss import triplet_margin_loss from .norm import batch_norm # noqa: F401 from .norm import instance_norm # noqa: F401 @@ -233,5 +234,6 @@ 'fold', 'cosine_embedding_loss', 'rrelu', + 'triplet_margin_with_distance_loss', 'triplet_margin_loss', ] diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index f1d66a9e3a1b5..d8dc68376d163 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -143,7 +143,7 @@ def _conv_nd(x, if in_dygraph_mode() and op_type == "depthwise_conv2d": pre_bias = _C_ops.final_state_depthwise_conv2d( x, weight, stride, padding, padding_algorithm, groups, dilation, - data_format, False, -1, False, False) + data_format, False, -1, False, False, use_cudnn) if bias is not None: channel_dim = channel_dim + len( x.shape) if channel_dim < 0 else channel_dim diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 81427928d42ef..2f37f8a50f4d1 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -2874,6 +2874,133 @@ 
def cosine_embedding_loss(input1, return paddle.sum(out, name=name) +def triplet_margin_with_distance_loss(input, + positive, + negative, + distance_function=None, + margin=1.0, + swap=False, + reduction='mean', + name=None): + r""" + Measures the triplet loss given an input + tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater than :math:`0`. + This is used for measuring a relative similarity between samples. A triplet + is composed by `input`, `positive` and `negative` (i.e., `input`, `positive examples` and `negative + examples` respectively). The shapes of all input tensors should be + :math:`(N, D)`. + + The loss function for each sample in the mini-batch is: + + .. math:: + L(input, pos, neg) = \max \{d(input_i, pos_i) - d(input_i, neg_i) + {\rm margin}, 0\} + + + where the default distance function + + .. math:: + d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_p + + or user can defined their own distance functions. `margin` is a nonnegative margin representing the minimum difference + between the positive and negative distances that is required for the loss to be 0. If `swap` is true, it will compare distance of (input, negative) with + distance of (negative, positive) and change it to the smaller one. For more details see http://www.bmva.org/bmvc/2016/papers/paper119/paper119.pdf. + + Parameters: + + input (Tensor):Input tensor, the data type is float32 or float64. + the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions, available dtype is float32, float64. + + positive (Tensor):Positive tensor, the data type is float32 or float64. + The shape of label is the same as the shape of input. + + negative (Tensor):Negative tensor, the data type is float32 or float64. + The shape of label is the same as the shape of input. + + distance_function (callable, optional): Quantifies the distance between two tensors. if not specified, 2 norm functions will be used. + + margin (float, optional):Default: :math:`1`.A nonnegative margin representing the minimum difference + between the positive and negative distances required for the loss to be 0. + + swap (bool, optional):The distance swap changes the negative distance to the swap distance (distance between positive samples + and negative samples) if swap distance smaller than negative distance. Default: ``False``. + + reduction (str, optional):Indicate how to average the loss by batch_size. + the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + If :attr:`reduction` is ``'none'``, the unreduced loss is returned; + If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; + If :attr:`reduction` is ``'sum'``, the summed loss is returned. + Default: ``'mean'`` + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Output: Tensor. The tensor variable storing the triplet_margin_with_distance_loss of input and positive and negative. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + input = paddle.to_tensor([[1, 5, 3], [0, 3, 2], [1, 4, 1]], dtype=paddle.float32) + positive= paddle.to_tensor([[5, 1, 2], [3, 2, 1], [3, -1, 1]], dtype=paddle.float32) + negative = paddle.to_tensor([[2, 1, -3], [1, 1, -1], [4, -2, 1]], dtype=paddle.float32) + loss = F.triplet_margin_with_distance_loss(input, positive, negative, margin=1.0, reduction='none') + print(loss) + # Tensor([0. , 0.57496738, 0. 
]) + + + loss = F.triplet_margin_with_distance_loss(input, positive, negative, margin=1.0, reduction='mean') + print(loss) + # Tensor([0.19165580]) + + """ + if reduction not in ['sum', 'mean', 'none']: + raise ValueError("'reduction' in 'triplet_margin_with_distance_loss' " + "should be 'sum', 'mean' or 'none', " + "but received {}.".format(reduction)) + if margin < 0: + raise ValueError( + "The margin between positive samples and negative samples should not be a negative number." + ) + if not _non_static_mode(): + check_variable_and_dtype(input, 'input', ['float32', 'float64'], + 'triplet_margin_with_distance_loss') + check_variable_and_dtype(positive, 'positive', ['float32', 'float64'], + 'triplet_margin_with_distance_loss') + check_variable_and_dtype(negative, 'negative', ['float32', 'float64'], + 'triplet_margin_with_distance_loss') + + if not (input.shape == positive.shape == negative.shape): + raise ValueError("input's shape must be equal to " + "positive's shape and " + "negative's shape") + + distance_function = distance_function if distance_function is not None \ + else paddle.nn.PairwiseDistance(2) + + positive_dist = distance_function(input, positive) + negative_dist = distance_function(input, negative) + + if swap: + swap_dist = distance_function(positive, negative) + negative_dist = paddle.minimum(negative_dist, swap_dist) + + if not paddle.all(positive_dist > 0) or not paddle.all(negative_dist > 0): + raise ValueError( + "The positive distance and the negative distance should be greater than 0. " + "The distance function should be checked.") + + loss = paddle.clip(positive_dist - negative_dist + margin, min=0.0) + + if reduction == 'mean': + return paddle.mean(loss, name=name) + elif reduction == 'sum': + return paddle.sum(loss, name=name) + elif reduction == 'none': + return loss + + + def triplet_margin_loss(input, positive, negative, diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 66f84c3fd4870..e9ccee1bd3829 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -79,6 +79,7 @@ from .loss import CTCLoss # noqa: F401 from .loss import SmoothL1Loss # noqa: F401 from .loss import HingeEmbeddingLoss # noqa: F401 +from .loss import TripletMarginWithDistanceLoss from .loss import TripletMarginLoss from .norm import BatchNorm1D # noqa: F401 from .norm import BatchNorm2D # noqa: F401 diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 2b14cb22228a9..6ef82ccfc4139 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -1402,6 +1402,114 @@ def forward(self, input1, input2, label): name=self.name) + +class TripletMarginWithDistanceLoss(Layer): + r""" + Creates a criterion that measures the triplet loss given input + tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater than :math:`0`. + This is used for measuring a relative similarity between samples. A triplet + is composed of `input`, `positive` and `negative` (i.e., `input`, `positive examples` and `negative + examples` respectively). The shapes of all input tensors should be + :math:`(N, D)`. + + The loss function for each sample in the mini-batch is: + + .. math:: + L(input, pos, neg) = \max \{d(input_i, pos_i) - d(input_i, neg_i) + {\rm margin}, 0\} + + where the default `distance_function` is + + .. math:: + d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_2 + + Alternatively, a user-defined distance function can be used.
`margin` is a nonnegative margin representing the minimum difference + between the positive and negative distances that is required for the loss to be 0. If `swap` is True, it will compare the distance of (input, negative) with + the distance of (negative, positive) and change it to the smaller one. For more details see http://www.bmva.org/bmvc/2016/papers/paper119/paper119.pdf. + + Parameters: + distance_function (callable, optional): Quantifies the distance between two tensors. If not specified, the 2-norm distance will be used. + + margin (float, optional): A nonnegative margin representing the minimum difference + between the positive and negative distances required for the loss to be 0. Larger + margins penalize cases where the negative examples are not distant enough from the + anchors, relative to the positives. Default: :math:`1`. + + swap (bool, optional): Whether to replace the negative distance with the swap distance (the distance between positive samples + and negative samples) if the swap distance is smaller than the negative distance. Default: ``False``. + + reduction (str, optional): Indicate how to average the loss by batch_size. + The candidates are ``'none'`` | ``'mean'`` | ``'sum'``. + If :attr:`reduction` is ``'none'``, the unreduced loss is returned; + If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; + If :attr:`reduction` is ``'sum'``, the summed loss is returned. + Default: ``'mean'`` + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shapes: + input (Tensor): Input tensor, the data type is float32 or float64. + The shape is [N, \*], where N is the batch size and `\*` means any number of additional dimensions. + + positive (Tensor): Positive tensor, the data type is float32 or float64. + The shape of positive is the same as the shape of input. + + negative (Tensor): Negative tensor, the data type is float32 or float64. + The shape of negative is the same as the shape of input. + + output (Tensor): The tensor variable storing the triplet_margin_with_distance_loss of input, positive and negative. + + Return: + A callable object of TripletMarginWithDistanceLoss + + Examples: + .. code-block:: python + + import paddle + from paddle.nn import TripletMarginWithDistanceLoss + + input = paddle.to_tensor([[1, 5, 3], [0, 3, 2], [1, 4, 1]], dtype=paddle.float32) + positive = paddle.to_tensor([[5, 1, 2], [3, 2, 1], [3, -1, 1]], dtype=paddle.float32) + negative = paddle.to_tensor([[2, 1, -3], [1, 1, -1], [4, -2, 1]], dtype=paddle.float32) + triplet_margin_with_distance_loss = TripletMarginWithDistanceLoss(reduction='none') + loss = triplet_margin_with_distance_loss(input, positive, negative) + print(loss) + # Tensor([0. , 0.57496738, 0. ]) + + triplet_margin_with_distance_loss = TripletMarginWithDistanceLoss(reduction='mean') + loss = triplet_margin_with_distance_loss(input, positive, negative) + print(loss) + # Tensor([0.19165580]) + + """ + + def __init__(self, + distance_function=None, + margin=1.0, + swap=False, + reduction: str = 'mean', + name=None): + super(TripletMarginWithDistanceLoss, self).__init__() + if reduction not in ['sum', 'mean', 'none']: + raise ValueError( + "The value of 'reduction' in TripletMarginWithDistanceLoss " + "should be 'sum', 'mean' or 'none', but " + "received %s, which is not allowed."
% reduction) + self.margin = margin + self.swap = swap + self.reduction = reduction + self.distance_function = distance_function + self.name = name + + def forward(self, input, positive, negative): + return F.triplet_margin_with_distance_loss( + input, + positive, + negative, + distance_function=self.distance_function, + margin=self.margin, + swap=self.swap, + reduction=self.reduction, + name=self.name) + + class TripletMarginLoss(Layer): r""" Creates a criterion that measures the triplet loss given an input @@ -1461,17 +1569,18 @@ class TripletMarginLoss(Layer): .. code-block:: python import paddle + from paddle.nn import TripletMarginLoss input = paddle.to_tensor([[1, 5, 3], [0, 3, 2], [1, 4, 1]], dtype=paddle.float32) positive= paddle.to_tensor([[5, 1, 2], [3, 2, 1], [3, -1, 1]], dtype=paddle.float32) negative = paddle.to_tensor([[2, 1, -3], [1, 1, -1], [4, -2, 1]], dtype=paddle.float32) - triplet_margin_loss = paddle.nn.TripletMarginLoss(reduction='none') - loss = triplet_margin_loss(input, positive, negative) + triplet_margin_loss = TripletMarginLoss(reduction='none') + loss = triplet_margin_loss(input, positive, negative) print(loss) # Tensor([0. , 0.57496738, 0. ]) - triplet_margin_loss = paddle.nn.TripletMarginLoss( margin=1.0, swap=True, reduction='mean', ) - loss = triplet_margin_loss(input, positive, negative,) + triplet_margin_loss = TripletMarginLoss(margin=1.0, swap=True, reduction='mean') + loss = triplet_margin_loss(input, positive, negative) print(loss) # Tensor([0.19165580]) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index ec367c7c710ed..4534c39b0082a 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -1032,7 +1032,9 @@ def _create_regularization_of_grad(self, param, grad, regularization=None): assert regularization_term is not None if framework.in_dygraph_mode(): - return _C_ops.final_state_add_n([grad, regularization_term]) + if grad.is_dense() and regularization_term.is_dense(): + return _C_ops.final_state_add_n([grad, regularization_term]) + return _C_ops.sum([grad, regularization_term]) elif framework._in_legacy_dygraph(): return _C_ops.sum([grad, regularization_term]) diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 3ea3ba4982599..08b0af26bd46e 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -139,6 +139,7 @@ from .math import cosh # noqa: F401 from .math import cumsum # noqa: F401 from .math import cumprod # noqa: F401 +from .math import logcumsumexp # noqa: F401 from .math import logit # noqa: F401 from .math import exp # noqa: F401 from .math import exp_ # noqa: F401 @@ -310,6 +311,7 @@ 'cosh', 'cumsum', 'cumprod', + 'logcumsumexp', 'logit', 'exp', 'exp_', diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 2ef324395b26a..ffca233ff16bf 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1377,6 +1377,9 @@ def add_n(inputs, name=None): if in_dygraph_mode(): if isinstance(inputs, Variable): inputs = [inputs] + for x in inputs: + if not x.is_dense(): + return _C_ops.sum(inputs, 'use_mkldnn', False) return _C_ops.final_state_add_n(inputs) if _in_legacy_dygraph(): if isinstance(inputs, Variable): @@ -2909,7 +2912,7 @@ def cumsum(x, axis=None, dtype=None, name=None): The cumulative sum of the elements along a given axis. **Note**: - The first element of the result is the same of the first element of the input. + The first element of the result is the same as the first element of the input. Args: x (Tensor): The input tensor needed to be cumsumed.
@@ -2970,6 +2973,79 @@ def cumsum(x, axis=None, dtype=None, name=None): _cum_sum_ = generate_layer_fn('cumsum') return _cum_sum_(**kwargs) + +def logcumsumexp(x, axis=None, dtype=None, name=None): + r""" + The logarithm of the cumulative sum of the exponentials of the elements along a given axis. + + For the index j along `axis` (with i denoting the remaining indices), the result is + + .. math:: + + logcumsumexp(x)_{ij} = \log \sum_{k=0}^{j} \exp(x_{ik}) + + Note: + The first element of the result is the same as the first element of the input. + + Args: + x (Tensor): The input tensor. + axis (int, optional): The dimension to do the operation along. -1 means the last dimension. The default (None) is to compute the logcumsumexp over the flattened array. + dtype (str, optional): The data type of the output tensor, can be float32 or float64. If specified, the input tensor is cast to dtype before the operation is performed. This is useful for preventing data type overflows. The default value is None. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor, the result of the logcumsumexp operator. + + Examples: + .. code-block:: python + + import paddle + + data = paddle.arange(12, dtype='float64') + data = paddle.reshape(data, (3, 4)) + + y = paddle.logcumsumexp(data) + # [ 0. 1.3132617 2.4076061 3.4401898 4.4519143 5.4561934 + # 6.4577627 7.4583397 8.458551 9.45863 10.458658 11.458669 ] + + y = paddle.logcumsumexp(data, axis=0) + # [[ 0. 1. 2. 3. ] + # [ 4.01815 5.01815 6.01815 7.01815 ] + # [ 8.018479 9.018479 10.018479 11.018479]] + + y = paddle.logcumsumexp(data, axis=-1) + # [[ 0. 1.3132617 2.4076061 3.4401898] + # [ 4. 5.3132615 6.407606 7.44019 ] + # [ 8. 9.313262 10.407606 11.440189 ]] + + y = paddle.logcumsumexp(data, dtype='float64') + print(y.dtype) + # paddle.float64 + """ + if axis is None: + flatten = True + else: + flatten = False + if dtype is not None and x.dtype != convert_np_dtype_to_dtype_(dtype): + x = cast(x, dtype) + + if in_dygraph_mode(): + if axis is None: axis = -1 + return _C_ops.final_state_logcumsumexp(x, axis, flatten, False, False) + if _in_legacy_dygraph(): + if axis is None: + return _C_ops.logcumsumexp(x, 'flatten', flatten) + else: + return _C_ops.logcumsumexp(x, 'axis', axis, 'flatten', flatten) + + check_variable_and_dtype(x, 'x', ['float32', 'float64'], "logcumsumexp") + + helper = LayerHelper('logcumsumexp', **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op(type='logcumsumexp', inputs={'X': x}, outputs={'Out': out}, attrs={'axis': axis, 'flatten': flatten}) + return out + + def cumprod(x, dim=None, dtype=None, name=None): """ Compute the cumulative product of the input tensor x along a given dimension dim.
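As a rough illustration of what the new `logcumsumexp` operator above computes along one axis (a minimal sketch in plain Python, not the Paddle kernel; the helper name `logcumsumexp_1d` is made up for this note), the cumulative value can be accumulated with the usual running-maximum trick so that exp() never overflows:

    import math

    def logcumsumexp_1d(xs):
        # Running log-sum-exp: log(exp(a) + exp(b)) = m + log(exp(a - m) + exp(b - m)),
        # with m = max(a, b), so exp() is only ever applied to non-positive arguments.
        out = []
        running = None
        for x in xs:
            if running is None:
                running = float(x)
            else:
                m = max(running, x)
                running = m + math.log(math.exp(running - m) + math.exp(x - m))
            out.append(running)
        return out

    print(logcumsumexp_1d([0.0, 1.0, 2.0, 3.0]))
    # [0.0, 1.3132616..., 2.4076059..., 3.4401896...]
    # matches the first row of the axis=-1 example in the docstring above

The same recurrence applied row by row reproduces the other rows of that example; the Paddle kernel will differ in implementation details but should produce the same values.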
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index fd000567c507b..2b8cff3543e76 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -482,7 +482,7 @@ args : (Tensor x, int axis, bool flatten, bool exclusive, bool reverse) output : Tensor(out) infer_meta : - func : CumsumInferMeta + func : CumInferMeta kernel : func : cumsum backward : cumsum_grad @@ -499,15 +499,16 @@ backward : deformable_conv_grad - api : depthwise_conv2d - args : (Tensor x, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) + args : (Tensor x, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) output : Tensor(out) - invoke : conv2d_impl(x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search) + infer_meta : + func : ConvInferMeta + param : [x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search] + kernel : + func : depthwise_conv2d + param : [x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, fuse_relu] + use_gpudnn : use_gpudnn backward : depthwise_conv2d_grad - # infer_meta : - # func : ConvTransposeInferMeta - # prams: [x, filter, strides, paddings, padding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search] - # kernel : - # func : depthwise_conv2d - api : depthwise_conv2d_transpose args : (Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) @@ -1259,6 +1260,15 @@ func : log_softmax backward : log_softmax_grad +- api : logcumsumexp + args : (Tensor x, int axis, bool flatten, bool exclusive, bool reverse) + output : Tensor(out) + infer_meta : + func : CumInferMeta + kernel : + func : logcumsumexp + backward : logcumsumexp_grad + # logical_and - api : logical_and args : (Tensor x, Tensor y) diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 81641ac19f7b5..8e20b05110e71 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -495,22 +495,27 @@ optional : mask - backward_api : depthwise_conv2d_grad - forward : depthwise_conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) -> Tensor(out) - args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) + forward : depthwise_conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) -> Tensor(out) + args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, 
int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) output : Tensor(input_grad), Tensor(filter_grad) - invoke : conv2d_grad_impl(input, filter, out_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, input_grad, filter_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [input, filter] + kernel : + func : depthwise_conv2d_grad + param : [input, filter, out_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, fuse_relu] + use_gpudnn : use_gpudnn backward : depthwise_conv2d_grad_grad - backward_api : depthwise_conv2d_grad_grad - forward : depthwise_conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) -> Tensor(grad_input), Tensor(grad_filter) - args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) + forward : depthwise_conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) -> Tensor(grad_input), Tensor(grad_filter) + args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) infer_meta : func : GeneralTernaryGradInferMeta param: [input, filter, grad_out] kernel : - func : conv2d_grad_grad - use_gpudnn : true + func : depthwise_conv2d_grad_grad optional : grad_input_grad, grad_filter_grad - backward_api : depthwise_conv2d_transpose_grad @@ -1137,6 +1142,16 @@ kernel : func : log_softmax_grad +- backward_api : logcumsumexp_grad + forward : logcumsumexp(Tensor x, int axis, bool flatten, bool exclusive, bool reverse) -> Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] + args : (Tensor x, Tensor out, Tensor out_grad, int axis, bool flatten, bool exclusive, bool reverse) + output : Tensor(x_grad) + kernel : + func : logcumsumexp_grad + - backward_api : logit_grad forward : logit (Tensor x, float eps = 1e-6f) -> Tensor(out) args : (Tensor x, Tensor out_grad, float eps) diff --git a/tools/codestyle/.cmakelintrc b/tools/codestyle/.cmakelintrc index 6c5fe30276fc6..4e8463cccdca2 100644 --- a/tools/codestyle/.cmakelintrc +++ b/tools/codestyle/.cmakelintrc @@ -1 +1 @@ -filter=-readability/wonkycase,-syntax,-convention/filename,-package/stdargs,-whitespace/indent +filter=-readability/wonkycase,-syntax,-convention/filename,-package/stdargs,-whitespace/indent,-whitespace/extra,-linelength,-readability/mixedcase
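Since the `triplet_margin_with_distance_loss` API added in this diff accepts any callable as `distance_function`, a short usage sketch with a user-defined distance follows; the tensors, the `anchor` naming, and the `l1_distance` helper are hypothetical, and only the loss API itself comes from the diff:

    import paddle
    import paddle.nn.functional as F

    def l1_distance(x, y):
        # User-defined distance: row-wise L1 distance between two (N, D) tensors.
        return paddle.sum(paddle.abs(x - y), axis=-1)

    anchor = paddle.to_tensor([[1., 5., 3.], [0., 3., 2.]])
    positive = paddle.to_tensor([[5., 1., 2.], [3., 2., 1.]])
    negative = paddle.to_tensor([[2., 1., -3.], [1., 1., -1.]])

    loss = F.triplet_margin_with_distance_loss(
        anchor, positive, negative,
        distance_function=l1_distance,
        margin=1.0, swap=True, reduction='mean')
    print(loss)  # scalar mean of the per-triplet hinge losses

The callable only has to map two (N, D) tensors to an (N,) tensor of positive distances (the function raises a ValueError if any distance is not greater than 0); with `swap=True` the smaller of d(anchor, negative) and d(positive, negative) is used, as described in the docstring above.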