diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index dfe1e3848ee5a..4e1b2ecf3671a 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,7 +32,7 @@ repos: description: Format files with ClangFormat. entry: bash ./tools/codestyle/clang_format.hook -i language: system - files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps)$ + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|xpu|kps)$ exclude: | (?x)^( paddle/fluid/distributed/ps/thirdparty/round_robin.h @@ -81,64 +81,3 @@ repos: - id: cmakelint args: [--config=./tools/codestyle/.cmakelintrc] # exclude files which need to be fixed - exclude: | - (?x)^( - CMakeLists.txt| - python/paddle/fluid/tests/unittests/CMakeLists.txt| - paddle/fluid/inference/tests/infer_ut/CMakeLists.txt| - cmake/configure.cmake| - paddle/fluid/inference/api/demo_ci/CMakeLists.txt| - cmake/flags.cmake| - cmake/inference_lib.cmake| - cmake/external/protobuf.cmake| - paddle/fluid/framework/fleet/CMakeLists.txt| - paddle/fluid/inference/CMakeLists.txt| - paddle/fluid/inference/tests/api/CMakeLists.txt| - paddle/fluid/operators/CMakeLists.txt| - cmake/external/lite.cmake| - cmake/external/poplar.cmake| - cmake/python_module.cmake| - python/paddle/fluid/tests/unittests/asp/CMakeLists.txt| - cmake/cuda.cmake| - cmake/FindNumPy.cmake| - cmake/coveralls.cmake| - cmake/external/glog.cmake| - cmake/external/onnxruntime.cmake| - cmake/external/openblas.cmake| - cmake/external/xpu.cmake| - cmake/hip.cmake| - paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt| - paddle/fluid/inference/api/CMakeLists.txt| - paddle/fluid/operators/controlflow/CMakeLists.txt| - python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt| - cmake/operators.cmake| - cmake/tensorrt.cmake| - paddle/fluid/inference/api/details/CMakeLists.txt| - python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt| - cmake/external/arm_brpc.cmake| - cmake/external/concurrentqueue.cmake| - cmake/external/eigen.cmake| - cmake/external/mklml.cmake| - cmake/external/paddle2onnx.cmake| - cmake/miopen.cmake| - cmake/nccl.cmake| - cmake/simd.cmake| - paddle/fluid/inference/analysis/CMakeLists.txt| - paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake| - paddle/fluid/memory/allocation/CMakeLists.txt| - paddle/fluid/memory/CMakeLists.txt| - paddle/fluid/operators/cinn/CMakeLists.txt| - paddle/infrt/external_kernels/CMakeLists.txt| - paddle/infrt/kernel/phi/CMakeLists.txt| - python/paddle/fluid/contrib/slim/tests/CMakeLists.txt| - python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt| - python/paddle/fluid/tests/unittests/distribution/CMakeLists.txt| - python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt| - python/paddle/fluid/tests/unittests/fft/CMakeLists.txt| - python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt| - python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt| - python/paddle/fluid/tests/unittests/npu/CMakeLists.txt| - python/paddle/fluid/tests/unittests/ps/CMakeLists.txt| - python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt| - python/paddle/fluid/tests/unittests/sequence/CMakeLists.txt - )$ diff --git a/CMakeLists.txt b/CMakeLists.txt index a3e0b64e97b25..ea4bc8a2d6c3e 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,10 +16,10 @@ if(APPLE AND WITH_ARM) # cmake 3.19.2 version starts to support M1 cmake_minimum_required(VERSION 3.19.2) cmake_policy(VERSION 3.19.2) -else(APPLE AND WITH_ARM) +else() cmake_minimum_required(VERSION 3.15) cmake_policy(VERSION 3.10) -endif(APPLE AND WITH_ARM) +endif() # use to get_property location of 
static lib # https://cmake.org/cmake/help/v3.0/policy/CMP0026.html?highlight=cmp0026 cmake_policy(SET CMP0026 OLD) @@ -152,7 +152,7 @@ if(WIN32) if(${flag_var} MATCHES "/MD") string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") endif() - endforeach(flag_var) + endforeach() endif() # NOTE(zhouwei): msvc max/min macro conflict with std::min/max, define NOMINMAX globally @@ -179,10 +179,10 @@ if(WIN32) math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") endif() - endforeach(flag_var) + endforeach() foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS) set(${flag_var} "${${flag_var}} /w") - endforeach(flag_var) + endforeach() # Windows Remove /Zi, /ZI for Release, MinSizeRel builds foreach(flag_var @@ -191,7 +191,7 @@ if(WIN32) if(${flag_var} MATCHES "/Z[iI]") string(REGEX REPLACE "/Z[iI]" "" ${flag_var} "${${flag_var}}") endif() - endforeach(flag_var) + endforeach() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838" @@ -207,7 +207,7 @@ if(WIN32) if(MSVC_STATIC_CRT) set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB") endif() - endforeach(flag_var) + endforeach() if(WITH_WIN_DUMP_DBG) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Zi") @@ -216,16 +216,16 @@ if(WIN32) foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) set(${flag_var} "${${flag_var}} /DEBUG /OPT:REF /OPT:ICF") - endforeach(flag_var) + endforeach() add_definitions("-DWITH_WIN_DUMP_DBG") endif() -else(WIN32) +else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations" ) -endif(WIN32) +endif() find_package(Git REQUIRED) @@ -430,7 +430,7 @@ endif() if(WITH_ROCM) include(hip) include(miopen) # set miopen libraries, must before configure -endif(WITH_ROCM) +endif() if(WITH_XPU_KP) include(xpu_kp) diff --git a/README.md b/README.md index 048a273a7d78b..e44378310f726 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ English | [简体中文](./README_cn.md) Welcome to the PaddlePaddle GitHub. PaddlePaddle, as the first independent R&D deep learning platform in China, has been officially open-sourced to professional communities since 2016. It is an industrial platform with advanced technologies and rich features that cover core deep learning frameworks, basic model libraries, end-to-end development kits, tools & components as well as service platforms. -PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 4 million developers, 157,000 companies and generating 476,000 models. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI. +PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 4.7 million developers, 180,000 companies and generating 560,000 models. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI. ## Installation @@ -85,7 +85,7 @@ We provide [English](https://www.paddlepaddle.org.cn/documentation/docs/en/guide - [Github Issues](https://github.com/PaddlePaddle/Paddle/issues): bug reports, feature requests, install issues, usage issues, etc. 
- QQ discussion group: 441226485 (PaddlePaddle). -- [Forums](https://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc. +- [Forums](https://aistudio.baidu.com/paddle/forum): discuss implementations, research, etc. ## Courses diff --git a/README_cn.md b/README_cn.md index 3834ee148f940..8f0caab285e07 100644 --- a/README_cn.md +++ b/README_cn.md @@ -15,11 +15,11 @@ 欢迎来到 PaddlePaddle GitHub -飞桨(PaddlePaddle)以百度多年的深度学习技术研究和业务应用为基础,是中国首个自主研发、功能完备、 开源开放的产业级深度学习平台,集深度学习核心训练和推理框架、基础模型库、端到端开发套件和丰富的工具组件于一体。目前,飞桨累计开发者406万,服务企业15.7万家,基于飞桨开源深度学习平台产生了47.6万个模型。飞桨助力开发者快速实现AI想法,快速上线AI业务。帮助越来越多的行业完成AI赋能,实现产业智能化升级。 +飞桨(PaddlePaddle)以百度多年的深度学习技术研究和业务应用为基础,是中国首个自主研发、功能完备、 开源开放的产业级深度学习平台,集深度学习核心训练和推理框架、基础模型库、端到端开发套件和丰富的工具组件于一体。目前,飞桨累计开发者477万,服务企业18万家,基于飞桨开源深度学习平台产生了56万个模型。飞桨助力开发者快速实现AI想法,快速上线AI业务。帮助越来越多的行业完成AI赋能,实现产业智能化升级。 ## 安装 -### PaddlePaddle最新版本: [v2.2](https://github.com/PaddlePaddle/Paddle/tree/release/2.2) +### PaddlePaddle最新版本: [v2.3](https://github.com/PaddlePaddle/Paddle/tree/release/2.3) 跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) @@ -83,7 +83,7 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型 - 欢迎您通过[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)来提交问题、报告与建议 - QQ群: 441226485 (PaddlePaddle) -- [论坛](https://ai.baidu.com/forum/topic/list/168): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围 +- [论坛](https://aistudio.baidu.com/paddle/forum): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围 ## 课程 diff --git a/cmake/FindNumPy.cmake b/cmake/FindNumPy.cmake index fc7cdb8c1923c..b2d36f0cb01ba 100644 --- a/cmake/FindNumPy.cmake +++ b/cmake/FindNumPy.cmake @@ -26,7 +26,7 @@ if(PYTHON_EXECUTABLE) OUTPUT_VARIABLE NUMPY_PATH) elseif(_numpy_out) message(STATUS "Python executable not found.") -endif(PYTHON_EXECUTABLE) +endif() find_path(PYTHON_NUMPY_INCLUDE_DIR numpy/arrayobject.h HINTS "${NUMPY_PATH}" "${PYTHON_INCLUDE_PATH}") @@ -35,7 +35,7 @@ if(PYTHON_NUMPY_INCLUDE_DIR) set(PYTHON_NUMPY_FOUND 1 CACHE INTERNAL "Python numpy found") -endif(PYTHON_NUMPY_INCLUDE_DIR) +endif() include(FindPackageHandleStandardArgs) find_package_handle_standard_args(NumPy DEFAULT_MSG PYTHON_NUMPY_INCLUDE_DIR) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 91464b84ef029..f84bb15d5922b 100755 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -14,19 +14,19 @@ if(NOT WITH_PYTHON) add_definitions(-DPADDLE_NO_PYTHON) -endif(NOT WITH_PYTHON) +endif() if(WITH_TESTING) add_definitions(-DPADDLE_WITH_TESTING) -endif(WITH_TESTING) +endif() if(WITH_INFERENCE_API_TEST) add_definitions(-DPADDLE_WITH_INFERENCE_API_TEST) -endif(WITH_INFERENCE_API_TEST) +endif() if(NOT WITH_PROFILER) add_definitions(-DPADDLE_DISABLE_PROFILER) -endif(NOT WITH_PROFILER) +endif() if(WITH_AVX AND AVX_FOUND) set(SIMD_FLAG ${AVX_FLAG}) @@ -60,8 +60,8 @@ if(WIN32) FATAL "Windows build only support msvc. Which was binded by the nvcc compiler of NVIDIA." 
) - endif(NOT MSVC) -endif(WIN32) + endif() +endif() if(WITH_MUSL) add_definitions(-DPADDLE_WITH_MUSL) @@ -195,9 +195,9 @@ if(WITH_MKLML AND MKLML_IOMP_LIB) if(WIN32) # openmp not support well for now on windows set(OPENMP_FLAGS "") - else(WIN32) + else() set(OPENMP_FLAGS "-fopenmp") - endif(WIN32) + endif() set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") @@ -221,15 +221,15 @@ endif() if(WITH_BRPC_RDMA) add_definitions(-DPADDLE_WITH_BRPC_RDMA) -endif(WITH_BRPC_RDMA) +endif() if(ON_INFER) add_definitions(-DPADDLE_ON_INFERENCE) -endif(ON_INFER) +endif() if(WITH_CRYPTO) add_definitions(-DPADDLE_WITH_CRYPTO) -endif(WITH_CRYPTO) +endif() if(WITH_CUSTOM_DEVICE AND NOT WIN32) add_definitions(-DPADDLE_WITH_CUSTOM_DEVICE) diff --git a/cmake/coveralls.cmake b/cmake/coveralls.cmake index 02c1a136280f7..9c28903498729 100644 --- a/cmake/coveralls.cmake +++ b/cmake/coveralls.cmake @@ -96,7 +96,7 @@ if(WITH_COVERAGE) if(NOT ${EXCLUDE_DIR_FOUND} EQUAL -1) list(REMOVE_ITEM PADDLE_SOURCES ${TMP_PATH}) endif() - endforeach(TMP_PATH) + endforeach() endforeach() # convert to absolute path diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index aa958786cb8f4..87b943abd0106 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -274,7 +274,7 @@ set(CMAKE_CUDA_STANDARD 14) # So replace /W[1-4] with /W0 if(WIN32) string(REGEX REPLACE "/W[1-4]" " /W0 " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") -endif(WIN32) +endif() # in cuda9, suppress cuda warning on eigen set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -w") # Set :expt-relaxed-constexpr to suppress Eigen warnings @@ -293,7 +293,7 @@ if(WIN32) if(${flag_var} MATCHES "-MD") string(REGEX REPLACE "-MD" "-MT" ${flag_var} "${${flag_var}}") endif() - endforeach(flag_var) + endforeach() endif() endif() diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 443b7aa7d56b7..31280a768b3a8 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -25,7 +25,8 @@ if(WIN32) elseif(LINUX) if(WITH_ROCM) # For HIPCC Eigen::internal::device::numeric_limits is not EIGEN_DEVICE_FUNC - # which will cause compiler error of using __host__ funciont in __host__ __device__ + # which will cause compiler error of using __host__ funciont + # in __host__ __device__ file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src) file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index a9942a6bca67b..df1b827ed1824 100755 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -28,12 +28,12 @@ if(WIN32) CACHE FILEPATH "glog library." FORCE) set(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530") add_definitions("/DGOOGLE_GLOG_DLL_DECL=") -else(WIN32) +else() set(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE) set(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) -endif(WIN32) +endif() include_directories(${GLOG_INCLUDE_DIR}) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 90d61f47a52e8..14a8298790799 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -32,7 +32,8 @@ if(WIN32) set(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) else() #TODO(intel-huying): - # Now enable csrmm function in mklml library temporarily, it will be updated as offical version later. 
+ # Now enable csrmm function in mklml library temporarily, + # it will be updated as offical version later. set(MKLML_VER "csrmm_mklml_lnx_2019.0.5" CACHE STRING "" FORCE) @@ -51,8 +52,9 @@ message(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") set(MKLML_PREFIX_DIR ${THIRD_PARTY_PATH}/mklml) set(MKLML_SOURCE_DIR ${THIRD_PARTY_PATH}/mklml/src/extern_mklml) -# Ninja Generator can not establish the correct dependency relationship between the imported library with target, -# the product file in the ExternalProject need to be specified manually, please refer to +# Ninja Generator can not establish the correct dependency relationship +# between the imported library with target, the product file +# in the ExternalProject need to be specified manually, please refer to # https://stackoverflow.com/questions/54866067/cmake-and-ninja-missing-and-no-known-rule-to-make-it # It is the same to all other ExternalProject. ExternalProject_Add( diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 1cccfb86f4208..a93121e95c4e7 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -58,7 +58,7 @@ if(NOT WIN32) UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_BYPRODUCTS ${CBLAS_LIBRARIES}) -else(NOT WIN32) +else() set(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "openblas library." FORCE) @@ -92,4 +92,4 @@ else(NOT WIN32) BUILD_BYPRODUCTS ${CBLAS_LIBRARIES}) set(OPENBLAS_SHARED_LIB ${CBLAS_INSTALL_DIR}/bin/openblas${CMAKE_SHARED_LIBRARY_SUFFIX}) -endif(NOT WIN32) +endif() diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake index 75e2c42cb5a29..96f24bfc8a5bb 100644 --- a/cmake/external/paddle2onnx.cmake +++ b/cmake/external/paddle2onnx.cmake @@ -69,7 +69,7 @@ else() set(PADDLE2ONNX_COMPILE_LIB "${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.so" CACHE FILEPATH "paddle2onnx compile library." FORCE) -endif(WIN32) +endif() if(WIN32) set(PADDLE2ONNX_URL diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 7c5de92362db4..6f9078c8eeecd 100755 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -16,7 +16,7 @@ include(ExternalProject) # Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp if(NOT WIN32) find_package(Protobuf QUIET) -endif(NOT WIN32) +endif() unset_var(PROTOBUF_INCLUDE_DIR) unset_var(PROTOBUF_FOUND) @@ -147,7 +147,7 @@ set(PROTOBUF_ROOT CACHE PATH "Folder contains protobuf") if(WIN32) set(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf) -endif(WIN32) +endif() if(NOT "${PROTOBUF_ROOT}" STREQUAL "") find_path( @@ -349,4 +349,4 @@ if(NOT PROTOBUF_FOUND) # `protoc.exe` existed before calling it. 
set(EXTERN_PROTOBUF_DEPEND extern_protobuf) prompt_protobuf_lib(extern_protobuf) -endif(NOT PROTOBUF_FOUND) +endif() diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index af27500398f57..7d1cca4feb6a6 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -134,9 +134,9 @@ if(WITH_XPU_BKCL) set(XPU_BKCL_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") include_directories(${XPU_BKCL_INC_DIR}) target_link_libraries(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_BKCL_LIB}) -else(WITH_XPU_BKCL) +else() target_link_libraries(xpulib ${XPU_API_LIB} ${XPU_RT_LIB}) -endif(WITH_XPU_BKCL) +endif() add_dependencies(xpulib ${XPU_PROJECT}) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index e3c5545df8b27..41a7b4a9d1cce 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -113,10 +113,10 @@ check_type_size(pthread_spinlock_t SPINLOCK_FOUND) check_type_size(pthread_barrier_t BARRIER_FOUND) if(SPINLOCK_FOUND) add_definitions(-DPADDLE_USE_PTHREAD_SPINLOCK) -endif(SPINLOCK_FOUND) +endif() if(BARRIER_FOUND) add_definitions(-DPADDLE_USE_PTHREAD_BARRIER) -endif(BARRIER_FOUND) +endif() set(CMAKE_EXTRA_INCLUDE_FILES "") # Only one sanitizer is allowed in compile time @@ -180,7 +180,7 @@ if(NOT WIN32) -Wno-parentheses # Warning in Eigen gcc 8.3 ) endif() - endif(NOT APPLE) + endif() set(GPU_COMMON_FLAGS -fPIC @@ -200,21 +200,21 @@ if(NOT WIN32) AND NOT WITH_MIPS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64") endif() -endif(NOT WIN32) +endif() if(APPLE) if(WITH_ARM) set(CMAKE_OSX_ARCHITECTURES "arm64" CACHE STRING "Build architectures for OSX" FORCE) - else(WITH_ARM) + else() set(CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) - endif(WITH_ARM) + endif() # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0 set(COMMON_FLAGS -Wno-deprecated-register) -endif(APPLE) +endif() if(WITH_HETERPS AND WITH_PSLIB) set(COMMON_FLAGS -D_GLIBCXX_USE_CXX11_ABI=0 ${COMMON_FLAGS}) @@ -224,7 +224,7 @@ endif() if(LINUX) set(GPU_COMMON_FLAGS -Wall -Wextra -Werror ${GPU_COMMON_FLAGS}) -endif(LINUX) +endif() foreach(flag ${COMMON_FLAGS}) safe_set_cflag(CMAKE_C_FLAGS ${flag}) diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 3514882c944de..44e9e2ee8ccaf 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -112,7 +112,7 @@ if(CMAKE_BUILD_TYPE MATCHES Debug) list(APPEND HIP_CXX_FLAGS -g2) list(APPEND HIP_CXX_FLAGS -O0) list(APPEND HIP_HIPCC_FLAGS -fdebug-info-for-profiling) -endif(CMAKE_BUILD_TYPE MATCHES Debug) +endif() set(HIP_HCC_FLAGS ${HIP_CXX_FLAGS}) set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index a8e3696418bd4..56345373dbe8c 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -59,14 +59,14 @@ function(copy TARGET) POST_BUILD COMMAND ${PYTHON_EXECUTABLE} ${COPY_SCRIPT_DIR}/copyfile.py ${native_src} ${native_dst}) - else(WIN32) #not windows + else() #not windows add_custom_command( TARGET ${TARGET} POST_BUILD COMMAND mkdir -p "${dst}" COMMAND cp -r "${src}" "${dst}" COMMENT "copying ${src} -> ${dst}") - endif(WIN32) # not windows + endif() # not windows endforeach() endfunction() @@ -265,7 +265,7 @@ if(WIN32) DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) -else(WIN32) +else() set(paddle_inference_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_inference.*) copy( @@ -273,7 +273,7 @@ else(WIN32) SRCS 
${src_dir}/inference/api/paddle_*.h ${paddle_inference_lib} DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) -endif(WIN32) +endif() copy( inference_lib_dist @@ -350,11 +350,11 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") if(WIN32) set(paddle_inference_c_lib $/paddle_inference_c.*) -else(WIN32) +else() set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi_exp/libpaddle_inference_c.* ) -endif(WIN32) +endif() copy( inference_lib_dist @@ -436,7 +436,7 @@ set(module "platform") set(platform_lib_deps profiler_proto errors) if(WITH_GPU) set(platform_lib_deps ${platform_lib_deps} external_error_proto) -endif(WITH_GPU) +endif() add_dependencies(fluid_lib_dist ${platform_lib_deps}) copy( diff --git a/cmake/miopen.cmake b/cmake/miopen.cmake index 392ff0401eaef..13d9563e3abdd 100644 --- a/cmake/miopen.cmake +++ b/cmake/miopen.cmake @@ -65,10 +65,9 @@ macro(find_miopen_version miopen_header_file) math(EXPR MIOPEN_VERSION "${MIOPEN_MAJOR_VERSION} * 1000 + ${MIOPEN_MINOR_VERSION} * 10 + ${MIOPEN_PATCH_VERSION}") message( - STATUS - "Current MIOpen header is ${MIOPEN_INCLUDE_DIR}/miopen/miopen.h " - "Current MIOpen version is v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}.${MIOPEN_PATCH_VERSION}. " - ) + STATUS "Current MIOpen header is ${MIOPEN_INCLUDE_DIR}/miopen/miopen.h " + "Current MIOpen version is v${MIOPEN_MAJOR_VERSION}.\ + ${MIOPEN_MINOR_VERSION}.${MIOPEN_PATCH_VERSION}. ") endif() endmacro() diff --git a/cmake/nccl.cmake b/cmake/nccl.cmake index 8ce3cd91ac82a..eaa7bd23fd9b2 100644 --- a/cmake/nccl.cmake +++ b/cmake/nccl.cmake @@ -50,10 +50,8 @@ if(WITH_NCCL) endif() add_definitions("-DNCCL_VERSION_CODE=$NCCL_VERSION") - message( - STATUS - "Current NCCL header is ${NCCL_INCLUDE_DIR}/nccl.h. " - "Current NCCL version is v${NCCL_MAJOR_VERSION}.${NCCL_MINOR_VERSION}.${NCCL_PATCH_VERSION} " - ) + message(STATUS "Current NCCL header is ${NCCL_INCLUDE_DIR}/nccl.h. " + "Current NCCL version is \ + v${NCCL_MAJOR_VERSION}.${NCCL_MINOR_VERSION}.${NCCL_PATCH_VERSION} ") endif() endif() diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 4e0cc1027eff0..e8d7ba1401ebe 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -217,7 +217,7 @@ function(op_library TARGET) return() endif() endforeach() - endif(WIN32) + endif() # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. 
if(WITH_UNITY_BUILD AND op_library_UNITY) diff --git a/cmake/python_module.cmake b/cmake/python_module.cmake index 9367435b61b55..47e449c9dadb1 100644 --- a/cmake/python_module.cmake +++ b/cmake/python_module.cmake @@ -22,8 +22,8 @@ function(find_python_module module) set(PY_${module_upper} ${_${module}_location} CACHE STRING "Location of Python module ${module}") - endif(NOT _${module}_status) - endif(NOT PY_${module_upper}) + endif() + endif() find_package_handle_standard_args(PY_${module} DEFAULT_MSG PY_${module_upper}) if(NOT PY_${module_upper}_FOUND AND ${module}_FIND_REQUIRED) message(FATAL_ERROR "python module ${module} is not found") @@ -39,7 +39,7 @@ function(find_python_module module) set(PY_${module_upper}_VERSION ${_${module}_version} CACHE STRING "Version of Python module ${module}") - endif(NOT _${module}_status) + endif() set(PY_${module_upper}_FOUND ${PY_${module_upper}_FOUND} @@ -47,4 +47,4 @@ function(find_python_module module) set(PY_${module_upper}_VERSION ${PY_${module_upper}_VERSION} PARENT_SCOPE) -endfunction(find_python_module) +endfunction() diff --git a/cmake/simd.cmake b/cmake/simd.cmake index ff8b9d6f9a9b4..3d730657062a0 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -81,10 +81,10 @@ check_cxx_source_runs( #include int main() { - __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); - __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); - __m256 result = _mm256_add_ps (a, b); - return 0; + __m256 a = _mm256_set_ps(-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); + __m256 b = _mm256_set_ps(1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 result = _mm256_add_ps(a, b); + return 0; }" AVX_FOUND) diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 0cfc82709637f..21da7a0560ee3 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -14,7 +14,7 @@ if(WITH_DISTRIBUTE) DEPS phi_api eager_api gloo_wrapper) endif() -if(WITH_NCCL) +if(WITH_NCCL OR WITH_RCCL) cc_library( processgroup_nccl SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/NCCLTools.h index f38ce8faa7ffb..5f1da003313ad 100644 --- a/paddle/fluid/distributed/collective/NCCLTools.h +++ b/paddle/fluid/distributed/collective/NCCLTools.h @@ -14,7 +14,13 @@ #pragma once +#ifdef PADDLE_WITH_CUDA #include +#endif +#ifdef PADDLE_WITH_HIP +#include +#endif + #include #include @@ -23,9 +29,19 @@ #include "paddle/fluid/distributed/collective/Types.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/variable.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" +#endif + #include "paddle/fluid/platform/device_context.h" + +#ifdef PADDLE_WITH_RCCL +#include "paddle/fluid/platform/dynload/rccl.h" +#else #include "paddle/fluid/platform/dynload/nccl.h" +#endif + #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -56,7 +72,11 @@ class EventManager { ~EventManager() { if (is_created_) { platform::CUDADeviceGuard guard(device_index_); +#ifdef PADDLE_WITH_HIP + hipEventDestroy(event_); +#else cudaEventDestroy(event_); +#endif } } @@ -94,24 +114,42 @@ class EventManager { device_index, device_index_)); platform::CUDADeviceGuard guard(device_index_); +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, 
ctx.stream())); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, ctx.stream())); +#endif } bool Query() const { +#ifdef PADDLE_WITH_HIP + gpuError_t err = hipEventQuery(event_); + if (err == hipSuccess) { + return true; + } + if (err == hipErrorNotReady) { + return false; + } +#else gpuError_t err = cudaEventQuery(event_); if (err == cudaSuccess) { return true; - } else if (err == cudaErrorNotReady) { - return false; - } else { - PADDLE_ENFORCE_GPU_SUCCESS(err); + } + if (err == cudaErrorNotReady) { return false; } +#endif + PADDLE_ENFORCE_GPU_SUCCESS(err); + return false; } void Synchronize() const { if (is_created_) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_)); +#else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_)); +#endif } } @@ -124,12 +162,22 @@ class EventManager { "Event's device %d", device_index, device_index_)); platform::CUDADeviceGuard guard(device_index_); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(ctx.stream(), event_, 0)); +#else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(ctx.stream(), event_, 0)); +#endif } } private: +#ifdef PADDLE_WITH_HIP + unsigned int flags_ = hipEventDefault; +#else unsigned int flags_ = cudaEventDefault; +#endif + bool is_created_{false}; gpuEvent_t event_{}; int8_t device_index_{0}; @@ -138,7 +186,13 @@ class EventManager { void CreateEvent(int device_index) { device_index_ = device_index; platform::CUDADeviceGuard guard(device_index); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags(&event_, flags_)); +#else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags(&event_, flags_)); +#endif + is_created_ = true; } }; diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index dc67205c78f56..793f8dacbf8d4 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -95,7 +95,11 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { // If we use the work to do barrier, we should block cpu for (auto& place : places_) { platform::CUDADeviceGuard gpuGuard(place); +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); +#endif } } return true; diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 2325e645b4c46..c56f75b46518c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -30,8 +30,13 @@ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/stream/cuda_stream.h" -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/NCCLTools.h" +#endif + +#ifdef PADDLE_WITH_RCCL +#include "paddle/fluid/platform/dynload/rccl.h" +#else #include "paddle/fluid/platform/dynload/nccl.h" #endif diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 73baf21015833..e92e1e12b8991 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1152,7 +1152,8 @@ static std::string GenerateGradNodeCreationContent( size_t bwd_in_slot_num = out_vars.size(); size_t bwd_out_slot_num = in_vars.size(); const char* 
GRAD_OP_NODE_TEMPLATE = - " auto grad_node = std::shared_ptr(new GradNode%s(%d, " + " auto grad_node = std::shared_ptr<%sGradNodeCompat>(new " + "%sGradNodeCompat(%d, " "%d));\n"; grad_node_creation_str += " // Create GradOpNode\n"; grad_node_creation_str += @@ -2080,10 +2081,8 @@ static std::string GenerateSingleOpBase( generated_grad_function_body += " paddle::small_vector, " "egr::kSlotSmallVectorSize> " + - hooked_grads + - " = " - "GradNode" + - fwd_op_type + "::ApplyGradientHooks(grads);\n"; + hooked_grads + " = " + fwd_op_type + + "GradNodeCompat::ApplyGradientHooks(grads);\n"; // [Generation] Get Ins Map std::unordered_set dispensable_input_name_set; @@ -2547,7 +2546,7 @@ static std::string GenerateGradNodeCCContents( */ const char* EAGER_LOG_TEMPLATE = - " VLOG(3) << \"Running Eager Backward Node: GradNode%s\";\n"; + " VLOG(3) << \"Running Eager Backward Node: %sGradNodeCompat\";\n"; std::string generated_grad_function_body = paddle::string::Sprintf(EAGER_LOG_TEMPLATE, fwd_op_type); @@ -2616,7 +2615,7 @@ static std::string GenerateGradNodeCCContents( const char* GRAD_FUNCTION_TEMPLATE = "paddle::small_vector, " "egr::kSlotSmallVectorSize> " - "GradNode%s::operator()(" + "%sGradNodeCompat::operator()(" "paddle::small_vector, " "egr::kSlotSmallVectorSize>& grads, bool " "create_graph, bool is_new_grad) {\n" @@ -2645,14 +2644,15 @@ static std::string GenerateGradNodeHeaderContents( VLOG(6) << "Generating Grad Node Header"; const char* GRAD_NODE_TEMPLATE = - "class GradNode%s : public egr::GradNodeBase {\n" + "class %sGradNodeCompat : public egr::GradNodeBase {\n" " public:\n" - " GradNode%s() : egr::GradNodeBase() { VLOG(7) << \" Construct " - "GradNode%s \"; }\n" - " GradNode%s(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : " + " %sGradNodeCompat() : egr::GradNodeBase() { VLOG(7) << \" Construct " + "%sGradNodeCompat \"; }\n" + " %sGradNodeCompat(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : " "egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) { VLOG(7) << \" " - "Construct GradNode%s \"; }\n" - " ~GradNode%s() override { VLOG(6) << \" Destruct GradNode%s \"; }\n" + "Construct %sGradNodeCompat \"; }\n" + " ~%sGradNodeCompat() override { VLOG(6) << \" Destruct " + "%sGradNodeCompat \"; }\n" "\n" " virtual " "paddle::small_vector, " @@ -2667,11 +2667,11 @@ static std::string GenerateGradNodeHeaderContents( "%s\n" " SetIsTensorWrappersCleared(true);\n" " }\n" - " std::string name() override { return \"GradNode%sMid\"; } \n " + " std::string name() override { return \"%sGradNodeCompat\"; } \n " "\n" "std::shared_ptr Copy() const override {{\n " - " auto copied_node = std::shared_ptr(new " - "GradNode%s(*this));\n " + " auto copied_node = std::shared_ptr<%sGradNodeCompat>(new " + "%sGradNodeCompat(*this));\n " " return copied_node;\n " "}}\n " "\n" diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 87b2ff986dc92..dee3b3d79a2e7 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -1,11 +1,11 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -147,7 +147,18 @@ def RemoveConstAndReference(string): def GetGradNodeName(string): - return f"GradNode{string}Final" + + def str2Hump(text): + arr = filter(None, text.split('_')) + res = '' + for i in arr: + res = res + i[0].upper() + i[1:] + return res + + string = str2Hump(string) + if string.rfind("Grad") == (len(string) - 4): + string = string[:-4] + return f"{string}GradNodeFinal" def GetDygraphForwardFunctionName(string): @@ -335,6 +346,7 @@ def ParseYamlInplaceInfo(string): ### Generator Base ### ######################## class FunctionGeneratorBase: + def __init__(self, forward_api_contents, namespace): self.forward_api_contents = forward_api_contents self.namespace = namespace @@ -357,7 +369,7 @@ def __init__(self, forward_api_contents, namespace): # Special Op Attributes self.optional_inputs = [] #[name, ...] self.no_need_buffers = [] #[name, ...] - self.intermediate_outputs = [] #[name, ...] + self.intermediate_outputs = [] #[name, ...] self.forward_inplace_map = {} #{name : name, ...} def ParseForwardInplaceInfo(self): @@ -423,8 +435,9 @@ def DetermineForwardPositionMap(self, forward_inputs_list, input_type = forward_input[1] input_pos = forward_input[2] - self.forward_inputs_position_map[ - input_name] = [input_type, input_pos] + self.forward_inputs_position_map[input_name] = [ + input_type, input_pos + ] for i in range(len(forward_returns_list)): forward_return = forward_returns_list[i] @@ -432,11 +445,13 @@ def DetermineForwardPositionMap(self, forward_inputs_list, return_type = forward_return[1] return_pos = forward_return[2] - self.forward_outputs_position_map[ - return_name] = [return_type, return_pos] + self.forward_outputs_position_map[return_name] = [ + return_type, return_pos + ] class GeneratorBase: + def __init__(self, api_yaml_path): self.namespace = "" self.api_yaml_path = api_yaml_path diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index d23d71b07626d..c0feecd2e9e20 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -411,6 +411,7 @@ def GenerateCoreOpInfoDefinition(): ## Generator Class ## ##################### class DygraphFunctionGeneratorBase(FunctionGeneratorBase): + def __init__(self, forward_api_contents, grad_api_contents, namespace): self.forward_api_contents = forward_api_contents # Members from Parent: @@ -532,8 +533,8 @@ def ForwardsValidationCheck(self): max_input_position = max(max_input_position, pos) for _, _, _, pos in forward_attrs_list: - assert pos > max_input_position, AssertMessage(pos, - max_input_position) + assert pos > max_input_position, AssertMessage( + pos, max_input_position) def BackwardValidationCheck(self): backward_forward_inputs_map = self.backward_forward_inputs_map @@ -678,7 +679,7 @@ def GenerateNodeCreationCodes(self): # Node Construction num_backward_inputs = len(forward_outputs_position_map.keys()) num_backward_outputs = len(forward_inputs_position_map.keys()) - grad_node_name = GetGradNodeName(forward_api_name) + grad_node_name = GetGradNodeName(self.backward_api_name) # Helper 
indent = GetIndent(2) @@ -845,6 +846,7 @@ def run(self): class DygraphForwardFunctionGenerator(DygraphFunctionGeneratorBase): + def __init__(self, forward_api_contents, grad_api_contents, namespace): DygraphFunctionGeneratorBase.__init__(self, forward_api_contents, grad_api_contents, namespace) @@ -947,12 +949,12 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): if is_inplaced and len(forward_outputs_position_map) == 1: api_out_type = "auto&" forward_call_str = f"{indent}{api_out_type} api_result = paddle::experimental::{namespace}{function_name}({inputs_call_args_str});" - num_outputs = len(forward_outputs_position_map.keys()) - len( - intermediate_outputs) + num_outputs = len( + forward_outputs_position_map.keys()) - len(intermediate_outputs) # Check Nan and Inf - check_nan_inf_str = CHECK_NAN_AND_INF_TEMPLATE.format(function_name, - "api_result") + check_nan_inf_str = CHECK_NAN_AND_INF_TEMPLATE.format( + function_name, "api_result") # Get Outputs get_outputs_str = "" @@ -1007,8 +1009,8 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): if pos == corresponding_pos: has_corresponding_grad_output = True if has_corresponding_grad_output or ( - name in forward_inplace_map and - forward_api_name not in inplace_check_blacklist): + name in forward_inplace_map + and forward_api_name not in inplace_check_blacklist): input_autograd_meta_name = GetAutoGradMetaName(name) if IsPlainTensorType(ttype): input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({name});" @@ -1116,17 +1118,20 @@ def UpdateCoreOpsInformation(self, is_inplaced): forward_outputs_position_map = self.forward_outputs_position_map forward_attrs_list = self.forward_attrs_list - num_args = len(forward_inputs_position_map.keys()) + len( - forward_attrs_list) + num_args = len( + forward_inputs_position_map.keys()) + len(forward_attrs_list) num_returns = len(forward_outputs_position_map.keys()) final_state_fwd_api_name = "final_state_" + forward_api_name - core_ops_returns_info[ - final_state_fwd_api_name] = ["" for i in range(num_returns)] - core_ops_args_info[ - final_state_fwd_api_name] = ["" for i in range(num_args)] - core_ops_args_type_info[ - final_state_fwd_api_name] = ["" for i in range(num_args)] + core_ops_returns_info[final_state_fwd_api_name] = [ + "" for i in range(num_returns) + ] + core_ops_args_info[final_state_fwd_api_name] = [ + "" for i in range(num_args) + ] + core_ops_args_type_info[final_state_fwd_api_name] = [ + "" for i in range(num_args) + ] for name, (ttype, pos) in forward_inputs_position_map.items(): core_ops_args_info[final_state_fwd_api_name][pos] = name @@ -1159,6 +1164,7 @@ def run(self): class DygraphNodeGenerator(DygraphFunctionGeneratorBase): + def __init__(self, forward_api_contents, grad_api_contents, @@ -1167,7 +1173,7 @@ def __init__(self, DygraphFunctionGeneratorBase.__init__(self, forward_api_contents, grad_api_contents, namespace) - # Record name mapping from forward_api_name to grad_api_names + # Record name mapping from forward_var_name to grad_var_names self.to_next_grad_name_mapping = {} # {name : name} # Generated Results @@ -1281,7 +1287,7 @@ def GenerateNodeDeclaration(self): attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format( RemoveConstAndReference(atype), saved_attr_name) - grad_node_name = GetGradNodeName(forward_op_name) + grad_node_name = GetGradNodeName(self.backward_api_name) self.node_declaration_str = NODE_DECLARATION_TEMPLATE.format( grad_node_name, grad_node_name, 
grad_node_name, grad_node_name, grad_node_name, clear_tensor_wrapper_str, grad_node_name, @@ -1447,8 +1453,8 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, {indent}{grad_api_namespace}{backward_api_name}({grad_api_args_str});""" # Check Nan and Inf - check_nan_inf_str = CHECK_NAN_AND_INF_TEMPLATE.format(backward_api_name, - "returns") + check_nan_inf_str = CHECK_NAN_AND_INF_TEMPLATE.format( + backward_api_name, "returns") # Prepare for Node Creation if Necessary inputs_autograd_meta_str = "" @@ -1533,7 +1539,7 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, returns_str = f"{indent}if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n" returns_str += f"{indent}return returns;\n" - grad_node_name = GetGradNodeName(forward_api_name) + grad_node_name = GetGradNodeName(self.backward_api_name) self.node_definition_str = GRAD_FUNCTION_TEMPLATE.format( grad_node_name, fill_zero_str, get_grad_in_args_str, grad_node_name, @@ -1560,6 +1566,7 @@ def run(self): class DygraphForwardAndNodesGenerator(GeneratorBase): + def __init__(self, api_yaml_path, backward_yaml_path): # Parent members: # self.namespace @@ -1617,9 +1624,10 @@ def GenerateCode(self): next_grad_api_contents = self.GetBackwardAPIContents( backward_api_contents) - node_generator = DygraphNodeGenerator( - forward_api_contents, backward_api_contents, namespace, - next_grad_api_contents) + node_generator = DygraphNodeGenerator(forward_api_contents, + backward_api_contents, + namespace, + next_grad_api_contents) node_generator.run() self.node_declaration_str += node_generator.node_declaration_str + "\n" self.node_definition_str += node_generator.node_definition_str + "\n" diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 36cfb4db1137a..09bbc2a0ba40a 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -536,7 +536,7 @@ std::vector RunBackward( const std::vector& inputs = {}, bool allow_unused = false, const std::vector& no_grad_vars = {}) { - VLOG(6) << "Start Backward"; + VLOG(3) << "Start Backward"; // *Gradient Hook should happen at node-level // *Inplace version check should perform at node-level @@ -634,7 +634,7 @@ std::vector RunBackward( GeneralGrad::Instance().ReconstructBackwardGraph(orig_queue); } - VLOG(6) << "Update In degree Map for backward"; + VLOG(3) << "Update In degree Map for backward"; // 3. Compute in_degree for each node std::unordered_map node_in_degree_map = getInDegreeMap(queue); @@ -654,7 +654,7 @@ std::vector RunBackward( // |- node(grads) // |- Prepare for next node // 3. 
Update queue - VLOG(6) << "Run Backward"; + VLOG(3) << "Run Backward"; while (!queue.empty()) { GradNodeBase* node = queue.front(); VLOG(6) << "Running GradNode:" << node->name(); @@ -739,7 +739,7 @@ std::vector RunBackward( // Since we make edge has as same rank as bwd outputs, we indexing them // with the same rank(i, j) auto next_node_shared = edge.GetMutableGradNode(); - + VLOG(3) << "Found pending node: " << next_node_shared->name(); // Next node could be nullptr if it is leaf tensor with no // AccumulationNode attached // Or it could also originated from dispensable inputs @@ -826,7 +826,7 @@ void Backward( const std::vector& tensors, // outputs const std::vector& grad_tensors, bool retain_graph) { - VLOG(6) << "Run in Backward"; + VLOG(3) << "Run in Backward"; paddle::platform::RecordEvent backward_record_event( "backward", paddle::platform::TracerEventType::Operator, 1); RunBackward(tensors, grad_tensors, retain_graph); @@ -839,7 +839,7 @@ std::vector Grad( const std::vector& grad_tensors, bool retain_graph, bool create_graph, bool only_inputs, bool allow_unused, const std::vector& no_grad_vars) { - VLOG(6) << "Run in Grad"; + VLOG(3) << "Run in Grad"; DuplicateCheck(inputs, true /* is_input */); DuplicateCheck(tensors, false /* is_input */); diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc index 3efcf3b21a4e3..beff23d433421 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -263,9 +263,9 @@ RunCustomOpNode::operator()( trace_backward, &(ins_auto_grad_metas[i])); } - if (require_any_grad) { - auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap(); - const auto& vec_map = meta_info_map.at(op_type_); + auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap(); + const auto& vec_map = meta_info_map.at(op_type_); + if (require_any_grad && (vec_map.size() > 2)) { paddle::platform::RecordEvent node_creation_record_event( "Custom Op " + op_type_ + " double_grad node_creation", paddle::platform::TracerEventType::OperatorInner, 1); diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 01b31a2500cb0..1cba94339bfdf 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -230,7 +230,7 @@ void GradNodeBase::SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, fwd_in_meta->SetGradNode( std::make_shared(fwd_in_meta)); } - VLOG(6) << "Add Edges for slot: " << slot_rank << ", the Edge is from " + VLOG(3) << "Add Edges for slot: " << slot_rank << ", the Edge is from " << this->name() << " (addr: " << this << ") " << " to " << fwd_in_meta->GetMutableGradNode()->name() << " (addr: " << fwd_in_meta->GetMutableGradNode().get() << ")"; @@ -286,7 +286,7 @@ void GradNodeBase::SetGradOutMeta( fwd_in_meta->SetGradNode( std::make_shared(fwd_in_meta)); } - VLOG(6) << "Add Edges for slot: " << slot_rank << ", the Edge is from " + VLOG(3) << "Add Edges for slot: " << slot_rank << ", the Edge is from " << this->name() << " (addr: " << this << ") " << " to " << fwd_in_meta->GetMutableGradNode()->name() << " (addr: " << fwd_in_meta->GetMutableGradNode().get() << ")"; diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 7f5ae233874b6..ee5dd622412e1 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -77,6 +77,8 @@ void 
GradTensorHolder::CopyValueFromTensor( "Only Support DENSE_TENSOR, SPARSE_COO_TENSOR, SPARSE_CSR_TENSOR " "now.")); } + egr::EagerUtils::autograd_meta(&(buffer_[slot_id][rank])) + ->SetStopGradient(false); } } } @@ -84,8 +86,6 @@ void GradTensorHolder::add(size_t slot_id, size_t rank, const paddle::experimental::Tensor& t, bool create_graph) { - // TODO(jiabin): We need to deal with empty input_buffer with slot size not - // empty; PADDLE_ENFORCE(slot_id < buffer_.size(), paddle::platform::errors::Fatal( "Invalid slot_id for GradTensorHolder::add() " diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index d1c5983a3702f..0ed1a198c916d 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -114,6 +114,7 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const TupleOfTensorAndVector& tensors) { CheckTensorHasNanOrInf(api_name, std::get<0>(tensors)); CheckTensorHasNanOrInf(api_name, std::get<1>(tensors)); + CheckTensorHasNanOrInf(api_name, std::get<2>(tensors)); } } // namespace egr diff --git a/paddle/fluid/eager/nan_inf_utils.h b/paddle/fluid/eager/nan_inf_utils.h index a411504fa4900..815e3bd6cd14f 100644 --- a/paddle/fluid/eager/nan_inf_utils.h +++ b/paddle/fluid/eager/nan_inf_utils.h @@ -31,7 +31,8 @@ using TupleOfFourTensors = std::tuple<Tensor, Tensor, Tensor, Tensor>; using TupleOfFiveTensors = std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor>; using TupleOfSixTensors = std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor>; -using TupleOfTensorAndVector = std::tuple<Tensor, std::vector<Tensor>>; +using TupleOfTensorAndVector = + std::tuple<Tensor, std::vector<Tensor>, std::vector<Tensor>>; void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor); diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 3b22a4b0d5d7a..42235b7c484e3 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -5,7 +5,7 @@ if(WITH_PSLIB) if(NOT WITH_HETERPS) set(BRPC_DEPS brpc) endif() - endif(WITH_PSLIB_BRPC) + endif() cc_library( fleet_wrapper SRCS fleet_wrapper.cc @@ -21,7 +21,7 @@ else() fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) -endif(WITH_PSLIB) +endif() if(WITH_HETERPS) if(WITH_NCCL AND WITH_GPU) @@ -48,7 +48,7 @@ else() ps_gpu_wrapper SRCS ps_gpu_wrapper.cc DEPS gloo_wrapper) -endif(WITH_HETERPS) +endif() if(WITH_NCCL OR WITH_RCCL) cc_library( @@ -74,7 +74,7 @@ else() box_wrapper SRCS box_wrapper.cc DEPS framework_proto lod_tensor) -endif(WITH_BOX_PS) +endif() if(WITH_GLOO) cc_library( @@ -94,7 +94,7 @@ else() metrics SRCS metrics.cc DEPS gloo_wrapper) -endif(WITH_GLOO) +endif() if(WITH_PSLIB) set(DISTRIBUTE_COMPILE_FLAGS diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 72b7477f2b870..dbea438b14048 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -27,9 +27,18 @@ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +// The difference between "sequential_run" and "serial_run": +// "sequential_run" dispatches OPs one by one according to the sequence in the +// Program, while "serial_run" ensures that all Ops are scheduled in a single +// thread. In standalone executor, "sequential_run" is also "serial_run", while +// "serial_run" is not necessarily "sequential_run".
+PADDLE_DEFINE_EXPORTED_bool(new_executor_sequential_run, false, + "Enable sequential execution for standalone " + "executor, only applied to GPU OPs."); + PADDLE_DEFINE_EXPORTED_bool( - new_executor_sequential_run, false, - "Enable sequential execution for standalone executor, used for debug"); + new_executor_serial_run, false, + "Enable serial execution for standalone executor, used for debug."); DECLARE_bool(use_mkldnn); @@ -42,10 +51,8 @@ constexpr size_t kPrepareWorkQueueIdx = 2; void AsyncWorkQueue::AddTask(const OpFuncType& op_func_type, std::function fn) { VLOG(4) << "Add task: " << static_cast(op_func_type) << " "; - // NOTE(zhiqiu): use thhe second queue of size of, so only one thread is used. - if (FLAGS_new_executor_sequential_run) { - VLOG(4) << "FLAGS_new_executor_sequential_run:" - << FLAGS_new_executor_sequential_run; + // NOTE(zhiqiu): use the second queue of size of, so only one thread is used. + if (FLAGS_new_executor_serial_run) { queue_group_->AddTask(static_cast(OpFuncType::kQueueAsync), std::move(fn)); } else { @@ -789,12 +796,14 @@ std::map> build_op_downstream_map( std::set remove_duplicate; // remove the duplicate between inputs and outputs + size_t op_num = vec_instruction.size(); + // reserve - for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) { + for (size_t op_idx = 0; op_idx < op_num; ++op_idx) { op2dependences[op_idx] = std::set(); } - for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) { + for (size_t op_idx = 0; op_idx < op_num; ++op_idx) { remove_duplicate.clear(); // step1: update the op2dependences structure for (auto& item : @@ -859,8 +868,7 @@ std::map> build_op_downstream_map( std::map> op_downstream_map = GetDownstreamMap(op2dependences); - ShrinkDownstreamMap(&op_downstream_map, op_happens_before, - vec_instruction.size()); + ShrinkDownstreamMap(&op_downstream_map, op_happens_before, op_num); // add dependences for random op, make sure that the random op is scheduled // sequentially @@ -880,7 +888,7 @@ std::map> build_op_downstream_map( }; int dependence_op_idx = -1; - for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) { + for (size_t op_idx = 0; op_idx < op_num; ++op_idx) { if (random_op_set.count(vec_instruction[op_idx].OpBase()->Type())) { if (dependence_op_idx != -1) { AddDownstreamOp(dependence_op_idx, op_idx, &op_downstream_map, @@ -907,7 +915,7 @@ std::map> build_op_downstream_map( }; dependence_op_idx = -1; - for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) { + for (size_t op_idx = 0; op_idx < op_num; ++op_idx) { if (is_comm_op(vec_instruction[op_idx].OpBase()->Type())) { if (dependence_op_idx != -1) { AddDownstreamOp(dependence_op_idx, op_idx, &op_downstream_map, @@ -931,7 +939,7 @@ std::map> build_op_downstream_map( // c_sync_comm_stream(a) const std::string kSyncComm = "c_sync_comm_stream"; dependence_op_idx = -1; - for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) { + for (size_t op_idx = 0; op_idx < op_num; ++op_idx) { if (vec_instruction[op_idx].OpBase()->Type() == kSyncComm) { dependence_op_idx = op_idx; } else { @@ -947,7 +955,7 @@ std::map> build_op_downstream_map( // add dependency for coalesce_tensor const std::string kCoalesceTensor = "coalesce_tensor"; - for (size_t op_idx = 0; op_idx < vec_instruction.size(); ++op_idx) { + for (size_t op_idx = 0; op_idx < op_num; ++op_idx) { if (vec_instruction[op_idx].OpBase()->Type() == kCoalesceTensor) { VLOG(4) << "Add depend for " << kCoalesceTensor << " " << op_idx; auto fused_out = 
vec_instruction[op_idx].Outputs().at("FusedOutput")[0]; @@ -977,7 +985,7 @@ std::map> build_op_downstream_map( // find first op that reads fused_out auto first_read_fused_out_op = -1; - for (auto j = op_idx + 1; j < vec_instruction.size(); ++j) { + for (auto j = op_idx + 1; j < op_num; ++j) { if (is_read(vec_instruction[j], fused_out)) { first_read_fused_out_op = j; break; @@ -1017,8 +1025,7 @@ std::map> build_op_downstream_map( // we should take the last one to add depned instead of // 'first_read_fused_out_op' size_t target = first_read_fused_out_op; - for (size_t j = first_read_fused_out_op + 1; j < vec_instruction.size(); - ++j) { + for (size_t j = first_read_fused_out_op + 1; j < op_num; ++j) { if (j == target + 1 && is_comm_op(vec_instruction[target].OpBase()->Type()) && is_comm_op(vec_instruction[j].OpBase()->Type())) { @@ -1032,7 +1039,6 @@ std::map> build_op_downstream_map( for (auto var_id : outputs) { if (is_read(vec_instruction[j], var_id)) { AddDownstreamOp(target, j, &op_downstream_map, *op_happens_before); - op2dependences[j].insert(target); VLOG(4) << target << " -> " << j; VLOG(4) << "Add depend from " << vec_instruction[target].OpBase()->Type() << " to " @@ -1043,6 +1049,24 @@ std::map> build_op_downstream_map( } } + if (FLAGS_new_executor_sequential_run) { + dependence_op_idx = -1; + for (size_t op_idx = 0; op_idx < op_num; ++op_idx) { + if (!IsCpuOp(vec_instruction[op_idx])) { + if (dependence_op_idx != -1) { + AddDownstreamOp(dependence_op_idx, op_idx, &op_downstream_map, + *op_happens_before); + VLOG(4) << "Add depend from " + << vec_instruction[dependence_op_idx].OpBase()->Type() << "(" + << dependence_op_idx << ") to " + << vec_instruction[op_idx].OpBase()->Type() << "(" << op_idx + << ")"; + } + dependence_op_idx = op_idx; + } + } + } + VLOG(8) << "downstream count: " << CountDownstreamMap(op_downstream_map); VLOG(8) << "downstream_map: " << std::endl << StringizeDownstreamMap(op_downstream_map); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc index 24e65599018fa..e804d153f2814 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc @@ -12,17 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -// clang-format off +#include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" + #include #include #include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/phi/core/ddim.h" -// clang-format on namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc index 8a6f92a6f45d0..68c701530a12d 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc @@ -12,12 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -// clang-format off -#include "gtest/gtest.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h" +#include "gtest/gtest.h" #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h" -// clang-format on namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc b/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc index ae9f51c3f6790..1ee108f566f5f 100644 --- a/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc +++ b/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc @@ -12,12 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -// clang-format off +#include "paddle/fluid/framework/paddle2cinn/transform_desc.h" + #include #include "gtest/gtest.h" -#include "paddle/fluid/framework/paddle2cinn/transform_desc.h" -// clang-format on namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc index 88738255af78e..f4af3c5eba00e 100644 --- a/paddle/fluid/framework/program_desc.cc +++ b/paddle/fluid/framework/program_desc.cc @@ -59,9 +59,12 @@ ProgramDesc::ProgramDesc() { ProgramDesc::ProgramDesc(const ProgramDesc &o) { desc_ = o.desc_; + std::vector old_block_desc; for (int i = 0; i < desc_.blocks_size(); ++i) { auto *block = desc_.mutable_blocks(i); blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this)); + // record all block desc's ptr from origin program + old_block_desc.emplace_back(o.blocks_[i].get()); } for (size_t block_id = 0; block_id < blocks_.size(); ++block_id) { auto all_ops = blocks_[block_id]->AllOps(); @@ -70,9 +73,21 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) { for (const std::string &attr_name : op->AttrNames()) { if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) { - int sub_block_id = - o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name); - op->SetBlockAttr(attr_name, MutableBlock(sub_block_id)); + framework::BlockDesc *block_desc = + BOOST_GET_CONST(framework::BlockDesc *, op->GetAttr(attr_name)); + if (std::find(old_block_desc.begin(), old_block_desc.end(), + block_desc) != old_block_desc.end()) { + // The block is owned by the origin program. Just use id to get + // the corresponding block. + int sub_block_id = + o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name); + op->SetBlockAttr(attr_name, MutableBlock(sub_block_id)); + } else { + // The block is not owned by the origin program. Should copy + // the real block desc instead of logical block in the program. 
+ VLOG(3) << "Set op's block attr with the original block"; + op->SetBlockAttr(attr_name, block_desc); + } } else if (op->GetAttrType(attr_name) == proto::AttrType::BLOCKS) { std::vector sub_block_ids = o.Block(block_id).Op(op_id)->GetBlocksAttrIds(attr_name); diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index a4baca6f25724..5fd8eae852859 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -1085,7 +1085,7 @@ void PartialGradEngine::Clear() { void PartialGradEngine::Execute() { PADDLE_ENFORCE_NOT_NULL(task_, platform::errors::PermissionDenied( "PartialGradEngine has been destructed")); - VLOG(10) << "Starts to execute PartialGradEngine"; + VLOG(3) << "Starts to execute PartialGradEngine"; results_ = task_->Run(); Clear(); } diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index a29e530b2b80c..4e991a3013875 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -97,7 +97,7 @@ set(SHARED_INFERENCE_DEPS ${fluid_modules} phi analysis_predictor if(WITH_CRYPTO) set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} paddle_crypto) -endif(WITH_CRYPTO) +endif() if(WITH_PSCORE) set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} fleet ps_service @@ -108,7 +108,7 @@ if(WITH_ONNXRUNTIME) set(SHARED_INFERENCE_SRCS ${SHARED_INFERENCE_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/api/onnxruntime_predictor.cc) -endif(WITH_ONNXRUNTIME) +endif() # Create shared inference library cc_library( diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index f374c5c7cc20f..4b7bed65bab77 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -87,7 +87,7 @@ function(inference_analysis_test TARGET) inference_base_test_run(${TARGET} COMMAND ${TARGET} ARGS ${analysis_test_ARGS}) endif() -endfunction(inference_analysis_test) +endfunction() if(NOT APPLE AND NOT WIN32) inference_analysis_test( diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index e25c5e963982f..cace195640f64 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -15,7 +15,7 @@ if(APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") -endif(APPLE) +endif() add_subdirectory(details) @@ -84,14 +84,14 @@ if(WITH_ONNXRUNTIME) infer_io_utils onnxruntime paddle2onnx) -else(WITH_ONNXRUNTIME) +else() cc_library( analysis_predictor SRCS analysis_predictor.cc resource_manager.cc infer_context.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils) -endif(WITH_ONNXRUNTIME) +endif() cc_test( test_paddle_inference_api diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 0645af611b9d2..c41b667e18a83 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1960,6 +1960,8 @@ USE_TRT_CONVERTER(strided_slice) USE_TRT_CONVERTER(transformer_input_convert) USE_TRT_CONVERTER(recover_padding) USE_TRT_CONVERTER(remove_padding) +USE_TRT_CONVERTER(top_k) +USE_TRT_CONVERTER(top_k_v2) #if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000) USE_TRT_CONVERTER(sparse_fc) USE_TRT_CONVERTER(sparse_multihead_matmul) diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt 
b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index a76ed63f10646..c58aad36c97d2 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -21,8 +21,8 @@ macro(safe_set_static_flag) CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) if(${flag_var} MATCHES "/MD") string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") - endif(${flag_var} MATCHES "/MD") - endforeach(flag_var) + endif() + endforeach() endmacro() if(NOT DEFINED PADDLE_LIB) @@ -105,7 +105,7 @@ if(WITH_GPU) endif() endif() message(STATUS "Current CUDA lib path: ${CUDA_LIB}") - endif(NOT WIN32) + endif() endif() if(USE_TENSORRT AND WITH_GPU) @@ -157,9 +157,9 @@ if(WITH_MKL) include_directories("${MKLDNN_PATH}/include") if(WIN32) set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib) - else(WIN32) + else() set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) - endif(WIN32) + endif() endif() else() set(OPENBLAS_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}openblas") @@ -232,7 +232,7 @@ else() utf8proc_static ${EXTERNAL_LIB}) set(DEPS ${DEPS} shlwapi.lib) -endif(NOT WIN32) +endif() if(WITH_GPU) if(NOT WIN32) diff --git a/paddle/fluid/inference/api/details/CMakeLists.txt b/paddle/fluid/inference/api/details/CMakeLists.txt index c1ff6ea68a2bd..2acd96b3fb97c 100644 --- a/paddle/fluid/inference/api/details/CMakeLists.txt +++ b/paddle/fluid/inference/api/details/CMakeLists.txt @@ -26,13 +26,13 @@ if(WITH_ONNXRUNTIME) zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc DEPS onnxruntime) -else(WITH_ONNXRUNTIME) +else() cc_library( zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce) cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc) -endif(WITH_ONNXRUNTIME) +endif() cc_test( zero_copy_tensor_test diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 2c9ba42821535..52a3c1df9a925 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -60,7 +60,8 @@ list( roll_op.cc transformer_input_convert_op.cc remove_padding_op.cc - recover_padding_op.cc) + recover_padding_op.cc + top_k_op.cc) if(CUSPARSELT_FOUND AND ${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 8) list(APPEND CONVERT_FILES sparse_fc_op.cc sparse_multihead_matmul_op.cc) diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 077ba32ba89c1..f6ecf76d01675 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -295,20 +295,215 @@ class OpConverter { engine->ClearWeights(); } + // rank(result) = rank(input) + nvinfer1::ITensor* Gather(nvinfer1::ITensor* input, + const std::vector indices, int axis = 0) { + auto* indices_tensor = Add1DConstantLayer(indices, " "); + auto* result = + TRT_ENGINE_ADD_LAYER(engine_, Gather, *input, *indices_tensor, axis) + ->getOutput(0); + return result; + } + + // paddle allows negative index + // for axis length = 5, paddle allows [-5, 4] + nvinfer1::ITensor* FixNegIndices(nvinfer1::ITensor* input_shape, + nvinfer1::ITensor* indices) { + int rank = input_shape->getDimensions().nbDims; + std::vector zero = std::vector(rank, 0); + std::vector minus_one = std::vector(rank, -1); + nvinfer1::ITensor* zero_tensor = Add1DConstantLayer(zero); + nvinfer1::ITensor* minus_one_tensor = Add1DConstantLayer(minus_one); + // -1, 0 + auto* sign = Max(Min(indices, zero_tensor), 
minus_one_tensor); + return Sub(indices, Prod(sign, input_shape)); + } + + nvinfer1::ITensor* Shape(nvinfer1::ITensor* input) { + return TRT_ENGINE_ADD_LAYER(engine_, Shape, *input)->getOutput(0); + } + + // Concat not make rank changed + nvinfer1::ITensor* Concat(const std::vector& inputs, + int axis = 0) { + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Concatenation, inputs.data(), + inputs.size()); + if (axis != 0) layer->setAxis(axis); + nvinfer1::ITensor* c = layer->getOutput(0); + return c; + } + + nvinfer1::ITensor* Sum(nvinfer1::ITensor* a, nvinfer1::ITensor* b) { + nvinfer1::ITensor* c = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *a, *b, + nvinfer1::ElementWiseOperation::kSUM) + ->getOutput(0); + return c; + } + + nvinfer1::ITensor* Prod(nvinfer1::ITensor* a, nvinfer1::ITensor* b) { + nvinfer1::ITensor* c = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *a, *b, + nvinfer1::ElementWiseOperation::kPROD) + ->getOutput(0); + return c; + } + + nvinfer1::ITensor* Min(nvinfer1::ITensor* a, nvinfer1::ITensor* b) { + nvinfer1::ITensor* c = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *a, *b, + nvinfer1::ElementWiseOperation::kMIN) + ->getOutput(0); + return c; + } + + nvinfer1::ITensor* Max(nvinfer1::ITensor* a, nvinfer1::ITensor* b) { + nvinfer1::ITensor* c = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *a, *b, + nvinfer1::ElementWiseOperation::kMAX) + ->getOutput(0); + return c; + } + + nvinfer1::ITensor* Sub(nvinfer1::ITensor* a, nvinfer1::ITensor* b) { + nvinfer1::ITensor* c = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *a, *b, + nvinfer1::ElementWiseOperation::kSUB) + ->getOutput(0); + return c; + } + + nvinfer1::ITensor* Div(nvinfer1::ITensor* a, nvinfer1::ITensor* b) { + nvinfer1::ITensor* c = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *a, *b, + nvinfer1::ElementWiseOperation::kDIV) + ->getOutput(0); + return c; + } + + nvinfer1::ITensor* Act(nvinfer1::ITensor* a, + nvinfer1::ActivationType act_type) { + nvinfer1::ITensor* c = + TRT_ENGINE_ADD_LAYER(engine_, Activation, *a, act_type)->getOutput(0); + return c; + } + + // Get element tensor of 1D shape tensor + nvinfer1::ITensor* GetEleTensorOfShape(nvinfer1::ITensor* shape_tensor, + int index, bool is_scalar = false) { + auto* tensor = + TRT_ENGINE_ADD_LAYER(engine_, Gather, *shape_tensor, + *Add1DConstantLayer(index, " ", is_scalar), 0) + ->getOutput(0); + return tensor; + } + + // Create and add Multi-D constant float layer + nvinfer1::ITensor* AddConstantLayer(const float* data, + const std::vector& weight_dims, + const std::string& weight_name) { + std::unique_ptr tmp_tensor(new framework::Tensor()); + int data_size = std::accumulate(weight_dims.begin(), weight_dims.end(), 1, + std::multiplies()); + tmp_tensor->Resize({data_size}); + auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); + for (int i = 0; i < data_size; i++) { + tmp_data[i] = data[i]; + } + engine_->SetWeights(weight_name, std::move(tmp_tensor)); + + TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, + static_cast(tmp_data), + static_cast(data_size)}; + nvinfer1::Dims trt_dims; + trt_dims.nbDims = weight_dims.size(); + for (size_t i = 0; i < weight_dims.size(); i++) + trt_dims.d[i] = weight_dims[i]; + auto const_layer = + TRT_ENGINE_ADD_LAYER(engine_, Constant, trt_dims, weight.get()); + return const_layer->getOutput(0); + } + + // Create and add 1D constant float layer + nvinfer1::ITensor* Add1DConstantLayer(const std::vector& data, + const std::string& weight_name = "", + bool scalar = false) { + std::unique_ptr tmp_tensor(new 
framework::Tensor()); + int data_size = data.size(); + tmp_tensor->Resize({data_size}); + auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); + for (int i = 0; i < data_size; i++) { + tmp_data[i] = data[i]; + } + engine_->SetWeights(weight_name, std::move(tmp_tensor)); + + TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, + static_cast(tmp_data), + static_cast(data_size)}; + nvinfer1::Dims input_shape; + input_shape.nbDims = scalar ? 0 : 1; + input_shape.d[0] = data_size; + auto const_layer = + TRT_ENGINE_ADD_LAYER(engine_, Constant, input_shape, weight.get()); + return const_layer->getOutput(0); + } + + // Create and add 1D constant layer + nvinfer1::ITensor* Add1DConstantLayer(const std::vector& data, + const std::string& weight_name = "", + bool scalar = false) { + std::unique_ptr tmp_tensor(new framework::Tensor()); + int data_size = data.size(); + tmp_tensor->Resize({data_size}); + auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); + for (int i = 0; i < data_size; i++) { + tmp_data[i] = data[i]; + } + engine_->SetWeights(weight_name, std::move(tmp_tensor)); + + TensorRTEngine::Weight weight{nvinfer1::DataType::kINT32, + static_cast(tmp_data), + static_cast(data_size)}; + nvinfer1::Dims input_shape; + input_shape.nbDims = scalar ? 0 : 1; + input_shape.d[0] = data_size; + auto const_layer = + TRT_ENGINE_ADD_LAYER(engine_, Constant, input_shape, weight.get()); + return const_layer->getOutput(0); + } + + nvinfer1::ITensor* Add1DConstantLayer(nvinfer1::Dims data, + const std::string& weight_name = "", + bool scalar = false) { + std::vector tmp_data; + for (int i = 0; i < data.nbDims; i++) tmp_data.push_back(data.d[i]); + return Add1DConstantLayer(tmp_data, weight_name, scalar); + } + + nvinfer1::ITensor* Add1DConstantLayer(int32_t data, + const std::string& weight_name = "", + bool scalar = false) { + std::vector tmp_data; + tmp_data.push_back(data); + return Add1DConstantLayer(tmp_data, weight_name, scalar); + } + void RreplenishLayerAndOutput( nvinfer1::ILayer* layer, const std::string& layer_type, const std::vector& output_tensor_names, bool test_mode = false) { size_t num_out = output_tensor_names.size(); + std::string layer_name = layer_type + " (Output: "; for (size_t i = 0; i < num_out; i++) { layer->getOutput(i)->setName(output_tensor_names[i].c_str()); engine_->SetITensor(output_tensor_names[i], layer->getOutput(i)); if (test_mode) { engine_->DeclareOutput(output_tensor_names[i]); } + layer_name += output_tensor_names[i]; + if (i != num_out - 1) layer_name += ", "; } - layer->setName( - (layer_type + " (Output: " + output_tensor_names[0] + ")").c_str()); + layer->setName((layer_name + ")").c_str()); } void SetEngine(TensorRTEngine* engine) { engine_ = engine; } diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 591eb06a36202..1638515ffc47f 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -29,7 +29,6 @@ class SplitOpConverter : public OpConverter { // Declare inputs auto* input = engine_->GetITensor(op_desc.Input("X")[0]); auto input_dims = input->getDimensions(); - size_t input_num = op_desc.Input("X").size(); size_t output_num = op_desc.Output("Out").size(); // Get Attrs @@ -41,48 +40,115 @@ class SplitOpConverter : public OpConverter { if (op_desc.HasAttr("num")) { num = BOOST_GET_CONST(int, op_desc.GetAttr("num")); } - + nvinfer1::ITensor* shape_tensor = nullptr; if 
(engine_->with_dynamic_shape()) { -#if IS_TRT_VERSION_GE(6000) axis += (axis < 0) ? input_dims.nbDims : 0; -#endif + // only be called in dynamic_shape mode + shape_tensor = Shape(input); } else { axis += (axis < 0) ? input_dims.nbDims : -1; } - if (num > 0) { - int64_t in_axis_dim = input_dims.d[axis]; - size_t out_axis_dim = in_axis_dim / num; - for (int i = 0; i < num; ++i) { - output_lengths.push_back(out_axis_dim); + bool in_axis_dim_dynamic = false; + nvinfer1::ITensor* avg_len_tensor = nullptr; + // need infer output_lengths + if (num > 0 && output_lengths.empty()) { + if (input_dims.d[axis] > 0) { + int64_t in_axis_dim = input_dims.d[axis]; + size_t out_axis_dim = in_axis_dim / num; + for (int i = 0; i < num; ++i) { + output_lengths.push_back(out_axis_dim); + } + } else { + in_axis_dim_dynamic = true; + auto* num_tensor = Add1DConstantLayer(num); + avg_len_tensor = + Div(GetEleTensorOfShape(shape_tensor, axis), num_tensor); } } nvinfer1::ILayer* layer = nullptr; +#if IS_TRT_VERSION_GE(6000) + if (engine_->with_dynamic_shape()) { + nvinfer1::Dims trt_step_dims; + trt_step_dims.nbDims = input->getDimensions().nbDims; + for (int i = 0; i < trt_step_dims.nbDims; i++) trt_step_dims.d[i] = 1; + + std::vector gather_indices; + gather_indices.resize(trt_step_dims.nbDims); + std::iota(gather_indices.begin(), gather_indices.end(), 0); + gather_indices[axis] = gather_indices.size(); + std::vector zeros(trt_step_dims.nbDims, 0); + auto* zeros_tensor = Add1DConstantLayer(zeros); + // input : [N,C,H,W] + int start_point = 0; + for (size_t i = 0; i < output_num; i++) { + nvinfer1::ITensor* this_len_tensor = nullptr; + nvinfer1::ITensor* start_point_tensor = nullptr; + if (!in_axis_dim_dynamic) { + this_len_tensor = Add1DConstantLayer(output_lengths[i]); + start_point_tensor = Add1DConstantLayer(start_point); + start_point += output_lengths[i]; + } else { + this_len_tensor = avg_len_tensor; + auto* i_tensor = Add1DConstantLayer(i); + start_point_tensor = Prod(i_tensor, avg_len_tensor); + } + + std::vector concat_inputs1 = {zeros_tensor, + start_point_tensor}; + std::vector concat_inputs2 = {shape_tensor, + this_len_tensor}; + auto* start_tensor = Gather(Concat(concat_inputs1), gather_indices); + auto* size_tensor = Gather(Concat(concat_inputs2), gather_indices); + layer = TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, trt_step_dims, + trt_step_dims, trt_step_dims); + layer->setInput(1, *start_tensor); + layer->setInput(2, *size_tensor); + + auto output_name = op_desc.Output("Out")[i]; + RreplenishLayerAndOutput(layer, "split", {output_name}, test_mode); + } + } else { + auto chw_input_dims = input->getDimensions(); + nvinfer1::Dims trt_start_dims; + trt_start_dims.nbDims = chw_input_dims.nbDims; + memset(trt_start_dims.d, 0, sizeof(int32_t) * chw_input_dims.nbDims); + nvinfer1::Dims trt_size_dims = chw_input_dims; + nvinfer1::Dims trt_step_dims; + trt_step_dims.nbDims = chw_input_dims.nbDims; + for (int i = 0; i < trt_step_dims.nbDims; i++) trt_step_dims.d[i] = 1; + + // input : [C,H,W] + for (size_t i = 0; i < output_num; i++) { + trt_start_dims.d[axis] = std::accumulate(output_lengths.begin(), + output_lengths.begin() + i, 0); + trt_size_dims.d[axis] = output_lengths[i]; + layer = TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, trt_start_dims, + trt_size_dims, trt_step_dims); + auto output_name = op_desc.Output("Out")[i]; + RreplenishLayerAndOutput(layer, "split", {output_name}, test_mode); + } + } +#else if (engine_->with_dynamic_shape()) { bool with_fp16 = engine_->WithFp16() && 
!engine_->disable_trt_plugin_fp16(); plugin::SplitPluginDynamic* plugin = new plugin::SplitPluginDynamic(axis, output_lengths, with_fp16); - layer = engine_->AddDynamicPlugin(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, 1, plugin); } else { bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths, with_fp16); - layer = engine_->AddPluginV2Ext(&input, input_num, plugin); + layer = engine_->AddPluginV2Ext(&input, 1, plugin); } - - std::string layer_name = "split (Output: "; + std::vector output_names; for (size_t i = 0; i < output_num; i++) { - auto output_name = op_desc.Output("Out")[i]; - layer->getOutput(i)->setName(output_name.c_str()); - engine_->SetITensor(output_name, layer->getOutput(i)); - layer_name += output_name; - if (test_mode) { - engine_->DeclareOutput(output_name); - } + output_names.push_back(op_desc.Output("Out")[i]); } - layer->setName((layer_name + ")").c_str()); + RreplenishLayerAndOutput(layer, "split", output_names, test_mode); +#endif } }; diff --git a/paddle/fluid/inference/tensorrt/convert/top_k_op.cc b/paddle/fluid/inference/tensorrt/convert/top_k_op.cc new file mode 100644 index 0000000000000..1d7f1ca243b2a --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/top_k_op.cc @@ -0,0 +1,116 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class TopKOpConverter : public OpConverter { + public: + TopKOpConverter() {} + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + // Here the two nullptr looks strange, that's because the + // framework::OpDesc's constructor is strange. + framework::OpDesc op_desc(op, nullptr); + + auto* input_tensor = engine_->GetITensor(op_desc.Input("X")[0]); + + const int k = op_desc.HasAttr("k") + ? 
BOOST_GET_CONST(int, op_desc.GetAttr("k")) + : 1.0f; + + nvinfer1::Dims input_dims = input_tensor->getDimensions(); + int axis = input_dims.nbDims; + nvinfer1::ITopKLayer* layer = + TRT_ENGINE_ADD_LAYER(engine_, TopK, *input_tensor, + nvinfer1::TopKOperation::kMAX, k, 1 << (axis - 1)); + + std::vector output_names; + output_names.push_back(op_desc.Output("Out").front()); + output_names.push_back(op_desc.Output("Indices").front()); + + RreplenishLayerAndOutput(layer, "top_k", output_names, test_mode); + } +}; +class TopKv2OpConverter : public OpConverter { + public: + TopKv2OpConverter() {} + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + // Here the two nullptr looks strange, that's because the + // framework::OpDesc's constructor is strange. + framework::OpDesc op_desc(op, nullptr); + + auto* input_tensor = engine_->GetITensor(op_desc.Input("X")[0]); + + const int k = op_desc.HasAttr("k") + ? BOOST_GET_CONST(int, op_desc.GetAttr("k")) + : 1.0f; + const int axis = op_desc.HasAttr("axis") + ? BOOST_GET_CONST(int, op_desc.GetAttr("axis")) + : 1.0f; + const bool largest = op_desc.HasAttr("largest") + ? BOOST_GET_CONST(bool, op_desc.GetAttr("largest")) + : true; + auto flag = + largest ? nvinfer1::TopKOperation::kMAX : nvinfer1::TopKOperation::kMIN; + nvinfer1::ITopKLayer* layer = nullptr; + if (axis == -1) { + nvinfer1::Dims input_dims = input_tensor->getDimensions(); + layer = TRT_ENGINE_ADD_LAYER(engine_, TopK, *input_tensor, flag, k, + 1 << (input_dims.nbDims - 1)); + } else { + if (engine_->with_dynamic_shape()) { + layer = TRT_ENGINE_ADD_LAYER(engine_, TopK, *input_tensor, flag, k, + 1 << axis); + } else { + layer = TRT_ENGINE_ADD_LAYER(engine_, TopK, *input_tensor, flag, k, + 1 << (axis - 1)); + } + } + std::vector output_names; + output_names.push_back(op_desc.Output("Out").front()); + output_names.push_back(op_desc.Output("Indices").front()); + + RreplenishLayerAndOutput(layer, "top_k_v2", output_names, test_mode); + } +}; +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(top_k, TopKOpConverter); +REGISTER_TRT_OP_CONVERTER(top_k_v2, TopKv2OpConverter); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index b28fe827156c3..0260c489b5041 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -686,7 +686,7 @@ class TensorRTEngine { // them, and an macro like this is more extensible when underlying TensorRT // library add new layer supports. #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ...) 
\ - engine__->network()->add##layer__(__VA_ARGS__); + engine__->network()->add##layer__(__VA_ARGS__) class TRTEngineManager { public: diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 57ac400dadab3..d9b1e9b85f7e4 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -104,6 +104,8 @@ struct SimpleOpTypeSetTeller : public Teller { "stack", "transpose2", "transpose", + "top_k", + "top_k_v2", "flatten2", "flatten", "gather", @@ -175,6 +177,8 @@ struct SimpleOpTypeSetTeller : public Teller { "stack", "transpose2", "transpose", + "top_k", + "top_k_v2", "flatten2", "flatten", "gather", @@ -1037,15 +1041,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } } - } else { - for (size_t i = 0; i < axes.size(); i++) { - if (starts[i] < 0 || ends[i] < 0) { - VLOG(3) << "Invalid slice attribute 'starts' or 'ends'. " - "Negative starts or ends not supported in TensorRT " - "when running in dynamic shape mode."; - return false; - } - } } } } @@ -1759,6 +1754,34 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "top_k_v2" || op_type == "top_k") { + auto* block = desc.Block(); + auto x_var_name = desc.Input("X")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + if (x_shape.size() == 1) { + VLOG(3) << "top_k/top_k_v2 does not support 1-dimensional input in " + "tensorrt"; + return false; + } + if (desc.HasAttr("axis")) { + int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); + if (axis == 0) { + VLOG(3) << "top_k_v2 does not support axis == 0 in " + "tensorrt"; + return false; + } + } + if (desc.HasAttr("sorted")) { + bool sorted = BOOST_GET_CONST(bool, desc.GetAttr("sorted")); + if (!sorted) { + VLOG(3) << "top_k_v2 does not support results not sorted in " + "tensorrt"; + return false; + } + } + } + #if IS_TRT_VERSION_GE(8000) if (op_type == "sparse_fc" || op_type == "sparse_multihead_matmul") { if (!with_dynamic_shape) { diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 307af84fa367e..8261ce288cb97 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -799,7 +799,7 @@ if(WITH_MKLDNN) if(NOT LINUX) download_quant_data_without_verify(${QUANT2_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf.tar.gz") - endif(NOT LINUX) + endif() download_quant_data_without_verify(${QUANT2_INT8_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf_int8.tar.gz") inference_analysis_api_quant_test_run( @@ -829,7 +829,7 @@ if(WITH_MKLDNN) download_quant_data_without_verify( ${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR} ${QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE}) - endif(NOT LINUX) + endif() set(QUANT2_RESNET50_MODEL ${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR}/ResNet50_qat_channelwise) inference_analysis_api_quant_test_run( @@ -869,10 +869,8 @@ endif() set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128") download_model_and_data_without_verify( ${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz") -if(WITH_GPU) - inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} - analyzer_bert_tester.cc) -endif() +inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} + analyzer_bert_tester.cc) # multiple models prediction set(MMP_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/multi_model_prediction") diff --git 
a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt index 0aee989367e4b..e1ef57e3a136e 100644 --- a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt +++ b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt @@ -22,8 +22,8 @@ macro(safe_set_static_flag) CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) if(${flag_var} MATCHES "/MD") string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") - endif(${flag_var} MATCHES "/MD") - endforeach(flag_var) + endif() + endforeach() endmacro() if(NOT DEFINED PADDLE_LIB) @@ -106,7 +106,7 @@ if(WITH_GPU) endif() endif() message(STATUS "Current CUDA lib path: ${CUDA_LIB}") - endif(NOT WIN32) + endif() endif() if(USE_TENSORRT AND WITH_GPU) @@ -182,9 +182,9 @@ if(WITH_MKL) include_directories("${MKLDNN_PATH}/include") if(WIN32) set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib) - else(WIN32) + else() set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) - endif(WIN32) + endif() endif() else() set(OPENBLAS_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}openblas") @@ -255,7 +255,7 @@ else() cryptopp-static ${EXTERNAL_LIB}) set(DEPS ${DEPS} shlwapi.lib) -endif(NOT WIN32) +endif() if(WITH_GPU) if(NOT WIN32) @@ -302,7 +302,7 @@ if(WITH_GTEST) ${DEMO_NAME} ${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest${CMAKE_STATIC_LIBRARY_SUFFIX} ) - endif(WIN32) + endif() endif() if(WIN32) if("${CMAKE_GENERATOR}" MATCHES "Ninja") diff --git a/paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake b/paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake index 49b0a04197d12..b141b76c6f33b 100644 --- a/paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake +++ b/paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake @@ -10,7 +10,8 @@ set(GTEST_REPOSITORY https://github.com/google/googletest.git) set(GTEST_TAG release-1.8.1) include_directories(${GTEST_INCLUDE_DIR}) if(WIN32) - # if use CMAKE_INSTALL_LIBDIR, the path of lib actually is install/gtest/lib/gtest.lib but GTEST_LIBRARIES + # if use CMAKE_INSTALL_LIBDIR, the path of lib actually is \ + # install/gtest/lib/gtest.lib but GTEST_LIBRARIES # is install/gtest/gtest.lib set(GTEST_LIBRARIES "${GTEST_INSTALL_DIR}/lib/gtest.lib" @@ -25,7 +26,7 @@ else() set(GTEST_MAIN_LIBRARIES "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest_main.a" CACHE FILEPATH "gtest main libraries." 
FORCE) -endif(WIN32) +endif() ExternalProject_Add( extern_gtest PREFIX gtest diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 1f72482eef777..5d1f97c096bdd 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -47,10 +47,8 @@ if(WITH_GPU) if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test) set_tests_properties( stream_safe_cuda_alloc_test - PROPERTIES - ENVIRONMENT - "FLAGS_use_stream_safe_cuda_allocator=true;FLAGS_allocator_strategy=auto_growth" - ) + PROPERTIES ENVIRONMENT "FLAGS_use_stream_safe_cuda_allocator=true; \ + FLAGS_allocator_strategy=auto_growth") endif() endif() diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 109afd06f4df1..e1b14c4bae875 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -261,4 +261,4 @@ if(NOT WIN32) SRCS cuda_ipc_allocator.cc DEPS allocator) endif() -endif(NOT WIN32) +endif() diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index eb0664eb17d35..d2d9ef1ab8fb6 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -1,6 +1,6 @@ include(operators) -# solve "math constants not defined" problems caused by the order of inclusion +# solve "math constants not defined" problems caused by the order of inclusion # of and the definition of macro _USE_MATH_DEFINES add_definitions(-D_USE_MATH_DEFINES) diff --git a/paddle/fluid/operators/activation_op_mlu.cc b/paddle/fluid/operators/activation_op_mlu.cc index 90d0a72074b81..1debfbf4af2a3 100644 --- a/paddle/fluid/operators/activation_op_mlu.cc +++ b/paddle/fluid/operators/activation_op_mlu.cc @@ -145,6 +145,26 @@ class SqrtGradMLUKernel : public framework::OpKernel { } }; +// CNNL_LOG_E = 0, +// CNNL_LOG_2 = 1, +// CNNL_LOG_10 = 2, +template +class LogMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + output->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_desc(*input); + MLUCnnlTensorDesc output_desc(*output); + cnnlComputationPreference_t prefer = CNNL_COMPUTATION_HIGH_PRECISION; + + MLUCnnl::Log(ctx, prefer, Log_base, input_desc.get(), GetBasePtr(input), + output_desc.get(), GetBasePtr(output)); + } +}; + } // namespace operators } // namespace paddle @@ -213,3 +233,16 @@ REGISTER_OP_MLU_KERNEL(sqrt, ops::SqrtMLUKernel, ops::SqrtMLUKernel); REGISTER_OP_MLU_KERNEL(sqrt_grad, ops::SqrtGradMLUKernel, ops::SqrtGradMLUKernel); + +// log log2 log10 +REGISTER_OP_MLU_KERNEL( + log, ops::LogMLUKernel, + ops::LogMLUKernel); + +REGISTER_OP_MLU_KERNEL( + log2, ops::LogMLUKernel, + ops::LogMLUKernel); + +REGISTER_OP_MLU_KERNEL( + log10, ops::LogMLUKernel, + ops::LogMLUKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc index 237cfcc6f1172..48ca1e22df72d 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { @@ -22,6 +23,8 @@ using Tensor = framework::Tensor; template class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const { auto& dev_ctx = ctx.template device_context(); @@ -51,6 +54,7 @@ class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel { } MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc out_desc(*out); MLUCnnl::IsNanInf(ctx, x_desc.get(), GetBasePtr(x), GetBasePtr(&is_finite)); @@ -70,10 +74,34 @@ class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel { // out = in/scale, if found_inf = false // But when found_inf is true, the data of Out should not be used. // So, on MLU, we always compute out with in/scale. - MLUCnnlTensorDesc out_desc(*out); - MLUCnnl::Div(ctx, CNNL_COMPUTATION_HIGH_PRECISION, x_desc.get(), - GetBasePtr(x), scale_desc.get(), GetBasePtr(scale), - out_desc.get(), GetBasePtr(out)); + Tensor float_x; + Tensor float_out; + if (std::is_same::value) { + float_x.Resize(x->dims()); + float_out.Resize(out->dims()); + float_x.mutable_data(ctx.GetPlace()); + float_out.mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc float_x_desc(float_x); + MLUCnnlTensorDesc float_out_desc(float_out); + auto cast_fp16_type = + GetCastDataType(DataType::FLOAT16, DataType::FLOAT32); + MLUCnnl::Cast(ctx, cast_fp16_type, x_desc.get(), GetBasePtr(x), + float_x_desc.get(), GetBasePtr(&float_x)); + + MLUCnnl::Div(ctx, CNNL_COMPUTATION_HIGH_PRECISION, float_x_desc.get(), + GetBasePtr(&float_x), scale_desc.get(), GetBasePtr(scale), + float_out_desc.get(), GetBasePtr(&float_out)); + + auto cast_fp32_type = + GetCastDataType(DataType::FLOAT32, DataType::FLOAT16); + MLUCnnl::Cast(ctx, cast_fp32_type, float_out_desc.get(), + GetBasePtr(&float_out), out_desc.get(), GetBasePtr(out)); + } else { + MLUCnnl::Div(ctx, CNNL_COMPUTATION_HIGH_PRECISION, x_desc.get(), + GetBasePtr(x), scale_desc.get(), GetBasePtr(scale), + out_desc.get(), GetBasePtr(out)); + } } } }; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc index baf742b0b404b..26fa1c9131627 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc @@ -107,9 +107,9 @@ class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("LossScaling", "(Tensor) 1-dim tensor, updated loss scaling."); AddOutput("OutGoodSteps", "(Tensor) 1-dim tensor, pdated good steps."); AddOutput("OutBadSteps", "(Tensor) 1-dim tensor, updated bad steps."); - AddOutput("StopUpdate", - "(Tensor) 1-dim tensor. Stop updating loss scaling, and just " - "zero inputs. It has higher priority than Attr(stop_update).") + AddInput("StopUpdate", + "(Tensor) 1-dim tensor. Stop updating loss scaling, and just " + "zero inputs. 
It has higher priority than Attr(stop_update).") .AsDispensable(); AddAttr("incr_every_n_steps", "A value represents increasing loss scaling every n " diff --git a/paddle/fluid/operators/conv_op_mlu.cc b/paddle/fluid/operators/conv_op_mlu.cc index c1517dbe16f84..b1b39608d624d 100644 --- a/paddle/fluid/operators/conv_op_mlu.cc +++ b/paddle/fluid/operators/conv_op_mlu.cc @@ -238,6 +238,228 @@ class MLUConvGradOpKernel : public framework::OpKernel { } } }; + +template +class MLUDepthwiseConvOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + output->mutable_data(ctx.GetPlace()); + const std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + const std::string padding_algorithm = + ctx.Attr("padding_algorithm"); + const std::string data_format = ctx.Attr("data_format"); + + const bool channel_last = data_format == "NHWC"; + int groups; + + // update padding and dilation + auto in_dims = input->dims(); + auto filter_dims = filter->dims(); + auto in_dims_size = in_dims.size(); + framework::DDim in_data_dims; + framework::DDim filter_data_dims; + + if (channel_last) { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } else { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } + filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + Tensor input_tensor(input->type()); + Tensor output_tensor(output->type()); + const std::vector perm_to_nhwc = {0, 2, 3, 1}; + if (channel_last) { + groups = in_dims[3]; + input_tensor.ShareDataWith(*input); + output_tensor.ShareDataWith(*output); + } else { + // transpose input from NCHW to NHWC + groups = in_dims[1]; + TransposeFromMLUTensor(ctx, perm_to_nhwc, input, &input_tensor, + true /*need_reshape_or_alloc*/); + auto output_dims = output->dims(); + output_tensor.mutable_data( + {output_dims[0], output_dims[2], output_dims[3], output_dims[1]}, + ctx.GetPlace()); + } + input_tensor.set_layout(DataLayout::kNHWC); + output_tensor.set_layout(DataLayout::kNHWC); + + // transpose filter from MCHW to MHWC + Tensor trans_filter(filter->type()); + TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, &trans_filter, + true /*need_reshape_or_alloc*/); + + cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC; + MLUCnnlTensorDesc input_desc(input_tensor, data_layout, + ToCnnlDataType(input_tensor.dtype())); + MLUCnnlTensorDesc filter_desc(trans_filter, data_layout, + ToCnnlDataType(trans_filter.type())); + MLUCnnlTensorDesc output_desc(output_tensor, data_layout, + ToCnnlDataType(output_tensor.dtype())); + + MLUCnnlConvolutionDesc conv_desc(in_dims_size, paddings.data(), + strides.data(), dilations.data(), groups, + ToCnnlDataType()); + + MLUCnnl::ConvolutionForward( + ctx, conv_desc.get(), nullptr /*alpha*/, nullptr /*beta*/, + nullptr /*bias_desc*/, nullptr /*bias_ptr*/, input_desc.get(), + GetBasePtr(&input_tensor), filter_desc.get(), GetBasePtr(&trans_filter), + output_desc.get(), GetBasePtr(&output_tensor)); + + if (!channel_last) { + // transpose output from NHWC to NCHW + const std::vector perm_to_nchw = {0, 3, 1, 2}; + TransposeFromMLUTensor(ctx, perm_to_nchw, &output_tensor, output, + false 
/*need_reshape_or_alloc*/); + } + } +}; + +template +class MLUDepthwiseConvGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto input = ctx.Input("Input"); + auto filter = ctx.Input("Filter"); + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto input_grad = ctx.Output(framework::GradVarName("Input")); + auto filter_grad = ctx.Output(framework::GradVarName("Filter")); + + const std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + const std::string padding_algorithm = + ctx.Attr("padding_algorithm"); + const std::string data_format = ctx.Attr("data_format"); + + const bool channel_last = data_format == "NHWC"; + + // update padding and dilation + auto in_dims = input->dims(); + auto filter_dims = filter->dims(); + auto in_dims_size = in_dims.size(); + framework::DDim in_data_dims; + framework::DDim filter_data_dims; + int groups; + + if (channel_last) { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } else { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } + filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); + + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + Tensor input_tensor(input->type()); + Tensor output_grad_tensor(output_grad->type()); + const std::vector perm_to_nhwc = {0, 2, 3, 1}; + const std::vector perm_to_nchw = {0, 3, 1, 2}; + if (channel_last) { + input_tensor.ShareDataWith(*input); + output_grad_tensor.ShareDataWith(*output_grad); + groups = in_dims[3]; + } else { + groups = in_dims[1]; + // transpose input and output_grad from NCHW to NHWC + TransposeFromMLUTensor(ctx, perm_to_nhwc, input, &input_tensor, + true /*need_reshape_or_alloc*/); + TransposeFromMLUTensor(ctx, perm_to_nhwc, output_grad, + &output_grad_tensor, + true /*need_reshape_or_alloc*/); + } + input_tensor.set_layout(DataLayout::kNHWC); + output_grad_tensor.set_layout(DataLayout::kNHWC); + + if (filter_grad) { + filter_grad->mutable_data(ctx.GetPlace()); + + auto filter_grad_dims = filter_grad->dims(); + Tensor temp_filter_grad(filter_grad->type()); + temp_filter_grad.mutable_data( + {filter_grad_dims[0], filter_grad_dims[2], filter_grad_dims[3], + filter_grad_dims[1]}, + ctx.GetPlace()); + + cnnlDataType_t tensor_dtype = ToCnnlDataType(); + cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC; + MLUCnnlTensorDesc input_desc(input_tensor, data_layout, tensor_dtype); + MLUCnnlTensorDesc out_grad_desc(output_grad_tensor, data_layout, + tensor_dtype); + MLUCnnlTensorDesc temp_filter_grad_desc(temp_filter_grad, data_layout, + tensor_dtype); + + MLUCnnlConvolutionDesc conv_desc(in_dims_size, paddings.data(), + strides.data(), dilations.data(), groups, + tensor_dtype); + + MLUCnnl::ConvBackpropFilter( + ctx, conv_desc.get(), input_desc.get(), GetBasePtr(&input_tensor), + out_grad_desc.get(), GetBasePtr(&output_grad_tensor), + temp_filter_grad_desc.get(), GetBasePtr(&temp_filter_grad)); + + // transpose filter_grad from MHWC to MCHW + TransposeFromMLUTensor(ctx, perm_to_nchw, &temp_filter_grad, + filter_grad, false /*need_reshape_or_alloc*/); + } + if (input_grad) { + input_grad->mutable_data(ctx.GetPlace()); + + Tensor input_grad_tensor(input_grad->type()); + if (channel_last) { + input_grad_tensor.ShareDataWith(*input_grad); + } else { + auto input_grad_dims = 
input_grad->dims(); + input_grad_tensor.mutable_data( + {input_grad_dims[0], input_grad_dims[2], input_grad_dims[3], + input_grad_dims[1]}, + ctx.GetPlace()); + } + input_grad_tensor.set_layout(DataLayout::kNHWC); + + // transpose filter from MCHW to MHWC + Tensor trans_filter(filter->type()); + TransposeFromMLUTensor(ctx, perm_to_nhwc, filter, &trans_filter, + true /*need_reshape_or_alloc*/); + + cnnlDataType_t tensor_dtype = ToCnnlDataType(); + cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC; + MLUCnnlTensorDesc filter_desc(trans_filter, data_layout, tensor_dtype); + MLUCnnlTensorDesc out_grad_desc(output_grad_tensor, data_layout, + tensor_dtype); + MLUCnnlTensorDesc in_grad_desc(input_grad_tensor, data_layout, + tensor_dtype); + + MLUCnnlConvolutionDesc conv_desc(in_dims_size, paddings.data(), + strides.data(), dilations.data(), groups, + tensor_dtype); + + MLUCnnl::ConvBackpropInput( + ctx, conv_desc.get(), filter_desc.get(), GetBasePtr(&trans_filter), + out_grad_desc.get(), GetBasePtr(&output_grad_tensor), + in_grad_desc.get(), GetBasePtr(&input_grad_tensor)); + + if (!channel_last) { + // transpose input_grad from NHWC to NCHW + TransposeFromMLUTensor(ctx, perm_to_nchw, &input_grad_tensor, + input_grad, false /*need_reshape_or_alloc*/); + } + } + } +}; } // namespace operators } // namespace paddle @@ -249,3 +471,10 @@ REGISTER_OP_MLU_KERNEL(conv2d, ops::MLUConvOpKernel, REGISTER_OP_MLU_KERNEL(conv2d_grad, ops::MLUConvGradOpKernel, ops::MLUConvGradOpKernel); + +REGISTER_OP_MLU_KERNEL(depthwise_conv2d, ops::MLUDepthwiseConvOpKernel, + ops::MLUDepthwiseConvOpKernel); + +REGISTER_OP_MLU_KERNEL(depthwise_conv2d_grad, + ops::MLUDepthwiseConvGradOpKernel, + ops::MLUDepthwiseConvGradOpKernel); diff --git a/paddle/fluid/operators/detection/locality_aware_nms_op.cc b/paddle/fluid/operators/detection/locality_aware_nms_op.cc index 3f8bc8674186d..bbeacd0eb5ff0 100644 --- a/paddle/fluid/operators/detection/locality_aware_nms_op.cc +++ b/paddle/fluid/operators/detection/locality_aware_nms_op.cc @@ -316,7 +316,7 @@ class LocalityAwareNMSKernel : public framework::OpKernel { auto* boxes_input = ctx.Input("BBoxes"); auto* scores_input = ctx.Input("Scores"); auto* outs = ctx.Output("Out"); - auto score_dims = scores_input->dims(); + auto& score_dims = scores_input->dims(); auto score_size = score_dims.size(); auto& dev_ctx = ctx.template device_context(); diff --git a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc index 4e49a6ed8521e..a5d22149eca22 100644 --- a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc +++ b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc @@ -471,7 +471,7 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel { std::vector box_per_batch_list(boxes_list.size()); std::vector score_per_batch_list(scores_list.size()); for (size_t j = 0; j < boxes_list.size(); ++j) { - auto score_dims = scores_list[j].dims(); + const auto& score_dims = scores_list[j].dims(); score_per_batch_list[j] = scores_list[j].Slice(i, i + 1); score_per_batch_list[j].Resize({score_dims[1], score_dims[2]}); box_per_batch_list[j] = boxes_list[j].Slice(i, i + 1); diff --git a/paddle/fluid/operators/einsum_op.cc b/paddle/fluid/operators/einsum_op.cc index 7fc19d6913f83..95f841f7797b9 100644 --- a/paddle/fluid/operators/einsum_op.cc +++ b/paddle/fluid/operators/einsum_op.cc @@ -41,6 +41,10 @@ class EinsumOpMaker : public framework::OpProtoAndCheckerMaker { .AsExtra() 
.AsIntermediate(); + AddOutput("XShape", "(Tensor), The cache of the x_shape of: A and B.") + .AsDuplicable() + .AsExtra() + .AsIntermediate(); AddAttr("equation", "(string) A einsum equation. such as `ij,jk->ik`" "There must have `->` and the number of operands in " @@ -59,8 +63,8 @@ class EinsumGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { auto x_name = "Operands"; auto x_grad_name = framework::GradVarName(x_name); - ctx->SetOutputsDim(x_grad_name, ctx->GetInputsDim(x_name)); - ctx->ShareAllLoD(x_name, x_grad_name); + ctx->SetOutputsDim(x_grad_name, ctx->GetInputsDim("Operands")); + ctx->ShareAllLoD("Operands", x_grad_name); } protected: @@ -79,8 +83,15 @@ class EinsumGradMaker : public framework::SingleGradOpMaker { void Apply(GradOpPtr retv) const override { retv->SetType("einsum_grad"); - retv->SetInput("Operands", this->Input("Operands")); - retv->SetInput("InnerCache", this->Output("InnerCache")); + if (this->HasOutput("InnerCache")) { + retv->SetInput("InnerCache", this->Output("InnerCache")); + } + if (this->HasOutput("XShape")) { + // add if for compatibility. + retv->SetInput("Operands", this->Output("XShape")); // for memory save. + } else { + retv->SetInput("Operands", this->Input("Operands")); + } retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); retv->SetAttrMap(this->Attrs()); retv->SetOutput(framework::GradVarName("Operands"), diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_max_op_mlu.cc new file mode 100644 index 0000000000000..3ff93c49a3603 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_max_op_mlu.cc @@ -0,0 +1,37 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_MLU + +#include "paddle/fluid/operators/elementwise/elementwise_mlu.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +namespace paddle { +namespace operators { + +template +class ElementwiseMaxMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + MLUBinaryOp(ctx); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_MLU_KERNEL(elementwise_max, ops::ElementwiseMaxMLUKernel, + ops::ElementwiseMaxMLUKernel, + ops::ElementwiseMaxMLUKernel); +#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_mlu.h b/paddle/fluid/operators/elementwise/elementwise_mlu.h index 8c230c5f47bf6..a6a153c34d47f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mlu.h +++ b/paddle/fluid/operators/elementwise/elementwise_mlu.h @@ -108,6 +108,7 @@ void MLUOpTensorKernel(const framework::ExecutionContext& ctx, enum BINARY_FUNCTOR { DIV, DIVNONAN, + MAXIMUM, }; template @@ -126,6 +127,16 @@ inline void MLUBinary
(const framework::ExecutionContext& ctx, MLUCnnl::Div(ctx, prefer, x_desc, x, y_desc, y, out_desc, out); } +template <> +inline void MLUBinary( + const framework::ExecutionContext& ctx, + cnnlComputationPreference_t prefer, // useless, only for compatible + const cnnlTensorDescriptor_t x_desc, const void* x, + const cnnlTensorDescriptor_t y_desc, const void* y, + const cnnlTensorDescriptor_t out_desc, void* out) { + MLUCnnl::Maximum(ctx, x_desc, x, y_desc, y, out_desc, out); +} + template void MLUBinaryOp(const framework::ExecutionContext& ctx) { auto* x = ctx.Input("X"); diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index 814827d95b6bd..01c5b79fff115 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -16,29 +16,25 @@ limitations under the License. */ // https://github.com/NVIDIA/FasterTransformer/blob/v4.0/fastertransformer/cuda/masked_multihead_attention.cu // We add License in the head. -// headers sort by clang-format may cause compiling error or test faiure, -// see https://github.com/PaddlePaddle/Paddle/pull/42840/ -// clang-format off #include #include + #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" - -#include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/fluid/operators/fused/attention_layer_norm.h" #include "paddle/fluid/operators/fused/attn_gemm.h" #include "paddle/fluid/operators/fused/fmha_ref.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif -// clang-format on namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index 18caed22b4855..f90bffe9df836 100644 --- a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -38,7 +38,8 @@ inline std::vector get_new_shape( "The shape of dimension tensor should be [1]," "but received d%.", tensor->dims())); - if (platform::is_gpu_place(tensor->place())) { + if (platform::is_gpu_place(tensor->place()) || + platform::is_mlu_place(tensor->place())) { framework::Tensor temp; paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_new_shape.push_back(static_cast(*temp.data())); @@ -55,7 +56,8 @@ inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { std::vector vec_new_data; auto* new_data = new_data_tensor->data(); framework::Tensor cpu_starts_tensor; - if (platform::is_gpu_place(new_data_tensor->place())) { + if (platform::is_gpu_place(new_data_tensor->place()) || + platform::is_mlu_place(new_data_tensor->place())) { paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor); new_data = cpu_starts_tensor.data(); diff --git a/paddle/fluid/operators/interpolate_v2_op_mlu.cc b/paddle/fluid/operators/interpolate_v2_op_mlu.cc new file mode 100644 index 0000000000000..9977337a395c6 --- /dev/null +++ 
b/paddle/fluid/operators/interpolate_v2_op_mlu.cc @@ -0,0 +1,488 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/interpolate_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/utils.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using DataLayout = framework::DataLayout; + +inline std::vector get_new_shape_mlu( + const std::vector& list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + PADDLE_ENFORCE_EQ( + tensor->dims(), phi::make_ddim({1}), + platform::errors::InvalidArgument("shape of dim tensor should be [1]")); + framework::Tensor temp; + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); + vec_new_shape.push_back(static_cast(*temp.data())); + } + + return vec_new_shape; +} + +template +class InterpolateV2MLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + + auto input_dims = input->dims(); + PADDLE_ENFORCE_GE( + input_dims.size(), 4, + platform::errors::External("MLU Interpolate kernel supports input " + "rank greater than or equal to 4.")); + PADDLE_ENFORCE_LE( + input_dims.size(), 5, + platform::errors::External("MLU Interpolate kernel supports input " + "rank less than or equal to 5.")); + + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input_dims, data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + int align_center = align_corners ? 0 : (align_mode == 1 ?
0 : 1); + + int out_d = ctx.Attr("out_d"); + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + float scale_d = -1; + float scale_h = -1; + float scale_w = -1; + + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape_mlu(list_new_size_tensor); + if (new_size.size() <= 2) { + // default NCHW + out_h = new_size[0]; + out_w = new_size[1]; + } else { + // rank of input is 5, HCDHW + out_d = new_size[0]; + out_h = new_size[1]; + out_w = new_size[2]; + } + } else { + auto scale_tensor = ctx.Input("Scale"); + auto scale = ctx.Attr>("scale"); + if (scale_tensor != nullptr) { + std::vector scale_data; + scale_data = GetDataFromTensor(scale_tensor); + + if (scale_data.size() > 1 && scale_data.size() <= 2) { + scale_h = scale_data[0]; + scale_w = scale_data[1]; + } else if (scale_data.size() > 2) { + scale_d = scale_data[0]; + scale_h = scale_data[1]; + scale_w = scale_data[2]; + } else { + scale_d = scale_data[0]; + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } else { + if (scale.size() > 1 && scale.size() <= 2) { + scale_h = scale[0]; + scale_w = scale[1]; + + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } else if (scale.size() > 2) { + scale_d = scale[0]; + scale_h = scale[1]; + scale_w = scale[2]; + PADDLE_ENFORCE_EQ( + scale_d > 0 && scale_w > 0 && scale_h > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } + } + if (scale_h > 0. && scale_w > 0.) { + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + + if (scale_d > 0.) { + out_d = static_cast(in_d * scale_d); + } + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + std::vector out_size_data; + out_size_data = GetDataFromTensor(out_size); + if (out_size_data.size() <= 2) { + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } else { + out_d = out_size_data[0]; + out_h = out_size_data[1]; + out_w = out_size_data[2]; + } + } + } + PADDLE_ENFORCE_GT( + out_h, 0, + platform::errors::InvalidArgument("out_h in Attr(out_shape) of " + "Op(interpolate) " + "should be greater than 0.")); + PADDLE_ENFORCE_GT( + out_w, 0, + platform::errors::InvalidArgument("out_w in Attr(out_shape) of " + "Op(interpolate) " + "should be greater than 0.")); + + // do transpose according to cnnl's constraints + // cnnlInterp_v2 only accepts NHWC when mode is CNNL_INTERP_BILINEAR and + // CNNL_INTERP_NEAREST, + framework::DDim dim_in, dim_in_trans, dim_out, dim_out_trans; + Tensor transformed_input, transformed_output; + bool need_transpose = input_dims.size() != 2; + if (input_dims.size() == 4) { + // need to do transpose if layout is kNCHW + need_transpose &= data_layout == DataLayout::kNCHW; + if (need_transpose) { + // if need_transpose, do the following + // 1. transpose input NCHW -> NHWC + // 2. interpolation in(NHWC) -> out(NHWC) + // 3. 
transpose output NHWC -> HCHW + // dim_in = {n, c, in_h, in_w}; + dim_in_trans = {n, in_h, in_w, c}; + dim_out = {n, c, out_h, out_w}; + dim_out_trans = {n, out_h, out_w, c}; + output->mutable_data(dim_out, ctx.GetPlace()); + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(*input, ctx.GetPlace(), output); + return; + } + // do transpose on input tensor, then do interpolation + MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_NCHW, + ToCnnlDataType(input->dtype())); + + transformed_input = + ctx.AllocateTmpTensor(dim_in_trans, dev_ctx); + transformed_output = + ctx.AllocateTmpTensor(dim_out_trans, dev_ctx); + + MLUCnnlTensorDesc input_reshaped_desc( + transformed_input, CNNL_LAYOUT_NHWC, + ToCnnlDataType(transformed_input.dtype())); + const std::vector perm = {0, 2, 3, 1}; + MLUCnnl::Transpose(ctx, perm, input_dims.size(), input_desc.get(), + GetBasePtr(input), input_reshaped_desc.get(), + GetBasePtr(&transformed_input)); + } else { + // if no need_transpose, do the following + // 1. interpolation in(NHWC) -> out(NHWC) + // dim_in = {n, in_h, in_w, c}; + dim_out = {n, out_h, out_w, c}; + output->mutable_data(dim_out, ctx.GetPlace()); + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(*input, ctx.GetPlace(), output); + return; + } + transformed_input = *input; + transformed_output = *output; + } + + MLUCnnlTensorDesc input_desc(transformed_input, CNNL_LAYOUT_NHWC, + ToCnnlDataType(transformed_input.dtype())); + MLUCnnlTensorDesc output_desc(transformed_output, CNNL_LAYOUT_NHWC, + ToCnnlDataType(transformed_output.dtype())); + MLUCnnl::Interp(ctx, GetMLUCnnlInterpMode(interp_method), align_corners, + align_center, input_desc.get(), + GetBasePtr(&transformed_input), output_desc.get(), + GetBasePtr(&transformed_output)); + + if (need_transpose) { + // if need_transpose, reshape output back to NCHW + const std::vector perm = {0, 3, 1, 2}; + MLUCnnlTensorDesc output_reshape_desc(*output, CNNL_LAYOUT_NCHW, + ToCnnlDataType(output->dtype())); + MLUCnnl::Transpose(ctx, perm, dim_out_trans.size(), output_desc.get(), + GetBasePtr(&transformed_output), + output_reshape_desc.get(), GetBasePtr(output)); + } + } else { + PADDLE_ENFORCE_EQ( + interp_method, "trilinear", + platform::errors::External("MLU Interpolate kernel only supports 5D " + "data in trilinear mode.")); + + // need to do transpose if layout is kNCDHW + need_transpose &= data_layout == DataLayout::kNCHW; + if (need_transpose) { + // if need_transpose, do the following + // 1. transpose input NCDHW -> NDHWC + // 2. interpolation in(NDHWC) -> out(NDHWC) + // 3. 
transpose output NDHWC -> HCDHW + // dim_in = {n, c, in_d, in_h, in_w}; + dim_in_trans = {n, in_d, in_h, in_w, c}; + dim_out = {n, c, out_d, out_h, out_w}; + dim_out_trans = {n, out_d, out_h, out_w, c}; + output->mutable_data(dim_out, ctx.GetPlace()); + + if (in_h == out_h && in_w == out_w && in_d == out_d) { + framework::TensorCopy(*input, ctx.GetPlace(), output); + return; + } + // do transpose on input tensor (HCDHW -> NDHWC), then do interpolation + MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_NCDHW, + ToCnnlDataType(input->dtype())); + + transformed_input = + ctx.AllocateTmpTensor(dim_in_trans, dev_ctx); + transformed_output = + ctx.AllocateTmpTensor(dim_out_trans, dev_ctx); + + MLUCnnlTensorDesc input_reshaped_desc( + transformed_input, CNNL_LAYOUT_NDHWC, + ToCnnlDataType(transformed_input.dtype())); + const std::vector perm = {0, 2, 3, 4, 1}; + MLUCnnl::Transpose(ctx, perm, input_dims.size(), input_desc.get(), + GetBasePtr(input), input_reshaped_desc.get(), + GetBasePtr(&transformed_input)); + } else { + // if no need_transpose, do the following + // 1. interpolation in(NDHWC) -> out(NDHWC) + // dim_in = {n, in_d, in_h, in_w, c}; + dim_out = {n, out_d, out_h, out_w, c}; + output->mutable_data(dim_out, ctx.GetPlace()); + + if (in_h == out_h && in_w == out_w && in_d == out_d) { + framework::TensorCopy(*input, ctx.GetPlace(), output); + return; + } + transformed_input = *input; + transformed_output = *output; + } + + MLUCnnlTensorDesc input_desc(transformed_input, CNNL_LAYOUT_NDHWC, + ToCnnlDataType(transformed_input.dtype())); + MLUCnnlTensorDesc output_desc(transformed_output, CNNL_LAYOUT_NDHWC, + ToCnnlDataType(transformed_output.dtype())); + // use trilinear mode in HCDHW layout + MLUCnnl::Interp(ctx, GetMLUCnnlInterpMode(interp_method), align_corners, + align_center, input_desc.get(), + GetBasePtr(&transformed_input), output_desc.get(), + GetBasePtr(&transformed_output)); + + if (need_transpose) { + // if need_transpose, reshape output back (NDHWC -> NCDHW) + const std::vector perm = {0, 4, 1, 2, 3}; + MLUCnnlTensorDesc output_reshape_desc(*output, CNNL_LAYOUT_NCDHW, + ToCnnlDataType(output->dtype())); + MLUCnnl::Transpose(ctx, perm, dim_out_trans.size(), output_desc.get(), + GetBasePtr(&transformed_output), + output_reshape_desc.get(), GetBasePtr(output)); + } + } + } +}; + +template +class InterpolateV2GradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* output_grad = ctx.Input(framework::GradVarName("Out")); + + auto output_grad_dims = output_grad->dims(); + + PADDLE_ENFORCE_EQ(output_grad_dims.size(), 4, + platform::errors::External( + "XPU Interpolategrad kernel only support 2d")); + + auto* input = ctx.Input("X"); + auto input_dims = input->dims(); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + int align_center = align_corners ? 0 : (align_mode == 0 ? 
0 : 1); + align_center = 0; + + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + float scale_h = -1; + float scale_w = -1; + + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape_mlu(list_new_size_tensor); + out_h = new_size[0]; + out_w = new_size[1]; + } else { + auto scale_tensor = ctx.Input("Scale"); + auto scale = ctx.Attr>("scale"); + if (scale_tensor != nullptr) { + std::vector scale_data; + scale_data = GetDataFromTensor(scale_tensor); + if (scale_data.size() > 1) { + scale_h = scale_data[0]; + scale_w = scale_data[1]; + } else { + scale_h = scale_data[0]; + scale_w = scale_data[0]; + } + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } else { + if (scale.size() > 1) { + scale_h = scale[0]; + scale_w = scale[1]; + + PADDLE_ENFORCE_EQ( + scale_w > 0 && scale_h > 0, true, + platform::errors::InvalidArgument("scale of Op(interpolate) " + "should be greater than 0.")); + } + } + if (scale_h > 0. && scale_w > 0.) { + out_h = static_cast(in_h * scale_h); + out_w = static_cast(in_w * scale_w); + } + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + std::vector out_size_data; + out_size_data = GetDataFromTensor(out_size); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } + } + + framework::DDim dim_grad; + framework::DDim dim_out_grad, dim_out_trans_grad, dim_in_grad, + dim_in_trans_grad; + Tensor transformed_output_grad, transformed_input_grad; + bool need_transpose = + input_dims.size() != 2 && data_layout == DataLayout::kNCHW; + + if (need_transpose) { + // if need_transpose, do the following + // 1. transpose output_grad NCHW -> NHWC + // 2. InterpBackward output_grad(NHWC) -> input_grad(NHWC) + // 3. transpose input_grad NHWC -> HCHW + // dim_out_grad = {n, c, out_h, out_w}; + dim_out_trans_grad = {n, out_h, out_w, c}; + dim_in_grad = {n, c, in_h, in_w}; + dim_in_trans_grad = {n, in_h, in_w, c}; + input_grad->mutable_data(dim_in_grad, ctx.GetPlace()); + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad); + return; + } + // do transpose on input tensor, then do interpolation + MLUCnnlTensorDesc input_desc(*output_grad, CNNL_LAYOUT_NCHW, + ToCnnlDataType(output_grad->dtype())); + + transformed_output_grad = ctx.AllocateTmpTensor( + dim_out_trans_grad, dev_ctx); + transformed_input_grad = ctx.AllocateTmpTensor( + dim_in_trans_grad, dev_ctx); + + MLUCnnlTensorDesc input_reshaped_desc( + transformed_output_grad, CNNL_LAYOUT_NHWC, + ToCnnlDataType(transformed_output_grad.dtype())); + const std::vector perm = {0, 2, 3, 1}; + MLUCnnl::Transpose(ctx, perm, input_dims.size(), input_desc.get(), + GetBasePtr(output_grad), input_reshaped_desc.get(), + GetBasePtr(&transformed_output_grad)); + } else { + // if no need_transpose, do the following + // 1. 
InterpBackward output_grad(NHWC) -> input_grad(NHWC) + dim_in_grad = {n, in_h, in_w, c}; + input_grad->mutable_data(dim_in_grad, ctx.GetPlace()); + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad); + return; + } + transformed_output_grad = *output_grad; + transformed_input_grad = *input_grad; + } + + MLUCnnlTensorDesc input_desc( + transformed_output_grad, CNNL_LAYOUT_NHWC, + ToCnnlDataType(transformed_output_grad.dtype())); + MLUCnnlTensorDesc output_desc( + transformed_input_grad, CNNL_LAYOUT_NHWC, + ToCnnlDataType(transformed_input_grad.dtype())); + MLUCnnl::InterpBackward( + ctx, GetMLUCnnlInterpBackwardMode(interp_method), align_corners, + align_center, input_desc.get(), GetBasePtr(&transformed_output_grad), + output_desc.get(), GetBasePtr(&transformed_input_grad)); + + if (need_transpose) { + const std::vector perm = {0, 3, 1, 2}; + MLUCnnlTensorDesc output_reshape_desc( + *input_grad, CNNL_LAYOUT_NCHW, ToCnnlDataType(input_grad->dtype())); + MLUCnnl::Transpose(ctx, perm, dim_in_trans_grad.size(), output_desc.get(), + GetBasePtr(&transformed_input_grad), + output_reshape_desc.get(), GetBasePtr(input_grad)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(bilinear_interp_v2, ops::InterpolateV2MLUKernel, + ops::InterpolateV2MLUKernel); +REGISTER_OP_MLU_KERNEL(nearest_interp_v2, ops::InterpolateV2MLUKernel, + ops::InterpolateV2MLUKernel); + +REGISTER_OP_MLU_KERNEL(nearest_interp_v2_grad, + ops::InterpolateV2GradMLUKernel, + ops::InterpolateV2GradMLUKernel); +REGISTER_OP_MLU_KERNEL(bilinear_interp_v2_grad, + ops::InterpolateV2GradMLUKernel, + ops::InterpolateV2GradMLUKernel); diff --git a/paddle/fluid/operators/layer_norm_op_mlu.cc b/paddle/fluid/operators/layer_norm_op_mlu.cc index a368af86a3da6..919358febd2eb 100644 --- a/paddle/fluid/operators/layer_norm_op_mlu.cc +++ b/paddle/fluid/operators/layer_norm_op_mlu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
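The new MLU interpolate kernel works by transposing NCHW tensors to NHWC (perm {0, 2, 3, 1}) before the cnnl interpolation call and transposing the result back (perm {0, 3, 1, 2}) afterwards, since cnnlInterp_v2 only accepts NHWC. A minimal CPU-only sketch of that layout conversion follows; the helper names are illustrative and not part of this patch, and the buffers are assumed to hold n * c * h * w contiguous elements.

#include <vector>

// Illustrative only: apply the permutation {0, 2, 3, 1} (NCHW -> NHWC)
// used before the interpolation call in the kernel above.
std::vector<float> TransposeNCHWToNHWC(const std::vector<float>& in,
                                       int n, int c, int h, int w) {
  std::vector<float> out(in.size());
  for (int ni = 0; ni < n; ++ni)
    for (int ci = 0; ci < c; ++ci)
      for (int hi = 0; hi < h; ++hi)
        for (int wi = 0; wi < w; ++wi)
          out[((ni * h + hi) * w + wi) * c + ci] =
              in[((ni * c + ci) * h + hi) * w + wi];
  return out;
}

// The inverse permutation {0, 3, 1, 2} (NHWC -> NCHW) restores the
// original layout after interpolation.
std::vector<float> TransposeNHWCToNCHW(const std::vector<float>& in,
                                       int n, int c, int h, int w) {
  std::vector<float> out(in.size());
  for (int ni = 0; ni < n; ++ni)
    for (int hi = 0; hi < h; ++hi)
      for (int wi = 0; wi < w; ++wi)
        for (int ci = 0; ci < c; ++ci)
          out[((ni * c + ci) * h + hi) * w + wi] =
              in[((ni * h + hi) * w + wi) * c + ci];
  return out;
}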
*/ #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { @@ -122,6 +123,8 @@ class LayerNormMLUKernel : public framework::OpKernel { template class LayerNormGradMLUKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const override { const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); @@ -207,14 +210,14 @@ class LayerNormGradMLUKernel : public framework::OpKernel { if (dscale && (tmp_dscale.dtype() == DataType::FLOAT16 && dscale->dtype() == DataType::FLOAT32)) { - dscale->mutable_data(place); + dscale->mutable_data(place); MLUCnnl::Cast(ctx, cast_fp16_to_fp32, float16_desc.get(), GetBasePtr(&tmp_dscale), float32_desc.get(), GetBasePtr(dscale)); } if (dbias && (tmp_dbias.dtype() == DataType::FLOAT16 && dbias->dtype() == DataType::FLOAT32)) { - dbias->mutable_data(place); + dbias->mutable_data(place); MLUCnnl::Cast(ctx, cast_fp16_to_fp32, float16_desc.get(), GetBasePtr(&tmp_dbias), float32_desc.get(), GetBasePtr(dbias)); diff --git a/paddle/fluid/operators/lookup_table_v2_op_mlu.cc b/paddle/fluid/operators/lookup_table_v2_op_mlu.cc index c8ab269c023a5..b69a52c761d4a 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_mlu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_mlu.cc @@ -18,7 +18,6 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -constexpr int64_t kNoPadding = -1; template class LookupTableV2MLUKernel : public framework::OpKernel { @@ -27,6 +26,7 @@ class LookupTableV2MLUKernel : public framework::OpKernel { auto *ids_t = ctx.Input("Ids"); // int tensor auto *output_t = ctx.Output("Out"); // float tensor auto *table_t = ctx.Input("W"); + int padding_idx = static_cast(ctx.Attr("padding_idx")); auto *table_var = ctx.InputVar("W"); PADDLE_ENFORCE_EQ( @@ -38,43 +38,10 @@ class LookupTableV2MLUKernel : public framework::OpKernel { MLUCnnlTensorDesc table_desc(*table_t); MLUCnnlTensorDesc output_desc(*output_t); - int64_t padding_idx = ctx.Attr("padding_idx"); - if (padding_idx == kNoPadding) { - MLUCnnl::GatherFunctor(ctx, /*axis=*/0, /*batch_dims=*/0, - table_desc.get(), GetBasePtr(table_t), - ids_desc.get(), GetBasePtr(ids_t), - output_desc.get(), GetBasePtr(output_t)); - } else { - Tensor tmp_table_t(table_t->type()); - tmp_table_t.mutable_data(table_t->dims(), ctx.GetPlace()); - - Tensor index; - index.mutable_data({1, 1}, ctx.GetPlace()); - auto idx_value = static_cast(padding_idx); - MLUCnnlTensorDesc index_desc(index); - MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &idx_value, index_desc.get(), - GetBasePtr(&index)); - - auto update_dim = phi::make_ddim({1, table_t->dims()[1]}); - Tensor update; - update.mutable_data(update_dim, ctx.GetPlace()); - - auto update_value = static_cast(0); - MLUCnnlTensorDesc update_desc(update); - MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &update_value, - update_desc.get(), GetBasePtr(&update)); - - MLUCnnlTensorDesc tmp_table_desc(tmp_table_t); - MLUCnnl::ScatterNd( - ctx, CNNL_SCATTERND_UPDATE, index_desc.get(), GetBasePtr(&index), - update_desc.get(), GetBasePtr(&update), table_desc.get(), - GetBasePtr(table_t), tmp_table_desc.get(), GetBasePtr(&tmp_table_t)); - - MLUCnnl::GatherFunctor(ctx, /*axis=*/0, /*batch_dims=*/0, - tmp_table_desc.get(), GetBasePtr(&tmp_table_t), - ids_desc.get(), GetBasePtr(ids_t), - output_desc.get(), GetBasePtr(output_t)); - } + MLUCnnl::EmbeddingForward(ctx, 
padding_idx, table_desc.get(), + GetBasePtr(table_t), ids_desc.get(), + static_cast(GetBasePtr(ids_t)), + output_desc.get(), GetBasePtr(output_t)); } }; @@ -82,6 +49,16 @@ template class LookupTableV2GradMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { + auto *table_var = ctx.InputVar("W"); + PADDLE_ENFORCE_EQ(table_var->IsType(), true, + platform::errors::PermissionDenied( + "Unsupported Variable Type, idx in " + "LookupTableV2GradMLUKernel should be LoDTensor.")); + bool is_sparse = ctx.Attr("is_sparse"); + PADDLE_ENFORCE_EQ( + is_sparse, false, + platform::errors::InvalidArgument( + "LookupTableV2GradMLUKernel does NOT support is_sparse = True.")); auto *ids_t = ctx.Input("Ids"); + auto *output_grad_t = ctx.Input(framework::GradVarName("Out")); @@ -91,6 +68,13 @@ class LookupTableV2GradMLUKernel : public framework::OpKernel { int padding_idx = static_cast(ctx.Attr("padding_idx")); + int64_t ids_numel = ids_t->numel(); + PADDLE_ENFORCE_EQ( + ids_numel <= std::numeric_limits::max(), true, + platform::errors::OutOfRange( + "Number of ids is greater than int32_t::max, please check " + "number of ids in LookupTableV2GradMLUKernel.")); + Tensor ids_int32(ids_t->dtype()); if (ids_t->dtype() != DataType::INT32) { ids_int32.mutable_data(ids_t->dims(), ctx.GetPlace()); @@ -125,5 +109,4 @@ REGISTER_OP_MLU_KERNEL(lookup_table_v2, ops::LookupTableV2MLUKernel, REGISTER_OP_MLU_KERNEL(lookup_table_v2_grad, ops::LookupTableV2GradMLUKernel, - ops::LookupTableV2GradMLUKernel, ops::LookupTableV2GradMLUKernel); diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 33da631d27b14..bb3797d268291 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -262,7 +262,7 @@ class SoftmaxFunctor> { public: void operator()(const DeviceContext& context, const int axis_dim, const framework::Tensor* X, framework::Tensor* Y) { - auto in_dims = X->dims(); + const auto& in_dims = X->dims(); const float* in_data = X->data(); float* out_data = Y->data(); const int kBatchDim = 0; @@ -387,7 +387,7 @@ class SoftmaxGradFunctor> { void operator()(const DeviceContext& context, const int axis_dim, const framework::Tensor* y, const framework::Tensor* y_grad, framework::Tensor* x_grad) { - auto out_dims = y->dims(); + const auto& out_dims = y->dims(); constexpr int kBatchDim = 0; constexpr int kClassDim = 1; const int num_classes = out_dims[kClassDim]; diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index dc8301b9e0b8d..d5b843d47afb7 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -901,14 +901,11 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { cnnlAddN(handle, inputs_desc, inputs, input_num, output_desc, output)); } -/* static */ void MLUCnnl::Log(const ExecutionContext& ctx, - cnnlComputationPreference_t prefer, - const cnnlTensorDescriptor_t input_desc, - const void* input, - const cnnlTensorDescriptor_t output_desc, - void* output) { +/* static */ void MLUCnnl::Log( + const ExecutionContext& ctx, cnnlComputationPreference_t prefer, + cnnlLogBase_t log_base, const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t output_desc, void* output) { cnnlHandle_t handle = GetHandleFromCTX(ctx); - cnnlLogBase_t log_base = CNNL_LOG_E; PADDLE_ENFORCE_MLU_SUCCESS(cnnlLog_v2(handle, prefer, log_base, input_desc, input, output_desc, output)); @@
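For reference, the removed Fill/ScatterNd/GatherFunctor sequence in the lookup_table_v2 forward emulated an embedding lookup whose padding_idx row reads back as zeros; the single EmbeddingForward call added above is expected to provide that behaviour directly. A rough CPU sketch of those semantics, with hypothetical names and ids assumed to be within range:

#include <vector>

// Illustrative CPU reference: embedding lookup with a padding index.
// padding_idx < 0 plays the role of the removed kNoPadding (-1) case.
std::vector<std::vector<float>> EmbeddingForwardRef(
    const std::vector<std::vector<float>>& table,  // [vocab][embedding_dim]
    const std::vector<int>& ids, int padding_idx) {
  const size_t width = table.empty() ? 0 : table[0].size();
  std::vector<std::vector<float>> out;
  out.reserve(ids.size());
  for (int id : ids) {
    if (padding_idx >= 0 && id == padding_idx) {
      out.emplace_back(width, 0.0f);  // the padding row reads back as zeros
    } else {
      out.push_back(table[id]);       // every other row is a plain gather
    }
  }
  return out;
}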
-1925,9 +1922,9 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { const cnnlTensorDescriptor_t output_desc, void* output) { cnnlHandle_t handle = GetHandleFromCTX(ctx); - PADDLE_ENFORCE_MLU_SUCCESS( - cnnlInterpBackward(handle, align_corners, half_pixel_centers, mode, - input_desc, input, output_desc, output)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlInterpBackward_v2( + handle, align_corners, half_pixel_centers, mode, NULL, true, input_desc, + input, output_desc, output)); } /* static */ void MLUCnnl::Cast(const ExecutionContext& ctx, @@ -2802,6 +2799,18 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { cnnlReciprocal(handle, input_desc, input, output_desc, output)); } +/* static */ void MLUCnnl::EmbeddingForward( + const ExecutionContext& ctx, const int padding_idx, + const cnnlTensorDescriptor_t weight_desc, const void* weight, + const cnnlTensorDescriptor_t indices_desc, const int* indices, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlEmbeddingForward_v2( + handle, weight_desc, weight, indices_desc, indices, padding_idx, + nullptr /*max_norm*/, nullptr /*norm_type*/, output_desc, output)); +} + /* static */ void MLUCnnl::EmbeddingBackward( const ExecutionContext& ctx, int padding_idx, bool scale_grad_by_freq, const cnnlTensorDescriptor_t indices_desc, const void* indices, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 774e297c06dd0..71648c5c5fbca 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -41,6 +41,20 @@ const std::map MLUReduceOpMap = { {"reduce_prod", CNNL_REDUCE_MUL}, }; +const std::map MLUInterpModeMap = { + {"bilinear", CNNL_INTERP_BILINEAR}, + {"nearest", CNNL_INTERP_NEAREST}, + {"linear", CNNL_INTERP_LINEAR}, + {"trilinear", CNNL_INTERP_TRILINEAR}, + {"bicubic", CNNL_INTERP_BICUBIC}}; + +const std::map MLUInterpBackwardModeMap = + {{"bilinear", CNNL_INTERP_BACKWARD_BILINEAR}, + {"nearest", CNNL_INTERP_BACKWARD_NEAREST}, + {"linear", CNNL_INTERP_BACKWARD_LINEAR}, + {"trilinear", CNNL_INTERP_BACKWARD_TRILINEAR}, + {"bicubic", CNNL_INTERP_BACKWARD_BICUBIC}}; + inline cnnlReduceOp_t GetMLUCnnlReduceOp(const std::string reduce_name) { auto iter = MLUReduceOpMap.find(reduce_name); if (iter != MLUReduceOpMap.end()) { @@ -50,6 +64,25 @@ inline cnnlReduceOp_t GetMLUCnnlReduceOp(const std::string reduce_name) { "Not support reduce op type of MLU Device: %s", reduce_name)); } +inline cnnlInterpMode_t GetMLUCnnlInterpMode(const std::string interp_mode) { + auto iter = MLUInterpModeMap.find(interp_mode); + if (iter != MLUInterpModeMap.end()) { + return iter->second; + } + PADDLE_THROW(platform::errors::InvalidArgument( + "Not support interp mode of MLU Device: %s", interp_mode)); +} + +inline cnnlInterpBackwardMode_t GetMLUCnnlInterpBackwardMode( + const std::string interp_mode) { + auto iter = MLUInterpBackwardModeMap.find(interp_mode); + if (iter != MLUInterpBackwardModeMap.end()) { + return iter->second; + } + PADDLE_THROW(platform::errors::InvalidArgument( + "Not support interp mode of MLU Device: %s", interp_mode)); +} + inline const void* GetBasePtr(const Tensor* t) { return t->data(); } inline void* GetBasePtr(Tensor* t) { return t->data(); } @@ -633,7 +666,7 @@ class MLUCnnl { const cnnlTensorDescriptor_t output_desc, void* output); static void Log(const ExecutionContext& ctx, - cnnlComputationPreference_t prefer, + cnnlComputationPreference_t prefer, cnnlLogBase_t log_base, const 
cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output); @@ -1235,6 +1268,12 @@ class MLUCnnl { const cnnlTensorDescriptor_t output_desc, void* output); + static void EmbeddingForward( + const ExecutionContext& ctx, const int padding_idx, + const cnnlTensorDescriptor_t weight_desc, const void* weight, + const cnnlTensorDescriptor_t indices_desc, const int* indices, + const cnnlTensorDescriptor_t output_desc, void* output); + static void EmbeddingBackward( const ExecutionContext& ctx, int padding_idx, bool scale_grad_by_freq, const cnnlTensorDescriptor_t indices_desc, const void* indices, diff --git a/paddle/fluid/operators/optimizers/adam_op_mlu.cc b/paddle/fluid/operators/optimizers/adam_op_mlu.cc index 9d335021234eb..36d0fb491a975 100644 --- a/paddle/fluid/operators/optimizers/adam_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_mlu.cc @@ -237,8 +237,8 @@ class AdamWMLUKernel : public AdamMLUKernel { ctx.device_context(), &skip_update_vec); skip_update = skip_update_vec[0]; } - VLOG(3) << "Skip update" << skip_update; bool with_decay = ctx.Attr("with_decay"); + VLOG(3) << "Skip update: " << skip_update << ", With decay: " << with_decay; if (!skip_update && with_decay) { if (ctx.HasInput("MasterParam")) { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 30ead84d1a987..9aa68881e44a0 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -35,27 +35,8 @@ bool CanMKLDNNSupportPool(const framework::ExecutionContext& ctx) { auto src_tz = phi::vectorize(ctx.Input("X")->dims()); std::vector ksize = ctx.Attr>("ksize"); // Fast but not exhustive check - if ((src_tz[src_tz.size() - 1] % ksize[1] == 0) && - (src_tz[src_tz.size() - 2] % ksize[0] == 0)) - return true; - - // Exhustive check - auto IH = static_cast(src_tz[src_tz.size() - 2]); - auto IW = static_cast(src_tz[src_tz.size() - 1]); - auto OH = static_cast(ksize[0]); - auto OW = static_cast(ksize[1]); - - auto SH = static_cast(floor((IH * 2.0) / OH) - floor(IH / OH)); - auto SW = static_cast(floor((IW * 2.0) / OW) - floor(IW / OW)); - auto KH = static_cast(ceil((IH * 2.0) / OH) - floor(IH / OH)); - auto KW = static_cast(ceil((IW * 2.0) / OW) - floor(IW / OW)); - - auto PH = (SH * (static_cast(OH) - 1) + KH - static_cast(IH)); - auto PW = (SW * (static_cast(OW) - 1) + KW - static_cast(IW)); - // If there is additional padding needed then - // this is situation that oneDNN cannot comply with - // paddlepaddle reference implementation - return (PH == 0) && (PW == 0); + return ((src_tz[src_tz.size() - 1] % ksize[1] == 0) && + (src_tz[src_tz.size() - 2] % ksize[0] == 0)); } framework::OpKernelType PoolOp::GetExpectedKernelType( diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index bfd33efe833d2..42e8379bca4af 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -257,7 +257,12 @@ class RunProgramOpKernel : public framework::OpKernel { auto input_var_names = ctx.InputNames("X"); auto output_var_names = ctx.OutputNames("Out"); - auto dout_var_names = ctx.OutputNames("DOut"); + std::vector dout_var_names; + if (!dout_vars.empty()) { + // DOut is a dispensable out, only get the names when it exists. + // Otherwise, it will throw a NotFound error. 
+ dout_var_names = ctx.OutputNames("DOut"); + } // current program may not hold parameters std::vector param_names; @@ -272,10 +277,23 @@ class RunProgramOpKernel : public framework::OpKernel { // NOTE(chenweihang): In order not to add new variable type, use vector // here. Originally, here can use scope directly. auto *out_scope_vec = ctx.Output("OutScope"); - PADDLE_ENFORCE_EQ( - out_scope_vec->size(), 1, - platform::errors::InvalidArgument( - "The OutScope of RunProgramGradOp should only hold one scope.")); + std::unique_ptr inner_scope{nullptr}; + if (out_scope_vec->size() == 0) { + // For cuda graph under static mode usage. + // For static mode, we cannot set value of a tensor before any run, + // the OutScope variable passed to the op actually contains nothing. + // Just create a tmp scope to run the program. + PADDLE_ENFORCE_EQ( + use_cuda_graph, true, + platform::errors::InvalidArgument( + "If not provide OutScope then must run under cuda graph mode.")); + inner_scope = std::make_unique(); + } else { + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), 1, + platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + } // Step 2. prepare executor and init persistable variables @@ -284,9 +302,10 @@ class RunProgramOpKernel : public framework::OpKernel { // Learning. Tensor data in multi-step training should be saved into single // scope separately. Otherwise, the gradients can be miscalculated because // always using the Tensor data of the last step in forward. - framework::Scope *global_inner_scope = out_scope_vec->front(); + framework::Scope *global_inner_scope = + out_scope_vec->size() == 0 ? inner_scope.get() : out_scope_vec->front(); VLOG(2) << "The number of sub scopes before forward: " - << out_scope_vec->front()->kids().size(); + << global_inner_scope->kids().size(); framework::Scope &scope = global_inner_scope->NewScope(); // share input_vars & parameters into scope @@ -341,13 +360,19 @@ class RunProgramOpKernel : public framework::OpKernel { &scope); // Debug info: scope info when run end - VLOG(3) << framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + framework::Scope *target_scope{nullptr}; + if (out_scope_vec->size() == 0) { + target_scope = inner_scope.get(); + } else { + target_scope = out_scope_vec->front(); + } + VLOG(3) << framework::GenScopeTreeDebugInfo(target_scope); // Step 5. Drop all children scopes while testing. 
if (is_test) { - out_scope_vec->front()->DropKids(); + target_scope->DropKids(); } VLOG(2) << "The number of sub scopes after forward: " - << out_scope_vec->front()->kids().size(); + << target_scope->kids().size(); #ifdef PADDLE_WITH_MKLDNN if (FLAGS_use_mkldnn) platform::DontClearMKLDNNCache(ctx.GetPlace()); #endif diff --git a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h index 409acdfdff7ba..06abd0628ea39 100644 --- a/paddle/fluid/operators/shuffle_channel_op.h +++ b/paddle/fluid/operators/shuffle_channel_op.h @@ -27,7 +27,7 @@ class ShuffleChannelOpKernel : public framework::OpKernel { auto* output = ctx.Output("Out"); int group = ctx.Attr("group"); - auto input_dims = input->dims(); + const auto& input_dims = input->dims(); auto num = input_dims[0]; auto channel = input_dims[1]; auto height = input_dims[2]; diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index a815e12d061cf..edc72f4125eb6 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -104,7 +104,11 @@ class SliceOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "The size of ends must be equal to the size of axes.")); } - + for (auto &axis : axes) { + if (axis < 0) { + axis = std::max(0, axis + in_dims.size()); + } + } phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, nullptr, &infer_flags); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 7304467833a90..d6287f4c766ef 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -57,15 +57,16 @@ class SoftmaxOp : public framework::OperatorWithKernel { } #endif -#ifndef PADDLE_WITH_ASCEND_CL if (input_data_type == framework::proto::VarType::FP16) { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()) || - platform::is_xpu_place(ctx.GetPlace()), - true, - platform::errors::InvalidArgument( - "float16 can only be used on GPU/XPU place")); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()) || + platform::is_npu_place(ctx.GetPlace()) || + platform::is_xpu_place(ctx.GetPlace()) || + platform::is_mlu_place(ctx.GetPlace()), + true, + platform::errors::InvalidArgument( + "float16 can only be used on GPU/NPU/XPU/MLU place")); } -#endif return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, library_); @@ -174,9 +175,10 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { if (input_data_type == framework::proto::VarType::FP16) { if (!(platform::is_gpu_place(ctx.GetPlace()) || platform::is_npu_place(ctx.GetPlace()) || - platform::is_xpu_place(ctx.GetPlace()))) + platform::is_xpu_place(ctx.GetPlace()) || + platform::is_mlu_place(ctx.GetPlace()))) PADDLE_THROW(platform::errors::InvalidArgument( - "float16 can only be used on GPU/NPU/XPU place")); + "float16 can only be used on GPU/NPU/XPU/MLU place")); } return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc index 4ef2a9709a59d..9d3d342431b78 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -28,6 +28,16 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place, auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place); dev_ctx->cudnn_workspace_handle().ResetWorkspace(); + // After PR(#43206), cudnn related initializations will 
change to lazy mode. + // It will only be initialized when op calls them. But cuda graph not support + // capture such kind of init, need to init all these handle before cuda graph. + dev_ctx->cublas_handle(); +#if CUDA_VERSION >= 11060 + dev_ctx->cublaslt_handle(); +#endif + dev_ctx->cudnn_handle(); + dev_ctx->cusolver_dn_handle(); + auto stream = dev_ctx->stream(); CUDAGraph::BeginCapture(place, stream, mode); diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc index 82363fcff6349..65f5e81238bc8 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -92,6 +92,26 @@ std::unique_ptr DeserializationReader::Parse() { device_node); // insert into runtime_node } } + // handle mem node + for (int mem_node_index = 0; + mem_node_index < host_node_proto.mem_nodes_size(); + mem_node_index++) { + const MemTraceEventNodeProto& mem_node_proto = + host_node_proto.mem_nodes(mem_node_index); + MemTraceEventNode* mem_node = RestoreMemTraceEventNode(mem_node_proto); + host_node->AddMemNode(mem_node); + } + // handle op supplement node + for (int op_supplement_node_index = 0; + op_supplement_node_index < + host_node_proto.op_supplement_nodes_size(); + op_supplement_node_index++) { + const OperatorSupplementEventNodeProto& op_supplement_node_proto = + host_node_proto.op_supplement_nodes(op_supplement_node_index); + OperatorSupplementEventNode* op_supplement_node = + RestoreOperatorSupplementEventNode(op_supplement_node_proto); + host_node->SetOperatorSupplementNode(op_supplement_node); + } } // restore parent-child relationship for (auto it = child_parent_map.begin(); it != child_parent_map.end(); @@ -176,6 +196,62 @@ HostTraceEventNode* DeserializationReader::RestoreHostTraceEventNode( return new HostTraceEventNode(host_event); } +MemTraceEventNode* DeserializationReader::RestoreMemTraceEventNode( + const MemTraceEventNodeProto& mem_node_proto) { + const MemTraceEventProto& mem_event_proto = mem_node_proto.mem_event(); + MemTraceEvent mem_event; + mem_event.timestamp_ns = mem_event_proto.timestamp_ns(); + mem_event.addr = mem_event_proto.addr(); + mem_event.type = static_cast(mem_event_proto.type()); + mem_event.process_id = mem_event_proto.process_id(); + mem_event.thread_id = mem_event_proto.thread_id(); + mem_event.increase_bytes = mem_event_proto.increase_bytes(); + mem_event.place = mem_event_proto.place(); + mem_event.current_allocated = mem_event_proto.current_allocated(); + mem_event.current_reserved = mem_event_proto.current_reserved(); + return new MemTraceEventNode(mem_event); +} + +OperatorSupplementEventNode* +DeserializationReader::RestoreOperatorSupplementEventNode( + const OperatorSupplementEventNodeProto& op_supplement_node_proto) { + const OperatorSupplementEventProto& op_supplement_event_proto = + op_supplement_node_proto.op_supplement_event(); + OperatorSupplementEvent op_supplement_event; + op_supplement_event.timestamp_ns = op_supplement_event_proto.timestamp_ns(); + op_supplement_event.op_type = op_supplement_event_proto.op_type(); + op_supplement_event.callstack = op_supplement_event_proto.callstack(); + op_supplement_event.process_id = op_supplement_event_proto.process_id(); + op_supplement_event.thread_id = op_supplement_event_proto.thread_id(); + std::map>> input_shapes; + std::map> dtypes; + auto input_shape_proto = op_supplement_event_proto.input_shapes(); + for (int i = 0; i < 
input_shape_proto.key_size(); i++) { + auto input_shape_vec = input_shapes[input_shape_proto.key(i)]; + auto shape_vectors_proto = input_shape_proto.shape_vecs(i); + for (int j = 0; j < shape_vectors_proto.shapes_size(); j++) { + auto shape_vector_proto = shape_vectors_proto.shapes(j); + std::vector shape; + for (int k = 0; k < shape_vector_proto.size_size(); k++) { + shape.push_back(shape_vector_proto.size(k)); + } + input_shape_vec.push_back(shape); + } + } + op_supplement_event.input_shapes = input_shapes; + auto dtype_proto = op_supplement_event_proto.dtypes(); + for (int i = 0; i < dtype_proto.key_size(); i++) { + auto dtype_vec = dtypes[dtype_proto.key(i)]; + auto dtype_vec_proto = dtype_proto.dtype_vecs(i); + for (int j = 0; j < dtype_vec_proto.dtype_size(); j++) { + auto dtype_string = dtype_vec_proto.dtype(j); + dtype_vec.push_back(dtype_string); + } + } + op_supplement_event.dtypes = dtypes; + return new OperatorSupplementEventNode(op_supplement_event); +} + KernelEventInfo DeserializationReader::HandleKernelEventInfoProto( const DeviceTraceEventProto& device_event_proto) { const KernelEventInfoProto& kernel_info_proto = diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h index e6feb4f9489e8..7df93b7703c32 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.h +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h @@ -36,6 +36,9 @@ class DeserializationReader { KernelEventInfo HandleKernelEventInfoProto(const DeviceTraceEventProto&); MemcpyEventInfo HandleMemcpyEventInfoProto(const DeviceTraceEventProto&); MemsetEventInfo HandleMemsetEventInfoProto(const DeviceTraceEventProto&); + MemTraceEventNode* RestoreMemTraceEventNode(const MemTraceEventNodeProto&); + OperatorSupplementEventNode* RestoreOperatorSupplementEventNode( + const OperatorSupplementEventNodeProto&); std::string filename_; std::ifstream input_file_stream_; NodeTreesProto* node_trees_proto_; diff --git a/paddle/fluid/platform/profiler/dump/nodetree.proto b/paddle/fluid/platform/profiler/dump/nodetree.proto index 7016745059d40..0f0c9c92c9c93 100644 --- a/paddle/fluid/platform/profiler/dump/nodetree.proto +++ b/paddle/fluid/platform/profiler/dump/nodetree.proto @@ -46,6 +46,15 @@ enum TracerEventTypeProto { PythonOp = 13; // Used to mark python level userdefined PythonUserDefined = 14; + // Used to mark mlu runtime record returned by cnpapi + MluRuntime = 15; +}; + +enum TracerMemEventTypeProto { + // Used to mark memory allocation + Allocate = 0; + // Used to mark memory free + Free = 1; }; message KernelEventInfoProto { @@ -121,6 +130,58 @@ message HostTraceEventProto { required uint64 thread_id = 6; } +message MemTraceEventProto { + // timestamp of the record + required uint64 timestamp_ns = 1; + // memory manipulation type + required TracerMemEventTypeProto type = 2; + // memory addr of allocation or free + required uint64 addr = 3; + // process id of the record + required uint64 process_id = 4; + // thread id of the record + required uint64 thread_id = 5; + // increase bytes after this manipulation, allocation for sign +, free for + // sign - + required int64 increase_bytes = 6; + // place + required string place = 7; + // current total allocated memory + required uint64 current_allocated = 8; + // current total reserved memory + required uint64 current_reserved = 9; +} + +message OperatorSupplementEventProto { + // timestamp of the record + required uint64 timestamp_ns = 1; + // op type name + 
required string op_type = 2; + // process id of the record + required uint64 process_id = 3; + // thread id of the record + required uint64 thread_id = 4; + // input shapes + message input_shape_proto { + repeated string key = 1; + message shape_vector { + message shape { repeated uint64 size = 1; } + repeated shape shapes = 1; + } + repeated shape_vector shape_vecs = 2; + } + required input_shape_proto input_shapes = 5; + // dtypes + message dtype_proto { + repeated string key = 1; + message dtype_vector { repeated string dtype = 1; } + repeated dtype_vector dtype_vecs = 2; + } + required dtype_proto dtypes = 6; + // call stack + required string callstack = 7; +} + message CudaRuntimeTraceEventProto { // record name required string name = 1; @@ -166,6 +227,12 @@ message DeviceTraceEventProto { } } +message OperatorSupplementEventNodeProto { + required OperatorSupplementEventProto op_supplement_event = 1; +} + +message MemTraceEventNodeProto { required MemTraceEventProto mem_event = 1; } + message DeviceTraceEventNodeProto { required DeviceTraceEventProto device_event = 1; } @@ -180,6 +247,9 @@ message HostTraceEventNodeProto { required int64 parentid = 2; required HostTraceEventProto host_trace_event = 3; repeated CudaRuntimeTraceEventNodeProto runtime_nodes = 4; + // below is added in version 1.0.1 + repeated MemTraceEventNodeProto mem_nodes = 5; + repeated OperatorSupplementEventNodeProto op_supplement_nodes = 6; } message ThreadNodeTreeProto { diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc index b8afe2af0e776..eaf1353168ea4 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -20,7 +20,7 @@ namespace paddle { namespace platform { static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.pb"; -static const char* version = "1.0.0"; +static const char* version = "1.0.1"; static uint32_t span_indx = 0; static std::string DefaultFileName() { @@ -106,10 +106,33 @@ void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) { (*devicenode)->LogMe(this); // fill detail information } } + for (auto memnode = (*hostnode)->GetMemTraceEventNodes().begin(); + memnode != (*hostnode)->GetMemTraceEventNodes().end(); ++memnode) { + MemTraceEventNodeProto* mem_node_proto = + current_host_trace_event_node_proto_->add_mem_nodes(); + current_mem_trace_event_node_proto_ = mem_node_proto; + (*memnode)->LogMe(this); + } } } } +void SerializationLogger::LogMemTraceEventNode( + const MemTraceEventNode& mem_node) { + MemTraceEventProto* mem_trace_event = new MemTraceEventProto(); + mem_trace_event->set_timestamp_ns(mem_node.TimeStampNs()); + mem_trace_event->set_type( + static_cast(mem_node.Type())); + mem_trace_event->set_addr(mem_node.Addr()); + mem_trace_event->set_process_id(mem_node.ProcessId()); + mem_trace_event->set_thread_id(mem_node.ThreadId()); + mem_trace_event->set_increase_bytes(mem_node.IncreaseBytes()); + mem_trace_event->set_place(mem_node.Place()); + mem_trace_event->set_current_allocated(mem_node.CurrentAllocated()); + mem_trace_event->set_current_reserved(mem_node.CurrentReserved()); + current_mem_trace_event_node_proto_->set_allocated_mem_event(mem_trace_event); +} + void SerializationLogger::LogHostTraceEventNode( const HostTraceEventNode& host_node) { HostTraceEventProto* host_trace_event = new HostTraceEventProto(); @@ -122,6 +145,59 @@ void SerializationLogger::LogHostTraceEventNode( 
host_trace_event->set_thread_id(host_node.ThreadId()); current_host_trace_event_node_proto_->set_allocated_host_trace_event( host_trace_event); + OperatorSupplementEventNode* op_supplement_event_node = + host_node.GetOperatorSupplementEventNode(); + if (op_supplement_event_node != nullptr) { + current_op_supplement_event_node_proto_ = + current_host_trace_event_node_proto_->add_op_supplement_nodes(); + OperatorSupplementEventProto* op_supplement_event_proto = + new OperatorSupplementEventProto(); + op_supplement_event_proto->set_op_type(op_supplement_event_node->Name()); + op_supplement_event_proto->set_timestamp_ns( + op_supplement_event_node->TimeStampNs()); + op_supplement_event_proto->set_process_id( + op_supplement_event_node->ProcessId()); + op_supplement_event_proto->set_thread_id( + op_supplement_event_node->ThreadId()); + op_supplement_event_proto->set_callstack( + op_supplement_event_node->CallStack()); + + OperatorSupplementEventProto::input_shape_proto* input_shape_proto = + op_supplement_event_proto->mutable_input_shapes(); + for (auto it = op_supplement_event_node->InputShapes().begin(); + it != op_supplement_event_node->InputShapes().end(); it++) { + input_shape_proto->add_key(it->first); + OperatorSupplementEventProto::input_shape_proto::shape_vector* + shape_vectors_proto = input_shape_proto->add_shape_vecs(); + auto shape_vectors = it->second; + for (auto shape_vecs_it = shape_vectors.begin(); + shape_vecs_it != shape_vectors.end(); shape_vecs_it++) { + auto shape_vector = *shape_vecs_it; + OperatorSupplementEventProto::input_shape_proto::shape_vector::shape* + shape_proto = shape_vectors_proto->add_shapes(); + for (auto shape_it = shape_vector.begin(); + shape_it != shape_vector.end(); shape_it++) { + shape_proto->add_size(*shape_it); + } + } + } + + OperatorSupplementEventProto::dtype_proto* dtype_proto = + op_supplement_event_proto->mutable_dtypes(); + for (auto it = op_supplement_event_node->Dtypes().begin(); + it != op_supplement_event_node->Dtypes().end(); it++) { + dtype_proto->add_key(it->first); + OperatorSupplementEventProto::dtype_proto::dtype_vector* + dtype_vector_proto = dtype_proto->add_dtype_vecs(); + auto dtype_vector = it->second; + for (auto dtype_it = dtype_vector.begin(); dtype_it != dtype_vector.end(); + dtype_it++) { + dtype_vector_proto->add_dtype(*dtype_it); + } + } + current_op_supplement_event_node_proto_->set_allocated_op_supplement_event( + op_supplement_event_proto); + } } void SerializationLogger::LogRuntimeTraceEventNode( diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h index 378834cff590d..31910cb68c5d7 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.h +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h @@ -34,6 +34,7 @@ class SerializationLogger : public BaseLogger { void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override; void LogNodeTrees(const NodeTrees&) override; void LogMetaInfo(const std::unordered_map); + void LogMemTraceEventNode(const MemTraceEventNode&) override; private: void OpenFile(); @@ -48,6 +49,8 @@ class SerializationLogger : public BaseLogger { HostTraceEventNodeProto* current_host_trace_event_node_proto_; CudaRuntimeTraceEventNodeProto* current_runtime_trace_event_node_proto_; DeviceTraceEventNodeProto* current_device_trace_event_node_proto_; + MemTraceEventNodeProto* current_mem_trace_event_node_proto_; + OperatorSupplementEventNodeProto* current_op_supplement_event_node_proto_; }; } 
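The input_shapes structure serialized above is a map from input name to a list of shapes, and the logger flattens it into the parallel key / shape_vecs fields of input_shape_proto (the i-th shape_vector holds every shape recorded for the i-th key). A stand-alone sketch of that flattening, using illustrative types that are not part of the patch:

#include <cstdint>
#include <map>
#include <string>
#include <vector>

// Mirror of the proto layout: one entry in `keys` per input name, and the
// matching entry in `shape_vecs` carries all shapes seen for that input.
struct FlatInputShapes {
  std::vector<std::string> keys;
  std::vector<std::vector<std::vector<uint64_t>>> shape_vecs;
};

FlatInputShapes Flatten(
    const std::map<std::string, std::vector<std::vector<uint64_t>>>& shapes) {
  FlatInputShapes flat;
  for (const auto& kv : shapes) {
    flat.keys.push_back(kv.first);         // e.g. "X"
    flat.shape_vecs.push_back(kv.second);  // e.g. {{1, 2, 3}, {4, 5, 6, 7}}
  }
  return flat;
}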
// namespace platform diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc index 002071de0d1ef..dc6a6bf32d6e3 100644 --- a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc @@ -34,6 +34,7 @@ using paddle::platform::ProfilerResult; using paddle::platform::RuntimeTraceEvent; using paddle::platform::SerializationLogger; using paddle::platform::TracerEventType; +using paddle::platform::TracerMemEventType; TEST(SerializationLoggerTest, dump_case0) { std::list host_events; @@ -50,6 +51,19 @@ TEST(SerializationLoggerTest, dump_case0) { std::string("op2"), TracerEventType::Operator, 21000, 30000, 10, 10)); host_events.push_back(HostTraceEvent( std::string("op3"), TracerEventType::Operator, 31000, 40000, 10, 11)); + mem_events.push_back(MemTraceEvent(11500, 0x1000, + TracerMemEventType::Allocate, 10, 10, 50, + "GPU:0", 50, 50)); + mem_events.push_back(MemTraceEvent(11900, 0x1000, TracerMemEventType::Free, + 10, 10, -50, "GPU:0", 0, 50)); + std::map>> input_shapes; + std::map> dtypes; + input_shapes[std::string("X")].push_back(std::vector{1, 2, 3}); + input_shapes[std::string("X")].push_back(std::vector{4, 5, 6, 7}); + dtypes[std::string("X")].push_back(std::string("int8")); + dtypes[std::string("X")].push_back(std::string("float32")); + op_supplement_events.push_back(OperatorSupplementEvent( + 11600, "op1", input_shapes, dtypes, "op1()", 10, 10)); runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch1"), 15000, 17000, 10, 10, 1, 0)); runtime_events.push_back(RuntimeTraceEvent(std::string("cudalaunch2"), 25000, @@ -91,6 +105,8 @@ TEST(SerializationLoggerTest, dump_case0) { if ((*it)->Name() == "op1") { EXPECT_EQ((*it)->GetChildren().size(), 0u); EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + EXPECT_EQ((*it)->GetMemTraceEventNodes().size(), 2u); + EXPECT_NE((*it)->GetOperatorSupplementEventNode(), nullptr); } } for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { @@ -100,6 +116,7 @@ TEST(SerializationLoggerTest, dump_case0) { } } tree.LogMe(&logger); + logger.LogMetaInfo(std::unordered_map()); } TEST(SerializationLoggerTest, dump_case1) { @@ -154,6 +171,7 @@ TEST(SerializationLoggerTest, dump_case1) { } } tree.LogMe(&logger); + logger.LogMetaInfo(std::unordered_map()); } TEST(DeserializationReaderTest, restore_case0) { @@ -173,6 +191,8 @@ TEST(DeserializationReaderTest, restore_case0) { if ((*it)->Name() == "op1") { EXPECT_EQ((*it)->GetChildren().size(), 0u); EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + EXPECT_EQ((*it)->GetMemTraceEventNodes().size(), 2u); + EXPECT_NE((*it)->GetOperatorSupplementEventNode(), nullptr); } } for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc index abde62c6b1444..4e40e87bbbf20 100644 --- a/paddle/fluid/platform/profiler/event_python.cc +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -32,6 +32,9 @@ HostPythonNode::~HostPythonNode() { for (auto it = device_node_ptrs.begin(); it != device_node_ptrs.end(); ++it) { delete *it; } + for (auto it = mem_node_ptrs.begin(); it != mem_node_ptrs.end(); ++it) { + delete *it; + } } HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { @@ -77,6 +80,29 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { 
runtime_python_node->device_node_ptrs.push_back(device_python_node); } } + // copy MemTraceEventNode + for (auto memnode = root->GetMemTraceEventNodes().begin(); + memnode != root->GetMemTraceEventNodes().end(); memnode++) { + MemPythonNode* mem_python_node = new MemPythonNode(); + mem_python_node->timestamp_ns = (*memnode)->TimeStampNs(); + mem_python_node->addr = (*memnode)->Addr(); + mem_python_node->type = (*memnode)->Type(); + mem_python_node->process_id = (*memnode)->ProcessId(); + mem_python_node->thread_id = (*memnode)->ThreadId(); + mem_python_node->increase_bytes = (*memnode)->IncreaseBytes(); + mem_python_node->place = (*memnode)->Place(); + mem_python_node->current_allocated = (*memnode)->CurrentAllocated(); + mem_python_node->current_reserved = (*memnode)->CurrentReserved(); + host_python_node->mem_node_ptrs.push_back(mem_python_node); + } + // copy OperatorSupplementEventNode's information if exists + OperatorSupplementEventNode* op_supplement_node = + root->GetOperatorSupplementEventNode(); + if (op_supplement_node != nullptr) { + host_python_node->input_shapes = op_supplement_node->InputShapes(); + host_python_node->dtypes = op_supplement_node->Dtypes(); + host_python_node->callstack = op_supplement_node->CallStack(); + } return host_python_node; } diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h index 172116dbb0edd..4d1f5ad4f788e 100644 --- a/paddle/fluid/platform/profiler/event_python.h +++ b/paddle/fluid/platform/profiler/event_python.h @@ -43,6 +43,31 @@ struct DevicePythonNode { uint64_t stream_id; }; +struct MemPythonNode { + MemPythonNode() = default; + ~MemPythonNode() {} + + // timestamp of the record + uint64_t timestamp_ns; + // memory addr of allocation or free + uint64_t addr; + // memory manipulation type + TracerMemEventType type; + // process id of the record + uint64_t process_id; + // thread id of the record + uint64_t thread_id; + // increase bytes after this manipulation, allocation for sign +, free for + // sign - + int64_t increase_bytes; + // place + std::string place; + // current total allocated memory + uint64_t current_allocated; + // current total reserved memory + uint64_t current_reserved; +}; + struct HostPythonNode { HostPythonNode() = default; ~HostPythonNode(); @@ -58,12 +83,19 @@ struct HostPythonNode { uint64_t process_id; // thread id of the record uint64_t thread_id; + // input shapes + std::map>> input_shapes; + std::map> dtypes; + // call stack + std::string callstack; // children node std::vector children_node_ptrs; // runtime node std::vector runtime_node_ptrs; // device node std::vector device_node_ptrs; + // mem node + std::vector mem_node_ptrs; }; class ProfilerResult { diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index c75ac0b52c52c..311ad7b48ed7b 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -384,7 +384,7 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, require_any_grad || egr::EagerUtils::ComputeRequireGrad( trace_backward, &(ins_auto_grad_metas[i])); } - if (require_any_grad) { + if (require_any_grad && (vec_map.size() > 1)) { VLOG(6) << " Construct Grad for Custom Op: " << op_type; ConstructFwdAndBwdMap(vec_map, op_type); for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc index 2f4bbd5df352c..a58c6cc5b86ef 100644 --- a/paddle/infrt/api/infrt_api.cc +++ 
b/paddle/infrt/api/infrt_api.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// clang-format off #include "paddle/infrt/api/infrt_api.h" #include @@ -31,6 +30,7 @@ #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" +#include "paddle/infrt/dialect/infrt/pass/infrt_weights_unfold_pass.h" #include "paddle/infrt/dialect/mlir_loader.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" #include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" @@ -51,18 +51,14 @@ #include "paddle/infrt/kernel/test_kernels.h" #include "paddle/infrt/tensor/tensor_map.h" -#include "paddle/infrt/dialect/infrt/pass/infrt_weights_unfold_pass.h" - #if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT) -#include "paddle/infrt/kernel/tensorrt/registry.h" - #include "paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h" +#include "paddle/infrt/kernel/tensorrt/registry.h" #endif -// clang-format on using namespace infrt::host_context; // NOLINT using namespace infrt::tensor; // NOLINT diff --git a/paddle/infrt/dialect/dense_tensor.h b/paddle/infrt/dialect/dense_tensor.h index 8dec818a80a27..eebcbbbcbc698 100644 --- a/paddle/infrt/dialect/dense_tensor.h +++ b/paddle/infrt/dialect/dense_tensor.h @@ -14,17 +14,14 @@ #pragma once -// clang-format off #include #include #include #include -#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" - #include "paddle/infrt/dialect/dense_tensor_dialect.hpp.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #define GET_OP_CLASSES #include "paddle/infrt/dialect/dense_tensor.hpp.inc" -// clang-format on diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h index f7358db5bf356..9e09cdde502b7 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h @@ -14,7 +14,6 @@ #pragma once -// clang-format off #include #include #include @@ -30,12 +29,10 @@ #include #include +#include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorDialect.h.inc" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorTypes.h.inc" - -#include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" // NOLINT #define GET_OP_CLASSES #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h.inc" -// clang-format on diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc index 24af0ea437875..530d0981f1e8e 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// clang-format off +#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" + #include #include + #include -#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" #include "paddle/phi/kernels/declarations.h" -// clang-format on namespace infrt { diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc index 899e71f1c990f..d1ce1c1b562df 100644 --- a/paddle/infrt/dialect/tensorrt/trt_exec.cc +++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// clang-format off #include #include @@ -27,18 +26,15 @@ #include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" #include "paddle/infrt/dialect/tensorrt/trt_type_convert_pass.h" - #include "paddle/infrt/host_context/core_runtime.h" #include "paddle/infrt/host_context/kernel_registry.h" #include "paddle/infrt/host_context/mlir_to_runtime_translate.h" - #include "paddle/infrt/kernel/basic_kernels.h" #include "paddle/infrt/kernel/control_flow_kernels.h" #include "paddle/infrt/kernel/tensor_kernels.h" #include "paddle/infrt/kernel/tensor_shape_kernels.h" -#include "paddle/infrt/kernel/test_kernels.h" - #include "paddle/infrt/kernel/tensorrt/registry.h" +#include "paddle/infrt/kernel/test_kernels.h" #ifdef INFRT_WITH_PHI #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" @@ -48,7 +44,6 @@ #endif #include -// clang-format on int main(int argc, char** argv) { static llvm::cl::opt input_file( diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.cc b/paddle/infrt/dialect/tensorrt/trt_ops.cc index 161fbbbcc65a5..8e39fea4cd8ec 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.cc +++ b/paddle/infrt/dialect/tensorrt/trt_ops.cc @@ -12,21 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// clang-format off #include "paddle/infrt/dialect/tensorrt/trt_ops.h" + #include #include #include #include #include #include -#include "paddle/infrt/common/global.h" -#include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h" +#include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" -// clang-format on +#include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h" namespace infrt { namespace trt { diff --git a/paddle/infrt/external_kernels/CMakeLists.txt b/paddle/infrt/external_kernels/CMakeLists.txt index 96cfe2b73d8cd..9b318872e7551 100644 --- a/paddle/infrt/external_kernels/CMakeLists.txt +++ b/paddle/infrt/external_kernels/CMakeLists.txt @@ -10,6 +10,6 @@ message(STATUS "external_kernels_lib: ${external_kernels_lib}") add_test( NAME run_and_check_external_kernels COMMAND - sh -c - "${CMAKE_BINARY_DIR}/infrt/host_context/infrtexec -i ${basic_mlir} --shared_libs=${external_kernels_lib} | ${LLVM_PATH}/bin/FileCheck ${basic_mlir}" -) + sh -c "${CMAKE_BINARY_DIR}/infrt/host_context/infrtexec -i ${basic_mlir} \ + --shared_libs=${external_kernels_lib} | \ + ${LLVM_PATH}/bin/FileCheck ${basic_mlir}") diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.cc b/paddle/infrt/kernel/tensorrt/trt_kernels.cc index 0ea68f2e835f7..931fe21b2c710 100644 --- a/paddle/infrt/kernel/tensorrt/trt_kernels.cc +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -// clang-format off #include "paddle/infrt/kernel/tensorrt/trt_kernels.h" + #include #include + #include "NvInfer.h" #include "NvInferRuntime.h" #include "NvInferRuntimeCommon.h" @@ -27,17 +28,14 @@ #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Operation.h" #include "mlir/IR/Value.h" - -#include "paddle/infrt/kernel/tensorrt/trt_helper.h" -#include "paddle/infrt/kernel/tensorrt/trt_layers.h" - #include "paddle/infrt/backends/tensorrt/trt_engine.h" #include "paddle/infrt/backends/tensorrt/trt_options.h" #include "paddle/infrt/dialect/tensorrt/trt_ops.h" #include "paddle/infrt/host_context/symbol_table.h" +#include "paddle/infrt/kernel/tensorrt/trt_helper.h" +#include "paddle/infrt/kernel/tensorrt/trt_layers.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" -// clang-format on namespace infrt { namespace kernel { diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index bc41a24c44562..072ab6fd68a1a 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -402,7 +402,8 @@ void EighInferMeta(const MetaTensor& x, void EinsumInferMeta(const std::vector& inputs, const std::string& equation, MetaTensor* out, - std::vector inner_cache) { + std::vector inner_cache, + std::vector xshape) { // collect the following informations to prepare einsum. 
LabelMap labelshape(0); LabelMap labeltype(LabelType::Reduction); @@ -439,6 +440,12 @@ void EinsumInferMeta(const std::vector& inputs, VLOG(3) << "Label Shape is : " << label_to_string(all_labels, labelshape); out->set_dims(make_ddim(output_dims)); out->set_dtype(inputs[0]->dtype()); + for (size_t i = 0; i < xshape.size(); ++i) { + if (xshape[i] != nullptr) { + xshape[i]->set_dims(inputs[i]->dims()); + xshape[i]->set_dtype(inputs[i]->dtype()); + } + } } void ExpandInferMeta(const MetaTensor& x, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index a0cad3e628e3f..f64d406e019ce 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -83,7 +83,8 @@ void EighInferMeta(const MetaTensor& x, void EinsumInferMeta(const std::vector& inputs, const std::string& equation, MetaTensor* out, - std::vector inner_cache); + std::vector inner_cache, + std::vector xshape); void ExpandInferMeta(const MetaTensor& x, const IntArray& shape, diff --git a/paddle/phi/kernels/einsum_kernel.h b/paddle/phi/kernels/einsum_kernel.h index 87df2b1c64a4a..569cf7a55afd4 100644 --- a/paddle/phi/kernels/einsum_kernel.h +++ b/paddle/phi/kernels/einsum_kernel.h @@ -29,6 +29,7 @@ void EinsumKernelRaw(const Context& dev_ctx, const std::vector& inputs, const std::string& equation, DenseTensor* out, - std::vector cache); + std::vector inner_cache, + std::vector xshape); } // namespace phi diff --git a/paddle/phi/kernels/impl/einsum_grad_impl.h b/paddle/phi/kernels/impl/einsum_grad_impl.h index a72db326807f8..a04185a0c53ed 100644 --- a/paddle/phi/kernels/impl/einsum_grad_impl.h +++ b/paddle/phi/kernels/impl/einsum_grad_impl.h @@ -177,7 +177,6 @@ void EinsumGradKernel(const Context& dev_ctx, cache[0].ShareBufferWith(*(inner_cache[0])); cache[1].ShareBufferWith(*(inner_cache[1])); } - EinsumKernelImpl(dev_ctx, all_labels, operands_for_A, diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h index f3521c81ce46b..43b2760b404f9 100644 --- a/paddle/phi/kernels/impl/einsum_impl.h +++ b/paddle/phi/kernels/impl/einsum_impl.h @@ -459,7 +459,7 @@ DenseTensor PerformContraction( } // reduction DenseTensor trans_t; - if (FLAGS_einsum_opt && use_cache && cache[operand_idx] != nullptr && + if (use_cache && cache[operand_idx] != nullptr && cache[operand_idx]->IsInitialized()) { trans_t.ShareBufferWith(*(cache[operand_idx])); VLOG(5) << "Cache Used!"; @@ -468,7 +468,7 @@ DenseTensor PerformContraction( dev_ctx, t, perm, all_labels, ellipsis, label2type); trans_t = PerformTranspose( dev_ctx, reduct_t, perm, reordered_all_labels, ellipsis, label2type); - if (FLAGS_einsum_opt && cache[operand_idx] != nullptr) + if (cache[operand_idx] != nullptr) cache[operand_idx]->ShareBufferWith(trans_t); } auto mul_dims = GetShapeByType(all_labels, @@ -599,6 +599,11 @@ void EinsumKernelImpl(const Context& dev_ctx, out); // Reshape Procedure } else if (inputs.size() == 1) { + if (cache[0] != nullptr) { // For compatibility, may be cache is nullptr if + // loading the program from v2.3.0 + (*cache[0]) = *(inputs[0]); // ShareBuffer for backward, because backward + // we can only see cached tensor. + } auto reduce_A = PerformReduction(dev_ctx, *inputs[0], label2perms[0], @@ -627,7 +632,8 @@ void EinsumKernelRaw(const Context& dev_ctx, const std::vector& inputs, const std::string& equation, DenseTensor* out, - std::vector cache) { + std::vector cache, + std::vector xshape) { std::vector tmp; // for the sake of compatibility, we may load and run v2.3 EinsumOp. 
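Note on the einsum changes above: the new `xshape` outputs added to `EinsumInferMeta` (and threaded through `EinsumKernelRaw` and the op signature) are filled by simply mirroring each input's dims and dtype, presumably so shape information is still available to the backward pass when a saved program is reloaded. A minimal Python sketch of that rule follows; `Meta` is a toy stand-in for PHI's MetaTensor, not the real API.

```python
class Meta:
    """Toy stand-in for phi::MetaTensor: only dims and dtype matter here."""
    def __init__(self, dims=None, dtype=None):
        self.dims, self.dtype = dims, dtype

def infer_einsum_xshape(inputs, xshapes):
    # Mirrors the new loop in EinsumInferMeta: every non-null XShape output
    # copies the dims/dtype of the corresponding input; programs saved by
    # older versions may not carry XShape at all, hence the None check.
    for inp, xshape in zip(inputs, xshapes):
        if xshape is not None:
            xshape.dims, xshape.dtype = inp.dims, inp.dtype
    return xshapes

ins = [Meta((2, 3), "float32"), Meta((3, 4), "float32")]
outs = infer_einsum_xshape(ins, [Meta(), None])
print([x.dims if x else None for x in outs])   # [(2, 3), None]
```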
Output // may have nullptr and the cache.size() is not equal to inputs.size(). refer diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 73935640e349b..4b4a75727a55c 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -442,8 +442,14 @@ void MultiplyDoubleGradKernel(const Context& dev_ctx, // (5) dx = dout * ddy if (ddout) { auto& place = *dev_ctx.eigen_device(); - // size(ddout) > size(ddx), ddout can't use memory of ddx using inplace - if (ddout->numel() > ddx.get_ptr()->numel()) { + // size(ddout) > size(ddx) or we don't have ddx, ddout can't use memory of + // ddx using inplace + + bool without_ddx = (ddx.get_ptr() == nullptr); + if (!without_ddx) { + without_ddx = (ddout->numel() > ddx.get_ptr()->numel()); + } + if (without_ddx) { phi::funcs::ElemwiseGradCompute, MulGradDY>( dev_ctx, ddx_safe, diff --git a/paddle/phi/kernels/impl/lerp_kernel_impl.h b/paddle/phi/kernels/impl/lerp_kernel_impl.h index 58759308fac41..72fa0672a5f48 100644 --- a/paddle/phi/kernels/impl/lerp_kernel_impl.h +++ b/paddle/phi/kernels/impl/lerp_kernel_impl.h @@ -28,7 +28,7 @@ static void LerpFunction(const Context& ctx, DenseTensor* out) { ctx.template Alloc(out); - auto out_dims = out->dims(); + const auto& out_dims = out->dims(); auto x_dims = phi::funcs::ExtendDims2Rank(x.dims(), D); auto y_dims = phi::funcs::ExtendDims2Rank(y.dims(), D); auto w_dims = phi::funcs::ExtendDims2Rank(weight.dims(), D); diff --git a/paddle/phi/ops/compat/einsum_sig.cc b/paddle/phi/ops/compat/einsum_sig.cc index 5e45bcf97ce0e..4fd31c1a2d842 100644 --- a/paddle/phi/ops/compat/einsum_sig.cc +++ b/paddle/phi/ops/compat/einsum_sig.cc @@ -18,7 +18,7 @@ namespace phi { KernelSignature EinsumOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature( - "einsum", {"Operands"}, {"equation"}, {"Out", "InnerCache"}); + "einsum", {"Operands"}, {"equation"}, {"Out", "InnerCache", "XShape"}); } KernelSignature EinsumGradOpArgumentMapping(const ArgumentMappingContext& ctx) { diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 2a18d2f7e0195..f538193782179 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -680,7 +680,12 @@ pip install requests set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;^ %THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;^ -%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH% +%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;^ +%THIRD_PARTY_PATH:/=\%\install\onnxruntime\lib;%THIRD_PARTY_PATH:/=\%\install\paddle2onnx\lib;^ +%work_dir%\%BUILD_DIR%\paddle\fluid\inference;%PATH% + +REM TODO: make ut find .dll in install\onnxruntime\lib +xcopy %THIRD_PARTY_PATH:/=\%\install\onnxruntime\lib\onnxruntime.dll %work_dir%\%BUILD_DIR%\paddle\fluid\inference\tests\api\ /Y if "%WITH_GPU%"=="ON" ( call:parallel_test_base_gpu diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 3ed5f992ed40c..9e4aac55f5d2d 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -955,6 +955,9 @@ function fetch_upstream_develop_if_not_exist() { } function check_whl_size() { + if [ ${BRANCH} != 'develop' ];then + return + fi set +x pr_whl_size=`du -m ${PADDLE_ROOT}/build/pr_whl/*.whl|awk '{print $1}'` @@ -1094,6 
+1097,10 @@ function check_approvals_of_unittest() { fi fi elif [ $check_times == 3 ]; then + if [ ${BRANCH} != 'develop' ];then + return + fi + rm -f fluidInference_so_size curl -O https://paddle-docker-tar.bj.bcebos.com/paddle_ci_index/fluidInference_so_size oriBuildSize=`cat fluidInference_so_size` @@ -3276,6 +3283,10 @@ function build_develop() { } function check_coverage_build() { + if [ ${BRANCH} != 'develop' ];then + return + fi + rm -f build_size curl -O https://paddle-docker-tar.bj.bcebos.com/paddle_ci_index/build_size curl -O https://xly-devops.bj.bcebos.com/PR/build_whl/${AGILE_PULL_ID}/${AGILE_REVISION}/coverage_build_size diff --git a/python/paddle/device/cuda/graphs.py b/python/paddle/device/cuda/graphs.py index dca32fb6bb85b..5c9c8740d85bb 100644 --- a/python/paddle/device/cuda/graphs.py +++ b/python/paddle/device/cuda/graphs.py @@ -14,7 +14,10 @@ import os import paddle +from paddle.fluid import core +from paddle.fluid.layers.utils import _hash_with_id from paddle.fluid.core import is_compiled_with_cuda, is_compiled_with_rocm, CUDAPlace +import warnings if is_compiled_with_cuda() and not is_compiled_with_rocm(): from paddle.fluid.core import CUDAGraph as CoreCUDAGraph @@ -106,3 +109,335 @@ def wrap_cuda_graph(function, mode="thread_local", memory_pool="default"): else: mock_func._cuda_graph_pool_id = memory_pool._cuda_graph_pool_id return new_function + + +def copy_var_desc(dst, src): + """ + copy var desc from src to dst + + :param dst: framework.VarDesc(cpp), dst var desc, cpp VarDesc instance + :param src: framework.VarDesc(cpp), src var desc, cpp VarDesc instance + :return: no return + """ + dst.set_shape(src.shape) + dst.set_dtype(src.dtype) + dst.set_lod_level(src.lod_level) + dst.set_type(src.type) + dst.set_persistable(src.persistable) + dst.set_is_parameter(src.is_parameter) + dst.set_stop_gradient(src.stop_gradient) + + +def all_inputs_of_later_op(block, begin_idx): + """ + find all inputs of ops after an idx, used to determine the logical output of a cuda graph section + + :param block: framework.Block, the original block + :param begin_idx: int, from which idx (not include) to find the later ins + :return: a list of inputs names for all ops behind begin_idx + """ + ins = [] + for idx, op in enumerate(block.ops): + if idx <= begin_idx: + continue + for in_name in op.input_arg_names: + ins.append(in_name) + return list(set(ins)) + + +def construct_program_and_find_ins_outs(section, origin_program, section_idx): + """ + 1. Construct a new program for corresponding section + 2. 
Find all the logical inputs and outputs of a program section + + :param section: list, one cuda graph section, list of ops + :param origin_program: framework.Program, origin program + :param section_idx: list, the section ops' idx corresponding to the cuda graph section, a list of idx + :return: a new program for the cuda graph section + the logical ins and outs of the cuda graph section + """ + program = paddle.static.Program() + block = program.global_block() + origin_block = origin_program.global_block() + ins = [] + outs = [] + op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() + later_ins = all_inputs_of_later_op(origin_block, section_idx[-1]) + + for op in section: + for in_name in op.input_arg_names: + var = origin_block.var(in_name) + new_var_desc = block.desc.var(var.name.encode("ascii")) + copy_var_desc(new_var_desc, var) + if outs.count(in_name) == 0 and ins.count(in_name) == 0: + # This in var is generated from op outside this section + # Only record once for same input + ins.append(in_name) + elif later_ins.count(in_name) == 0: + # this is var is generated from op inside this section, and only will be used inside this section + outs.remove(in_name) + for out_name in op.output_arg_names: + var = origin_block.var(out_name) + new_var_desc = block.desc.var(var.name.encode("ascii")) + copy_var_desc(new_var_desc, var) + # for every output, we add it to the section's outs + if outs.count(out_name) == 0: + # Only record one out var even if it will be generated by multi ops. + # For scenario like this: + # A = op1(a) + # A = op2(b) + # B = op3(A) + outs.append(out_name) + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(op.desc) + new_op_desc._set_attr(op_role_attr_name, op.attr(op_role_attr_name)) + + program._sync_with_cpp() + + return program, [ins, outs] + + +def get_cuda_graph_sections(program): + """ + get all sections that should run under cuda graph and the corresponding idx + + :param program: framework.Program, the original program + :return: A list of cuda graph sections and the corresponding ops' idx in the block. + The program is under is test or not. + """ + block = program.global_block() + cuda_graph_sections = [] # record all ops in every cuda graph sections + sections_idx = [] # idx of all ops in every cuda graph sections + is_test = False # will be set to True is any op's 'is_test' attr is True + + # ops and it's idx between cuda graph wrapped op, may belong to a section + internal_section = [] + internal_idx = [] + + current_section = [] # current recording cuda graph sections + current_idx = [] # current recording cuda graph ops' idx + current_cuda_graph_id = -1 # current recording cuda graph id + op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() + loss_op_role = int(core.op_proto_and_checker_maker.OpRole.Loss) + backward_op_role = int(core.op_proto_and_checker_maker.OpRole.Backward) + loss_grad_op_role = loss_op_role | backward_op_role + + for idx, op in enumerate(block.ops): + if op.type == 'conditional_block' or op.type == 'while': + assert op._cuda_graph_attr is None, "Cuda graph not support conditional block op and while op." 
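The input/output classification implemented by `construct_program_and_find_ins_outs` above boils down to: a name consumed inside the section is a logical input unless the section itself produced it, every produced name is tentatively an output, and a produced name is dropped from the outputs again if nothing after the section (`later_ins`) reads it. A self-contained sketch of the same rule over plain name lists, with illustrative data:

```python
def find_section_io(section_ops, later_inputs):
    """section_ops: list of (input_names, output_names) per op, in order.
    later_inputs: names read by ops that come after the section."""
    ins, outs = [], []
    for op_ins, op_outs in section_ops:
        for name in op_ins:
            if name not in outs and name not in ins:
                ins.append(name)            # produced outside the section
            elif name in outs and name not in later_inputs:
                outs.remove(name)           # purely internal temporary
        for name in op_outs:
            if name not in outs:
                outs.append(name)           # record each output once
    return ins, outs

ops = [(["x"], ["t"]), (["t"], ["y"])]      # x -> t -> y inside the section
print(find_section_io(ops, later_inputs=["y"]))   # (['x'], ['y'])
```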
+ if op.has_attr('is_test') and op.attr('is_test'): + is_test = True + # find cuda graph sections + if op._cuda_graph_attr is not None: + assert isinstance(op._cuda_graph_attr, + str), "cuda_graph_attr should be a str" + cuda_graph_attrs = op._cuda_graph_attr.split(';') + assert len(cuda_graph_attrs) == 3, "cuda graph attr should have three fields: " \ + "cuda graph mode, cuda graph memory pool id, cuda graph id" + local_cuda_graph_id = int(cuda_graph_attrs[2]) + if local_cuda_graph_id == current_cuda_graph_id: + if len(internal_section) > 0: + assert len(internal_section) == len( + internal_idx + ), "len of internal section should be equal with len of internal idx" + for internal_op in internal_section: + loss_related = (int(internal_op.attr(op_role_attr_name)) + == loss_op_role) or int( + (internal_op.attr(op_role_attr_name) + ) == loss_grad_op_role) + sub_block_related = (op.type == 'conditional_block' + or op.type == 'while') + if loss_related or sub_block_related: + # if loss_related is True + # The internal section contains loss related ops, + # although these ops are between two cuda graph sections with same graph id, + # they belong to none of these two sections. + # The loss related op should be wrapped by user explicitly. + + # if sub_block_related is True + # The internal section contains while op or conditional block op. + # These two ops are not supported by cuda graph. Won't extend the section. + internal_section = [] + internal_idx = [] + # Beside clear the internal section, a new cuda graph section should be recorded + assert len(current_section) == len(current_idx), \ + "num of section's op is not equal with the idx" + if len(current_section) > 0: + # store previous section + cuda_graph_sections.append(current_section) + sections_idx.append(current_idx) + current_section = [] + current_idx = [] + break + # some ops inserted by some optimizer, should be added to current section + for i in range(len(internal_section)): + current_section.append(internal_section[i]) + current_idx.append(internal_idx[i]) + internal_section = [] + current_section.append(op) + current_idx.append(idx) + else: + # current graph id is different with previous, start a new section of cuda graph + # internal ops and idx belong to no section, just clear it + internal_section = [] + internal_idx = [] + current_cuda_graph_id = local_cuda_graph_id # start record a new section + assert len(current_section) == len( + current_idx + ), "num of section's op is not equal with num of idx" + if len(current_section) > 0: + # store previous section + cuda_graph_sections.append(current_section) + sections_idx.append(current_idx) + current_section = [op] + current_idx = [idx] + else: + # recode ops which cuda_graph_attr is None, may belong to a section + internal_section.append(op) + internal_idx.append(idx) + + # handle the last section + assert len(current_section) == len( + current_idx), "num of section's op is not equal with num of idx" + if len(current_section) > 0: + # store previous section + cuda_graph_sections.append(current_section) + sections_idx.append(current_idx) + + return cuda_graph_sections, sections_idx, is_test + + +def replace_cuda_graph_section(ins_and_outs, section_program, section_idx, + origin_program, cuda_graph_section, order, + is_test): + """ + Use section_program and ins_and_outs to initialize a run_program_op, + and replace the section_idx marks ops in the origin program. 
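The sectioning done by `get_cuda_graph_sections` above can be summarized as: walk the block once, group consecutive ops that carry the same cuda graph id (the third field of `_cuda_graph_attr`, formatted as `mode;pool_id;graph_id`), and absorb unmarked ops that sit between two runs of the same id, since optimizers such as AMP or sharding may insert them. The real function additionally closes a section early when those in-between ops are loss-related or contain control flow; the stripped-down sketch below shows just the grouping, with `None` marking an unmarked op:

```python
def group_sections(graph_ids):
    """graph_ids: per-op cuda graph id, or None for ops without the attr.
    Returns one list of op indices per section."""
    sections, current, pending, cur_id = [], [], [], None
    for idx, gid in enumerate(graph_ids):
        if gid is None:
            pending.append(idx)             # may belong to the running section
            continue
        if gid == cur_id:
            current.extend(pending)         # absorb ops inserted in between
        else:
            if current:
                sections.append(current)    # close the previous section
            current, cur_id = [], gid
        pending = []
        current.append(idx)
    if current:
        sections.append(current)
    return sections

print(group_sections([0, None, 0, None, 1, 1]))   # [[0, 1, 2], [4, 5]]
```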
+ + :param ins_and_outs: list, the logical ins and outs of the section program + :param section_program: framework.Program, the partial program need to run under cuda graph + :param section_idx: list, the idx need to be removed from origin program + :param origin_program: framework.Program, the origin program + :param cuda_graph_section: list, the ops in current sections, used to get the mode, memory pool id and is_test + :param order: int, the order of current section, used to create unique cuda graph var + :param is_test: bool, the program is running under is_test or not + :return: no return + """ + ins = ins_and_outs[0] + outs = ins_and_outs[1] + insert_idx = section_idx[0] + origin_block = origin_program.global_block() + + for idx in reversed(section_idx): + # remove all cuda graph marked ops from origin block + origin_block._remove_op(idx, sync=False) + + mode = None + memory_pool_id = None + + for op in cuda_graph_section: + # find the cuda graph mode and memory pool id, determine is test or not + if op._cuda_graph_attr is not None: + attrs = op._cuda_graph_attr.split(';') + mode = attrs[0] + memory_pool_id = int(attrs[1]) + break + + assert mode is not None and memory_pool_id is not None, \ + "mode and memory pool id should be specified in cuda graph attr" + + cuda_graph_var = origin_block.create_var( + name="cuda_graph_" + str(order), + type=core.VarDesc.VarType.RAW, + persistable=True, + stop_gradient=True, + ) + + # not used for the run_program_op, just needed by the op, but won't be used + out_scope_var = origin_block.create_var( + name="program_out_scope_" + str(order), + type=core.VarDesc.VarType.STEP_SCOPES, + persistable=True, + stop_gradient=True, + ) + + program_id = _hash_with_id(section_program, ins_and_outs) + + # insert the run_program_op into the block + origin_block._insert_op(insert_idx, + type='run_program', + inputs={'X': ins}, + outputs={ + 'Out': outs, + 'OutScope': out_scope_var, + 'CUDAGraph': cuda_graph_var + }, + attrs={ + 'global_block': + section_program.global_block(), + 'start_op_index': + 0, + 'end_op_index': + len(section_program.global_block().ops), + 'is_test': + is_test, + 'program_id': + program_id, + 'cuda_graph_capture_mode': + mode, + 'cuda_graph_pool_id': + memory_pool_id, + }) + + +def cuda_graph_transform(program): + """ + replace the ops marked with cuda_graph_attr to run_program_op to use cuda graph + + :param program: framework.Program, the program to be transformed + :return: the cuda graph section program, user should hold these programs! + """ + + if len(program.blocks) > 1: + # some sub blocks may be inserted by optimizer but will not use during training, just warn here + warnings.warn( + "Sub block(s) has been detected in the program. " + "Cuda graph not support op with sub block, and it will only handle the global block." + ) + + # step 1: get all cuda graph sections. + # A cuda graph section contains all ops marked with same cuda graph id and + # some ops inserted by some optimizers (amp, sharding for example) between ops with same id. + cuda_graph_sections, sections_idx, is_test = get_cuda_graph_sections( + program) + assert len(cuda_graph_sections) == len(sections_idx), \ + "num of cuda graph sections is not equal with num of idx sections" + + # step 2: construct new program for each section and find inputs and outputs of each section. + # The inputs are variables generated outside the section but will be used by this section. + # The outputs are variables generated by this section and will be used after the end of the section. 
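`replace_cuda_graph_section` then deletes the section's ops from the original block and drops a single `run_program` op in their place, with the section's ins/outs as `X`/`Out`, the capture mode and memory pool id parsed from `_cuda_graph_attr`, and a persistable RAW `CUDAGraph` variable plus a step-scope output. One detail worth calling out is that the removal loop iterates `section_idx` in reverse, because deleting from the front would shift the indices that are still pending; a plain-list illustration:

```python
# Deleting in reverse keeps the not-yet-deleted indices valid; deleting in
# forward order would shift everything after the removed position.
ops = ["fill_constant", "matmul", "relu", "reduce_mean"]
section_idx = [1, 2]                     # ops folded into one run_program op
for idx in reversed(section_idx):
    del ops[idx]
ops.insert(section_idx[0], "run_program")
print(ops)                               # ['fill_constant', 'run_program', 'reduce_mean']
```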
+ ins_and_outs = [] + section_programs = [] + for i in range(len(cuda_graph_sections)): + # creating new program for current section + section_program, ins_outs = construct_program_and_find_ins_outs( + cuda_graph_sections[i], program, sections_idx[i]) + ins_and_outs.append(ins_outs) + section_programs.append(section_program) + assert len(section_programs) == len(cuda_graph_sections), \ + "the num of cuda graph sections should be equal with the num of new program" + + # step 3: replace the ops in original program with run_program_op. + # Will remove all ops in the section from origin program, and use run_program_op to replace them. + for i in reversed(range(len(cuda_graph_sections))): + # carry out the replacement in reversed order, to keep the previous idx intact + replace_cuda_graph_section(ins_and_outs[i], + section_programs[i], + sections_idx[i], + program, + cuda_graph_sections[i], + order=i, + is_test=is_test) + + # NOTE: user should hold these program, for now just return these program back to caller + return section_programs diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index a0b2125f16642..dcdd098dcd9cc 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -17,6 +17,7 @@ from collections import defaultdict import paddle +import paddle.utils as utils import paddle.distributed.auto_parallel as auto from paddle import fluid, static @@ -26,9 +27,9 @@ from paddle.fluid import core from paddle.fluid import program_guard from paddle.fluid.layers.utils import flatten -from paddle.fluid.executor import global_scope +from paddle.fluid.executor import global_scope, _to_name_str from paddle.fluid.backward import append_backward -from paddle.fluid.framework import Operator, Variable +from paddle.fluid.framework import Operator from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.distributed import fleet @@ -137,7 +138,8 @@ def _build(self): metrics = [] serial_main_prog = self._orig_main_prog.clone() serial_startup_prog = self._orig_startup_prog.clone() - with static.program_guard(serial_main_prog, serial_startup_prog): + with static.program_guard(serial_main_prog, serial_startup_prog), \ + utils.unique_name.guard(): inputs_spec = self.inputs_spec labels_spec = self.labels_spec if self.labels_spec else [] inputs = [s._create_feed_layer() for s in inputs_spec] @@ -256,7 +258,7 @@ def fit(self, train_data, batch_size=1, epochs=1, - fetch_list=None, + fetches=None, steps_per_epoch=None, use_program_cache=False, return_numpy=True): @@ -267,134 +269,131 @@ def fit(self, "train model is not ready, please call `engine.prepare()` first." 
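A small but easy-to-miss change above is that `Engine._build` now wraps program construction in `utils.unique_name.guard()` in addition to `program_guard`. A hedged reading: the guard opens a fresh auto-naming scope, so the generated variable names do not depend on whatever was created earlier in the process and repeated builds stay consistent. The behaviour itself is easy to verify:

```python
from paddle.utils import unique_name

# Each guard starts a fresh name generator, so the same prefix yields the
# same name in every build instead of drifting to fc_1, fc_2, ...
with unique_name.guard():
    print(unique_name.generate("fc"))   # fc_0
with unique_name.guard():
    print(unique_name.generate("fc"))   # fc_0 again
```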
train_dataloader = self._create_dataloader(train_data, batch_size, epochs, steps_per_epoch) - self._usr_fetch_list = fetch_list - outputs = [] + usr_fetch = self._to_map_fetch(fetches) + fetch_loss = self._inner_fetch(self.fetch_vars["loss"]) + fetch_list, fetch_map = self._fetch_map(fetch_loss, usr_fetch) + for epoch in range(epochs): - for step, data in enumerate(train_dataloader): - logs, outs = self._train_step(data, use_program_cache, - return_numpy) - outputs.append(outs) - train_logs = { - "train_" + name: val - for name, val in logs.items() - } + train_logs = {"epoch": epoch} + for step, _ in enumerate(train_dataloader): + outs = self._executor.run(self.main_program, + fetch_list=fetch_list, + use_program_cache=use_program_cache, + return_numpy=return_numpy) + train_logs["step"] = step + # inner fetches + if fetch_loss: + train_logs["train_loss"] = outs[0][0] + # user fetches + user_outs = outs[len(fetch_loss):] + user_fetch_list = fetch_list[len(fetch_loss):] + for i, out in enumerate(user_outs): + train_logs["train_" + + fetch_map[user_fetch_list[i]]] = out[0] self._logger.info(train_logs) - return outputs def evaluate(self, eval_data, batch_size=1, - fetch_list=None, + fetches=None, use_program_cache=False, return_numpy=True): self.mode = 'eval' assert self.mode in self._dist_main_progs, \ "eval model is not ready, please call `engine.prepare()` first." eval_dataloader = self._create_dataloader(eval_data, batch_size) - self._usr_fetch_list = fetch_list - - for step, data in enumerate(eval_dataloader): - eval_logs = dict() - logs, outs = self._eval_step(data, use_program_cache, return_numpy) - eval_logs["eval_loss"] = outs[0] if len(outs) > 0 else [] - for metric in self._metrics: - results = metric.accumulate() - for i, res in enumerate(to_list(results)): - eval_logs["eval_" + metric.name()[i]] = res - for name, val in logs.items(): - eval_logs["eval_" + name] = val + + usr_fetch = self._to_map_fetch(fetches) + fetch_loss = self._inner_fetch(self.fetch_vars["loss"]) + fetch_metrics = self._inner_fetch(self.fetch_vars["metrics"]) + inner_fetch = dict(fetch_loss, **fetch_metrics) + fetch_list, fetch_map = self._fetch_map(inner_fetch, usr_fetch) + + for step, _ in enumerate(eval_dataloader): + eval_logs = {"step": step} + outs = self._executor.run(self.main_program, + fetch_list=fetch_list, + use_program_cache=use_program_cache, + return_numpy=return_numpy) + # inner fetches + if fetch_loss: + eval_logs["eval_loss"] = outs[0] + # Metric + if fetch_metrics: + metric_out = outs[len(fetch_loss):len(inner_fetch)] + for metric in self._metrics: + metric.update(*metric_out) + results = metric.accumulate() + for i, res in enumerate(to_list(results)): + eval_logs["eval_" + metric.name()[i]] = res + # usr fetches + usr_out = outs[len(inner_fetch):] + usr_fetch_list = fetch_list[len(inner_fetch):] + for i, out in enumerate(usr_out): + eval_logs["eval_" + fetch_map[usr_fetch_list[i]]] = out + # logger self._logger.info(eval_logs) - return eval_logs def predict(self, test_data, batch_size=1, - fetch_list=None, + fetches=None, use_program_cache=False, return_numpy=True): self.mode = 'predict' assert self.mode in self._dist_main_progs, \ "predict model is not ready, please call `engine.prepare()` first." 
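In the rewritten fit/evaluate/predict loops above, `Executor.run` returns one flat list aligned with `fetch_list`, and the code recovers the groups by slicing with the known sizes: inner loss values first, then metric tensors, then user fetches. A trivial standalone illustration of that convention (values are made up):

```python
def split_run_results(outs, num_loss, num_inner):
    """loss first, metrics next, user-requested fetches last."""
    return outs[:num_loss], outs[num_loss:num_inner], outs[num_inner:]

# one loss value, one metric tensor, one user fetch
print(split_run_results([[0.31], [0.92], [7]], num_loss=1, num_inner=2))
# ([[0.31]], [[0.92]], [[7]])
```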
test_dataloader = self._create_dataloader(test_data, batch_size) - self._usr_fetch_list = fetch_list + + usr_fetch = self._to_map_fetch(fetches) + fetch_outputs = self._inner_fetch(self.fetch_vars["outputs"]) + fetch_list, fetch_map = self._fetch_map(fetch_outputs, usr_fetch) outputs = [] - for step, data in enumerate(test_dataloader): - logs, outs = self._predict_step(data, use_program_cache, - return_numpy) - outputs.append(outs) - predict_logs = {"pred_" + name: val for name, val in logs.items()} + for step, _ in enumerate(test_dataloader): + predict_logs = {"step": step} + outs = self._executor.run(self.main_program, + fetch_list=fetch_list, + use_program_cache=use_program_cache, + return_numpy=return_numpy) + outputs.append(outs[:len(fetch_outputs)]) + for i, out in enumerate(outs): + predict_logs["pred_" + fetch_map[fetch_list[i]]] = out[0] self._logger.info(predict_logs) + return outputs - def _train_step(self, data, use_program_cache=False, return_numpy=True): - logs = {} - fetch_vars = self._fetch_vars[self.mode]["loss"] - fetch_list, usr_fetch_list = self._fetch_list(fetch_vars) - fetch_list += usr_fetch_list - - outs = self._executor.run(self.main_program, - fetch_list=fetch_list, - use_program_cache=use_program_cache, - return_numpy=return_numpy) - for i, out in enumerate(outs): - logs[fetch_list[i]] = out - return logs, outs - - def _eval_step(self, data, use_program_cache=False, return_numpy=True): - logs = {} - metrics = self._fetch_vars[self.mode]["metrics"] - losses = self._fetch_vars[self.mode]["loss"] - fetch_loss, usr_fetch_list = self._fetch_list(losses) - fetch_metrics, usr_fetch_list = self._fetch_list(metrics) - fetch_list = fetch_loss + fetch_metrics - - outs = self._executor.run(self.main_program, - fetch_list=fetch_list + usr_fetch_list, - use_program_cache=use_program_cache, - return_numpy=return_numpy) - usr_out = outs[len(fetch_list):] - for i, out in enumerate(usr_out): - logs[usr_fetch_list[i]] = out - outs = outs[:len(fetch_list)] - if not outs[len(fetch_loss):]: - return logs, outs[:len(fetch_loss)] - for metric in self._metrics: - metric.update(*outs[len(fetch_loss):]) - return logs, outs[:len(fetch_loss)] - - def _predict_step(self, data, use_program_cache=False, return_numpy=True): - logs = {} - fetch_vars = self._fetch_vars[self.mode]["outputs"] - fetch_list, usr_fetch_list = self._fetch_list(fetch_vars) - fetch_list += usr_fetch_list - - outs = self._executor.run(self.main_program, - fetch_list=fetch_list, - use_program_cache=use_program_cache, - return_numpy=return_numpy) - for i, out in enumerate(outs): - logs[fetch_list[i]] = out - return logs, outs - - def _fetch_list(self, fetch_vars): - fetch_list = [] - for var in fetch_vars: - if var.name in self.main_program.global_block().vars: - fetch_list.append(var.name) - usr_fetch_list = [] - if self._usr_fetch_list: - assert isinstance(self._usr_fetch_list, - list), "'fetch_list' type should be list." 
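The removed `_train_step`/`_eval_step`/`_predict_step` and `_fetch_list` helpers are replaced by the three mapping utilities defined just below: `_to_map_fetch` normalizes the user's `fetches` argument (a list of variables/names, or a dict keyed by display name) into `{var_name: display_name}` and filters out anything not present in the local program, `_inner_fetch` does the same for the engine's own loss/metric/output variables, and `_fetch_map` merges the two so a user-chosen display name overrides the default. A rough dict-only sketch of the merge, with an illustrative variable name:

```python
def to_map_fetch(fetches):
    """Normalize user fetches into {var_name: display_name}."""
    if not fetches:
        return {}
    if isinstance(fetches, dict):                # {"display_name": var_or_name}
        return {v: k for k, v in fetches.items()}
    return {name: name for name in fetches}      # plain list

def fetch_map(inner, user):
    """Merge engine fetches with user fetches; user display names win."""
    merged = dict(inner)
    merged.update(user)
    return list(merged.keys()), merged

inner = {"mean_0.tmp_0": "mean_0.tmp_0"}         # hypothetical loss var name
user = to_map_fetch({"loss": "mean_0.tmp_0"})
print(fetch_map(inner, user))                    # (['mean_0.tmp_0'], {'mean_0.tmp_0': 'loss'})
```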
- for var in self._usr_fetch_list: - if isinstance(var, str): - if var in self.main_program.global_block().vars: - usr_fetch_list.append(var) - elif isinstance(var, Variable): - if var.name in self.main_program.global_block().vars: - usr_fetch_list.append(var.name) - return fetch_list, usr_fetch_list + def _local_var(self, var): + var_name = _to_name_str(var) + return var_name in self.main_program.global_block().vars + + def _to_map_fetch(self, fetches): + if not fetches: + return {} + if isinstance(fetches, dict): + fetch_var_names = list(map(_to_name_str, fetches.values())) + usr_fetches = dict(zip(fetch_var_names, list(fetches.keys()))) + elif isinstance(fetches, list): + fetch_var_names = list(map(_to_name_str, fetches)) + usr_fetches = dict(zip(fetch_var_names, fetch_var_names)) + return dict(filter(lambda x: self._local_var(x[0]), + usr_fetches.items())) + + def _inner_fetch(self, fetch_vars): + fetch_list = list( + map(lambda x: x.name, list(filter(self._local_var, fetch_vars)))) + inner_fetches = dict(zip(fetch_list, fetch_list)) + return inner_fetches + + def _fetch_map(self, inner_fetch, usr_fetch): + # replace inner fetch name if usr set for it + for iname in inner_fetch: + if iname in usr_fetch: + inner_fetch[iname] = usr_fetch[iname] + usr_fetch.pop(iname) + fetches = dict(inner_fetch, **usr_fetch) + return list(fetches.keys()), fetches def _create_dataloader(self, dataset, @@ -515,7 +514,8 @@ def save(self, path, training=True, mode=None): mode = self.mode if training: - assert 'train' in self._serial_main_progs, "training model is not ready, please call `engine.prepare(mode='train')` first." + assert 'train' in self._serial_main_progs, \ + "training model is not ready, please call `engine.prepare()` first." serial_program = self._serial_main_progs["train"] dist_main_prog = self._dist_main_progs["train"][self._cur_rank] dist_context = self._dist_contexts["train"] @@ -571,3 +571,7 @@ def serial_main_program(self): @property def serial_startup_program(self): return self._serial_startup_progs[self.mode] + + @property + def fetch_vars(self): + return self._fetch_vars[self.mode] diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 762b961da53ba..d41f0fbb84570 100755 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -1811,8 +1811,8 @@ def unscale_method(self, optimizer): if (param._grad_ivar() is not None) and ( param._grad_ivar().dtype == core.VarDesc.VarType.FP32) ] - temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) - temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool)) + temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool_)) + temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool_)) if len(param_grads_fp16): _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, param_grads_fp16, diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py index b1e0f6cc13068..fcbbadbe12159 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py @@ -200,8 +200,8 @@ def unscale_method(self, optimizer): else: param_grads_fp32.append(param.grad) - temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) - temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool)) + 
temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool_)) + temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool_)) device = "cpu" if optimizer.offload else "gpu" dev_id = 0 if device == "cpu" else int( diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py index ae98d4bdf7b1e..63e2b91b3d9bd 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py @@ -201,8 +201,8 @@ def unscale_method(self, optimizer): else: param_grads_fp32.append(param.grad) - temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) - temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool)) + temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool_)) + temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool_)) device = "cpu" if optimizer.offload else "gpu" dev_id = 0 if device == "cpu" else int( diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py index 423536b095a40..1f4439cf1171f 100755 --- a/python/paddle/distributed/fleet/utils/recompute.py +++ b/python/paddle/distributed/fleet/utils/recompute.py @@ -73,8 +73,6 @@ class EagerRecomputeFunction(EagerPyLayer): @staticmethod def forward(ctx, run_function, preserve_rng_state, *args): from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker - if framework._dygraph_tracer()._has_grad: - check_recompute_necessary(args) # store for recomputing ctx.run_function = run_function @@ -211,8 +209,6 @@ class RecomputeFunction(PyLayer): @staticmethod def forward(ctx, run_function, preserve_rng_state, *args): from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker - if framework._dygraph_tracer()._has_grad: - check_recompute_necessary(args) # store for recomputing ctx.run_function = run_function @@ -466,6 +462,9 @@ def run_model(cuda_state, recompute_block=[], recompute_kwargs={}): raise ValueError("Unexpected keyword arguments: " + ",".join(arg for arg in kwargs)) + if framework._dygraph_tracer()._has_grad: + check_recompute_necessary(args) + if in_dygraph_mode(): return EagerRecomputeFunction.apply(function, preserve, *args) else: diff --git a/python/paddle/distributed/launch/context/device.py b/python/paddle/distributed/launch/context/device.py index 7df7db28f7877..f03d0ea3d41ef 100644 --- a/python/paddle/distributed/launch/context/device.py +++ b/python/paddle/distributed/launch/context/device.py @@ -21,6 +21,7 @@ class DeviceType: XPU = 'xpu' NPU = 'npu' MLU = 'mlu' + IPU = 'ipu' class Device(object): @@ -69,6 +70,8 @@ def get_selected_device_key(self): return 'FLAGS_selected_xpus' if self._dtype == DeviceType.MLU: return 'FLAGS_selected_mlus' + if self._dtype == DeviceType.IPU: + return 'FLAGS_selected_ipus' return 'FLAGS_selected_devices' def get_selected_devices(self, devices=''): @@ -130,6 +133,12 @@ def detect_device(self): dev._dtype = DeviceType.MLU num = fluid.core.get_mlu_device_count() visible_devices = os.getenv("MLU_VISIBLE_DEVICES") + elif fluid.core.is_compiled_with_ipu(): + dev._dtype = DeviceType.IPU + num = fluid.core.get_ipu_device_count() + # For IPUs, 'labels' is a list which contains the available numbers of IPU devices. 
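The three `unscale_method` fixes above replace `np.bool` with `np.bool_`: `np.bool` was only a deprecated alias for Python's built-in `bool` (deprecated in NumPy 1.20 and removed in later releases), so `astype(np.bool)` warns or fails on newer NumPy, while `np.bool_` is the actual NumPy boolean scalar type and keeps working:

```python
import numpy as np

# np.bool_ is the real NumPy boolean type; the removed np.bool alias would
# raise AttributeError on recent NumPy versions.
found_inf = np.array([0]).astype(np.bool_)
print(found_inf, found_inf.dtype)   # [False] bool
```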
+ dev._labels = [str(x) for x in range(0, num + 1)] + return dev if num == 0: dev._dtype = DeviceType.CPU diff --git a/python/paddle/distributed/launch/controllers/__init__.py b/python/paddle/distributed/launch/controllers/__init__.py index f1c6ea5399a46..c686164dbd884 100644 --- a/python/paddle/distributed/launch/controllers/__init__.py +++ b/python/paddle/distributed/launch/controllers/__init__.py @@ -17,9 +17,11 @@ from .collective import CollectiveController from .collective import CollectiveElasticController from .ps import PSController +from .ipu_controller import IPUController # the order is extremely important _controllers = [ + IPUController, CollectiveElasticController, PSController, CollectiveController, diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py index a8ae155562ae9..1f43679d748f1 100644 --- a/python/paddle/distributed/launch/controllers/controller.py +++ b/python/paddle/distributed/launch/controllers/controller.py @@ -29,6 +29,7 @@ class ControleMode: COLLECTIVE = "collective" PS = "ps" + IPU = "ipu" class ControllerBase(object): diff --git a/python/paddle/distributed/launch/controllers/ipu_controller.py b/python/paddle/distributed/launch/controllers/ipu_controller.py new file mode 100644 index 0000000000000..92dc2960ab624 --- /dev/null +++ b/python/paddle/distributed/launch/controllers/ipu_controller.py @@ -0,0 +1,170 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import argparse + +from .collective import CollectiveController, ControleMode +from paddle.distributed.launch.job.container import Container + + +class IPUController(CollectiveController): + + @classmethod + def enable(cls, ctx): + if ctx.args.training_script == "ipu": + ctx.logger.debug("{} enabled".format(cls.__name__)) + ctx.args.run_mode = ControleMode.IPU + return True + else: + return False + + def parse_ipu_args(self, args_list): + parser = argparse.ArgumentParser() + parser.add_argument("--hosts", + type=str, + help="The hosts for IPU distributd training.") + parser.add_argument("--nproc_per_host", + type=int, + help="The number of processes launched per host.") + parser.add_argument("--ipus_per_replica", + type=int, + help="The number of IPUs requested per replica.") + parser.add_argument("--ipu_partition", + type=str, + help="The partition name of IPU devices.") + parser.add_argument("--vipu_server", + type=str, + help="The ip of the IPU device manager.") + parser.add_argument( + "training_script", + type=str, + help= + "The full path to the IPU distributed training program/script to be launched in parallel. e.g., ``training.py``." + ) + parser.add_argument('training_script_args', nargs=argparse.REMAINDER) + return parser.parse_args(args_list) + + def replace_training_script(self): + # IPU distributed computing is based on PopRun which is a wrapper of MPI. 
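`IPUController` is placed at the head of `_controllers` because, per the "order is extremely important" comment, the launcher appears to use the first controller whose `enable(ctx)` accepts the job; since the IPU path keys on `training_script == "ipu"`, it must be tried before the generic collective/PS controllers, which would otherwise claim the run. A schematic of that dispatch pattern (not the real launch internals):

```python
class CollectiveLike:
    @classmethod
    def enable(cls, ctx):
        return True                       # generic fallback accepts anything

class IPULike:
    @classmethod
    def enable(cls, ctx):
        return ctx.get("script") == "ipu"

def pick(controllers, ctx):
    """First controller that accepts the context wins, so order matters."""
    for c in controllers:
        if c.enable(ctx):
            return c

print(pick([IPULike, CollectiveLike], {"script": "ipu"}).__name__)  # IPULike
print(pick([CollectiveLike, IPULike], {"script": "ipu"}).__name__)  # CollectiveLike
```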
+ self.ctx.args.training_script = "poprun" + poprun_args = self.parse_ipu_args(self.ctx.args.training_script_args) + + num_ipus = int(self.ctx.args.devices) + # The number of replicas for data parallel + assert (num_ipus % poprun_args.ipus_per_replica) == 0, \ + "The number of IPUs:{} mod the number of IPUs per replica:{} must == 0".format(num_ipus, poprun_args.ipus_per_replica) + num_replicas = num_ipus // poprun_args.ipus_per_replica + self.ctx.logger.info( + "The number of total replicas is {}.".format(num_replicas)) + + # The number of processes + num_nodes = len(poprun_args.hosts.split(',')) + num_procs = num_nodes * poprun_args.nproc_per_host + self.ctx.logger.info( + "The number of total processes is {}.".format(num_procs)) + assert (num_replicas % num_procs) == 0, \ + "The number of replicas:{} mod the number of processes:{} must == 0".format(num_replicas, num_procs) + + # hosts and endpoints + hosts = poprun_args.hosts.replace(' ', '').split(',') + endpoints = [x + ":8090" for x in hosts] + + # args for poprun + poprun_command = [] + + poprun_command.append('--num-instances={}'.format(num_procs)) + poprun_command.append('--num-replicas={}'.format(num_replicas)) + poprun_command.append('--ipus-per-replica={}'.format( + poprun_args.ipus_per_replica)) + poprun_command.append('--host={}'.format(','.join(hosts))) + poprun_command.append('--vipu-partition={}'.format( + poprun_args.ipu_partition)) + poprun_command.append('--vipu-server-host={}'.format( + poprun_args.vipu_server)) + + poprun_command.extend([ + '--update-partition=no', '--vipu-server-timeout=120', + '--print-topology=yes', '--numa-aware=yes' + ]) + + # global envs + global_envs = '--mpi-local-args=\'' + log_level = os.getenv('POPART_LOG_LEVEL', None) + if log_level: + global_envs += '-x POPART_LOG_LEVEL={} '.format(log_level) + global_envs += '-x PADDLE_TRAINERS_NUM={} -x PADDLE_TRAINER_ENDPOINTS={}'.format( + num_procs, ','.join(endpoints)) + global_envs += '\'' + poprun_command.append(global_envs) + + # local envs + for idx in range(num_procs): + cur_endpoint = endpoints[idx // poprun_args.nproc_per_host] + rank_in_node = idx % poprun_args.nproc_per_host + poprun_command.append( + '--instance-mpi-local-args={}:\"-x PADDLE_TRAINER_ID={} -x PADDLE_CURRENT_ENDPOINT={} -x PADDLE_RANK_IN_NODE={}\"' + .format(idx, idx, cur_endpoint, rank_in_node)) + + # executor + poprun_command.append(sys.executable) + + # script and script args + poprun_command.append(poprun_args.training_script) + poprun_command.extend(poprun_args.training_script_args) + + # for debug + print("----------- PopRun Command -----------") + print("poprun \\") + for i in range(len(poprun_command) - 1): + print("%s \\" % (poprun_command[i])) + print("%s" % (poprun_command[len(poprun_command) - 1])) + print("---------------------------------------") + + # replace training_script_args + self.ctx.args.training_script_args = poprun_command + + def _get_entrypoint(self): + entrypoint = [self.ctx.args.training_script] + entrypoint.extend(self.ctx.args.training_script_args) + entrypoint = [" ".join(entrypoint)] + return entrypoint + + def new_container(self, + entrypoint=None, + envs={}, + use_ctx_env=True, + out=None, + err=None): + c = Container( + entrypoint=(entrypoint or self._get_entrypoint()), + env=(self.ctx.get_envs() if use_ctx_env else {}), + ) + c.outfile, c.errfile = self._get_out_err_file(out, err) + c.update_env(envs) + # Need subprocess.Popen(shell=True) for PopRun command + c.shell = True + return c + + def run(self): + # Replace the training script with 
the PopRun command + self.replace_training_script() + + self.build_job() + self.build_pod() + + self.deploy_pod() + + self.watch() diff --git a/python/paddle/distributed/launch/job/container.py b/python/paddle/distributed/launch/job/container.py index 9f7b1733d1af2..8f515d9e6f38b 100644 --- a/python/paddle/distributed/launch/job/container.py +++ b/python/paddle/distributed/launch/job/container.py @@ -37,6 +37,7 @@ def __init__(self, entrypoint=[], rank=-1, env={}): self._grace_period = 10 self._log_handler = None + self._shell = False @property def entrypoint(self): @@ -70,6 +71,14 @@ def errfile(self): def errfile(self, err): self._err = err + @property + def shell(self): + return self._shell + + @shell.setter + def shell(self, shell): + self._shell = shell + def update_env(self, env={}, **kwargs): env = {k: v for k, v in env.items() if isinstance(v, str)} self._env.update(env) @@ -109,7 +118,8 @@ def start(self): self._proc = ProcessContext(self._entrypoint, env=self._env, out=self._stdout, - err=self._stderr) + err=self._stderr, + shell=self._shell) self._proc.start() def terminate(self, force=False): diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py index f90fa7401e9a0..4c1b99df178ea 100644 --- a/python/paddle/distributed/launch/main.py +++ b/python/paddle/distributed/launch/main.py @@ -91,6 +91,26 @@ def launch(): - ``--elastic_timeout``: Seconds to wait before elastic job begin to train. Default ``--elastic_timeout=30``. + IPU Parameters: + IPU distributed launch only requires and allowes three arguments ``--devices``, ``training_script`` and ``training_script_args``. + The ``--devices`` is the number of IPU devices. e.g., ``--devices=4`` will launch the training program with four IPU devices. + The ``training_script`` is only allowed to set as ``ipu``. + The ``training_script_args`` includes arguments required by IPU distributed launch and illustrated as below. + ``Examples 10`` has provided a example of paddle.distributed.launch with IPUs. + + - ``--hosts``: The hosts for IPU distributd training. + + - ``--nproc_per_host``: The number of processes launched per host. + + - ``--ipus_per_replica``: The number of IPUs requested per replica. + + - ``--ipu_partition``: The partition name of IPU devices. + + - ``--vipu_server``: The ip of the IPU device manager. + + - ``training_script``: The full path to the IPU distributed training program/script to be launched in parallel. e.g., ``training.py``. + + - ``training_script_args``: The args of the IPU distributed training program/script. Returns: - ``None`` @@ -229,6 +249,15 @@ def launch(): # once the number of nodes changes between 2:4 during training, the strategy holds + Examples 10 (ipu): + .. code-block:: bash + :name: code-block-example-bash10 + + # With the following command, the job will begin to run the distributhed program with IPUs. + # Only support and require the `device_num` as the arg and `ipu` as the launch script. + # Please Check the details about the following args of the launch scripte from `utils/ipu_launch.py`. 
+ python -m paddle.distributed.launch --devices 4 ipu --hosts=localhost --nproc_per_host=2 --ipus_per_replica=1 --ipu_partition=pod16 --vipu_server=127.0.0.1 train.py + """ # initialize the context to run diff --git a/python/paddle/distributed/launch/utils/process_context.py b/python/paddle/distributed/launch/utils/process_context.py index 075536c8a8cb5..5d8505aa66eb3 100644 --- a/python/paddle/distributed/launch/utils/process_context.py +++ b/python/paddle/distributed/launch/utils/process_context.py @@ -24,7 +24,8 @@ def __init__(self, out=sys.stdout, err=sys.stderr, group=True, - preexec_fn=None): + preexec_fn=None, + shell=False): self._cmd = cmd self._env = env self._preexec_fn = preexec_fn @@ -33,6 +34,7 @@ def __init__(self, self._group = group if os.name != 'nt' else False self._proc = None self._code = None + self._shell = shell def _start(self): pre_fn = os.setsid if self._group else None @@ -40,7 +42,8 @@ def _start(self): env=self._env, stdout=self._stdout, stderr=self._stderr, - preexec_fn=self._preexec_fn or pre_fn) + preexec_fn=self._preexec_fn or pre_fn, + shell=self._shell) def _close_std(self): try: diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index 5c16e0fe273c4..a4888e6f90655 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -17,6 +17,10 @@ import logging import numpy as np import shutil +try: + from tqdm import tqdm +except: + from .utils import tqdm from inspect import isgeneratorfunction from .... import io from .... import core @@ -359,38 +363,41 @@ def quantize(self): self._set_activation_persistable() if self._algo in ["KL", "hist"]: - _logger.info("Preparation stage ...") batch_id = 0 + with tqdm( + total=self._batch_nums, + bar_format= + 'Preparation stage, Run batch:|{bar}| {n_fmt}/{total_fmt}', + ncols=80) as t: + for data in self._data_loader(): + self._executor.run(program=self._program, + feed=data, + fetch_list=self._fetch_list, + return_numpy=False, + scope=self._scope) + self._collect_activation_abs_min_max() + batch_id += 1 + t.update() + if self._batch_nums and batch_id >= self._batch_nums: + break + self._init_sampling_act_histogram() + + batch_id = 0 + with tqdm(total=self._batch_nums, + bar_format= + 'Sampling stage, Run batch:|{bar}| {n_fmt}/{total_fmt}', + ncols=80) as t: for data in self._data_loader(): self._executor.run(program=self._program, feed=data, fetch_list=self._fetch_list, return_numpy=False, scope=self._scope) - self._collect_activation_abs_min_max() - if batch_id % 5 == 0: - _logger.info("Run batch: " + str(batch_id)) + self._sampling() batch_id += 1 + t.update() if self._batch_nums and batch_id >= self._batch_nums: break - _logger.info("Finish preparation stage, all batch:" + str(batch_id)) - self._init_sampling_act_histogram() - - _logger.info("Sampling stage ...") - batch_id = 0 - for data in self._data_loader(): - self._executor.run(program=self._program, - feed=data, - fetch_list=self._fetch_list, - return_numpy=False, - scope=self._scope) - self._sampling() - if batch_id % 5 == 0: - _logger.info("Run batch: " + str(batch_id)) - batch_id += 1 - if self._batch_nums and batch_id >= self._batch_nums: - break - _logger.info("Finish sampling stage, all batch: " + str(batch_id)) if self._algo == 'avg': for var_name in self._quantized_act_var_name: diff --git 
a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index eaf9bed3d6fe9..0dd79992eb1df 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -14,6 +14,10 @@ import collections import numpy as np +try: + from tqdm import tqdm +except: + from .utils import tqdm from ..... import compat as cpt from .... import core from ....framework import IrGraph @@ -373,10 +377,15 @@ def _has_weight(op): graph.out_node_mapping_table = dict() # The process of _transform_forward and _transform_backward is needed in two for loops. # The loop for transforming the forward graph: - for op in ops: - if op.name() in self._quantizable_ops: - if not self._is_skip_quant(graph, op) and _has_weight(op): - _transform_forward(graph, op) + with tqdm(total=len(ops), + bar_format= + 'Adding quant op with weight:|{bar}| {n_fmt}/{total_fmt}', + ncols=80) as t: + for op in ops: + if op.name() in self._quantizable_ops: + if not self._is_skip_quant(graph, op) and _has_weight(op): + _transform_forward(graph, op) + t.update() # The loop for renaming the inputs of backward op. for op in ops: if op.name() in self._quantizable_grad_ops and _has_weight(op): @@ -1418,73 +1427,81 @@ def apply(self, graph): for op in graph.all_op_nodes(): if op.name() in self._teller_set: target_ops.append(op) - for op in target_ops: - for output_var_name in utils._get_op_output_var_names(op): - in_node = graph._find_node_by_name(op.outputs, output_var_name) - if in_node.dtype() not in \ - [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]: - continue + with tqdm(total=len(target_ops), + bar_format='Adding OutScale op:|{bar}| {n_fmt}/{total_fmt}', + ncols=80) as t: + for op in target_ops: + for output_var_name in utils._get_op_output_var_names(op): + in_node = graph._find_node_by_name(op.outputs, + output_var_name) + if in_node.dtype() not in \ + [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]: + continue - scale_node = graph.create_persistable_node( - name=self._scale_name(in_node.name()), - var_type=core.VarDesc.VarType.LOD_TENSOR, - shape=[1], - var_dtype=in_node.dtype()) - data_type = 'float64' if in_node.dtype() \ - == core.VarDesc.VarType.FP64 else 'float32' - _init_var_node(scale_node, np.ones([1], dtype=data_type), - self._scope, self._place) - ins = {'X': in_node} - outs = {'OutScale': scale_node} - if not self._is_test: - state_in_node = graph.create_persistable_node( - name=unique_name.generate('scale_state@'), + scale_node = graph.create_persistable_node( + name=self._scale_name(in_node.name()), var_type=core.VarDesc.VarType.LOD_TENSOR, - var_dtype=in_node.dtype(), - shape=[1]) - _init_var_node(state_in_node, np.ones([1], dtype=data_type), + shape=[1], + var_dtype=in_node.dtype()) + data_type = 'float64' if in_node.dtype() \ + == core.VarDesc.VarType.FP64 else 'float32' + _init_var_node(scale_node, np.ones([1], dtype=data_type), self._scope, self._place) - accum_in_node = graph.create_persistable_node( - name=unique_name.generate('scale_accum@'), - var_type=core.VarDesc.VarType.LOD_TENSOR, - var_dtype=in_node.dtype(), - shape=[1]) - _init_var_node(accum_in_node, np.ones([1], dtype=data_type), - self._scope, self._place) - state_out_node = graph.create_var_node_from_desc( - state_in_node.var()) - accum_out_node = graph.create_var_node_from_desc( - accum_in_node.var()) - - ins['InState'] = state_in_node - ins['InAccum'] = accum_in_node - 
outs['OutState'] = state_out_node - outs['OutAccum'] = accum_out_node - - attrs = { - 'moving_rate': self._moving_rate, - 'is_test': self._is_test, - 'op_role': core.op_proto_and_checker_maker.OpRole.Forward - } - scale_op_node = graph.create_op_node( - op_type='moving_average_abs_max_scale', - attrs=attrs, - inputs=ins, - outputs=outs) - graph.link_to(in_node, scale_op_node) - graph.link_to(scale_op_node, scale_node) - if not self._is_test: - graph.link_to(state_in_node, scale_op_node) - graph.link_to(accum_in_node, scale_op_node) - graph.link_to(scale_op_node, state_out_node) - graph.link_to(scale_op_node, accum_out_node) + ins = {'X': in_node} + outs = {'OutScale': scale_node} + if not self._is_test: + state_in_node = graph.create_persistable_node( + name=unique_name.generate('scale_state@'), + var_type=core.VarDesc.VarType.LOD_TENSOR, + var_dtype=in_node.dtype(), + shape=[1]) + _init_var_node(state_in_node, + np.ones([1], dtype=data_type), + self._scope, self._place) + accum_in_node = graph.create_persistable_node( + name=unique_name.generate('scale_accum@'), + var_type=core.VarDesc.VarType.LOD_TENSOR, + var_dtype=in_node.dtype(), + shape=[1]) + _init_var_node(accum_in_node, + np.ones([1], dtype=data_type), + self._scope, self._place) + state_out_node = graph.create_var_node_from_desc( + state_in_node.var()) + accum_out_node = graph.create_var_node_from_desc( + accum_in_node.var()) + + ins['InState'] = state_in_node + ins['InAccum'] = accum_in_node + outs['OutState'] = state_out_node + outs['OutAccum'] = accum_out_node + + attrs = { + 'moving_rate': self._moving_rate, + 'is_test': self._is_test, + 'op_role': + core.op_proto_and_checker_maker.OpRole.Forward + } + scale_op_node = graph.create_op_node( + op_type='moving_average_abs_max_scale', + attrs=attrs, + inputs=ins, + outputs=outs) + graph.link_to(in_node, scale_op_node) + graph.link_to(scale_op_node, scale_node) + if not self._is_test: + graph.link_to(state_in_node, scale_op_node) + graph.link_to(accum_in_node, scale_op_node) + graph.link_to(scale_op_node, state_out_node) + graph.link_to(scale_op_node, accum_out_node) + t.update() return graph def _scale_name(self, var_name): """ Return the scale name for the var named `var_name`. """ - return "%s.scale" % (var_name) + return "%s@scale" % (var_name) class OutScaleForInferencePass(object): @@ -1544,7 +1561,7 @@ def _scale_name(self, var_name): """ Return the scale name for the var named `var_name`. 
""" - return "%s.scale" % (var_name) + return "%s@scale" % (var_name) class AddQuantDequantPass(object): @@ -1624,36 +1641,43 @@ def apply(self, graph): # Forward stage, insert quant_dequant op all_op_nodes = graph.all_op_nodes() - for op_node in all_op_nodes: - if op_node.name() in self._quantizable_op_type: - is_skip = False - if isinstance(self._skip_pattern, list): - is_skip = op_node.op().has_attr("op_namescope") and \ - any(pattern in op_node.op().attr("op_namescope") for pattern in self._skip_pattern) - elif isinstance(self._skip_pattern, str): - is_skip = op_node.op().has_attr("op_namescope") and \ - op_node.op().attr("op_namescope").find(self._skip_pattern) != -1 - is_quantized = op_node.op().has_attr("quantization_type") and \ - op_node.op().attr("quantization_type") == "qat_with_weight" - if is_skip or is_quantized or \ - (not _is_input_all_not_persistable(graph, op_node)): - continue + with tqdm(total=len(all_op_nodes), + bar_format= + 'Adding quant activation op:|{bar}| {n_fmt}/{total_fmt}', + ncols=80) as t: + for op_node in all_op_nodes: + if op_node.name() in self._quantizable_op_type: + is_skip = False + if isinstance(self._skip_pattern, list): + is_skip = op_node.op().has_attr("op_namescope") and \ + any(pattern in op_node.op().attr("op_namescope") for pattern in self._skip_pattern) + elif isinstance(self._skip_pattern, str): + is_skip = op_node.op().has_attr("op_namescope") and \ + op_node.op().attr("op_namescope").find(self._skip_pattern) != -1 + is_quantized = op_node.op().has_attr("quantization_type") and \ + op_node.op().attr("quantization_type") == "qat_with_weight" + if is_skip or is_quantized or \ + (not _is_input_all_not_persistable(graph, op_node)): + continue - op_node.op()._set_attr("quantization_type", - "qat_without_weight") - op_node.op()._set_attr("activation_bits", self._quant_bits) - op_node.op()._set_attr("with_quant_attr", True) - arg_names = utils._get_op_input_var_names(op_node) - for arg_name in arg_names: - in_node = graph._find_node_by_name(op_node.inputs, arg_name) - if arg_name in dequantized_vars_map: - quant_var_node = dequantized_vars_map[arg_name] - else: - quant_var_node, _ = \ - self._inser_quant_dequant_moving_average_abs_max_op( - graph, in_node, self._quant_bits) - dequantized_vars_map[arg_name] = quant_var_node - graph.update_input_link(in_node, quant_var_node, op_node) + op_node.op()._set_attr("quantization_type", + "qat_without_weight") + op_node.op()._set_attr("activation_bits", self._quant_bits) + op_node.op()._set_attr("with_quant_attr", True) + arg_names = utils._get_op_input_var_names(op_node) + for arg_name in arg_names: + in_node = graph._find_node_by_name( + op_node.inputs, arg_name) + if arg_name in dequantized_vars_map: + quant_var_node = dequantized_vars_map[arg_name] + else: + quant_var_node, _ = \ + self._inser_quant_dequant_moving_average_abs_max_op( + graph, in_node, self._quant_bits) + dequantized_vars_map[arg_name] = quant_var_node + graph.update_input_link(in_node, quant_var_node, + op_node) + t.update() # Backward stage, update input link for op_node in all_op_nodes: @@ -2204,10 +2228,16 @@ def apply(self, graph): graph.out_node_mapping_table = dict() # The process of _transform_forward and _transform_backward is needed in two for loops. 
# The loop for transforming the forward graph: - for op in ops: - if op.name() in self._quantizable_ops: - if not self._is_skip_quant(graph, op) and self._has_weight(op): - self._transform_forward(graph, op) + with tqdm(total=len(ops), + bar_format= + 'Adding quant op with weight:|{bar}| {n_fmt}/{total_fmt}', + ncols=80) as t: + for op in ops: + if op.name() in self._quantizable_ops: + if not self._is_skip_quant(graph, + op) and self._has_weight(op): + self._transform_forward(graph, op) + t.update() # The loop for renaming the inputs of backward op. for op in ops: if op.name() in self._quantizable_grad_ops and self._has_weight(op): @@ -2310,43 +2340,50 @@ def apply(self, graph): # Forward stage, insert quant_dequant op all_op_nodes = graph.all_op_nodes() - for op_node in all_op_nodes: - if op_node.name() in self._quantizable_op_type: - is_skip = False - if isinstance(self._skip_pattern, list): - is_skip = op_node.op().has_attr("op_namescope") and \ - any(pattern in op_node.op().attr("op_namescope") for pattern in self._skip_pattern) - elif isinstance(self._skip_pattern, str): - is_skip = op_node.op().has_attr("op_namescope") and \ - op_node.op().attr("op_namescope").find(self._skip_pattern) != -1 - is_quantized = op_node.op().has_attr("quantization_type") and \ - op_node.op().attr("quantization_type") == "qat_with_weight" - if is_skip or is_quantized: - continue - - op_node.op()._set_attr("quantization_type", - "qat_without_weight") - arg_names = utils._get_op_input_var_names(op_node) - for arg_name in arg_names: - in_node = graph._find_node_by_name(op_node.inputs, arg_name) - if in_node.persistable(): + with tqdm(total=len(all_op_nodes), + bar_format= + 'Adding quant activation op:|{bar}| {n_fmt}/{total_fmt}', + ncols=80) as t: + for op_node in all_op_nodes: + if op_node.name() in self._quantizable_op_type: + is_skip = False + if isinstance(self._skip_pattern, list): + is_skip = op_node.op().has_attr("op_namescope") and \ + any(pattern in op_node.op().attr("op_namescope") for pattern in self._skip_pattern) + elif isinstance(self._skip_pattern, str): + is_skip = op_node.op().has_attr("op_namescope") and \ + op_node.op().attr("op_namescope").find(self._skip_pattern) != -1 + is_quantized = op_node.op().has_attr("quantization_type") and \ + op_node.op().attr("quantization_type") == "qat_with_weight" + if is_skip or is_quantized: continue - if arg_name in dequantized_vars_map: - dequant_var_node = dequantized_vars_map[arg_name] - else: - insert_quant_pass = InsertQuantizeLinear( - self._place, - self._scope, - quant_bits=self._quant_bits, - quant_axis=-1, - channel_wise=False, - is_test=self._is_test) - quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op( - graph, in_node) - dequant_var_node = insert_quant_pass.insert_dequant_op( - graph, quant_var_node, scale_var_node) - dequantized_vars_map[arg_name] = dequant_var_node - graph.update_input_link(in_node, dequant_var_node, op_node) + + op_node.op()._set_attr("quantization_type", + "qat_without_weight") + arg_names = utils._get_op_input_var_names(op_node) + for arg_name in arg_names: + in_node = graph._find_node_by_name( + op_node.inputs, arg_name) + if in_node.persistable(): + continue + if arg_name in dequantized_vars_map: + dequant_var_node = dequantized_vars_map[arg_name] + else: + insert_quant_pass = InsertQuantizeLinear( + self._place, + self._scope, + quant_bits=self._quant_bits, + quant_axis=-1, + channel_wise=False, + is_test=self._is_test) + quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op( + graph, 
in_node) + dequant_var_node = insert_quant_pass.insert_dequant_op( + graph, quant_var_node, scale_var_node) + dequantized_vars_map[arg_name] = dequant_var_node + graph.update_input_link(in_node, dequant_var_node, + op_node) + t.update() # Backward stage, update input link for op_node in all_op_nodes: diff --git a/python/paddle/fluid/contrib/slim/quantization/utils.py b/python/paddle/fluid/contrib/slim/quantization/utils.py index 608844dd55da7..b9c304df5bafe 100644 --- a/python/paddle/fluid/contrib/slim/quantization/utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/utils.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import sys import numpy as np from ....framework import IrNode from ....framework import Operator @@ -52,7 +53,6 @@ "leaky_relu", "tanh", "swish", - "scale", "transpose", "transpose2", "sigmoid", @@ -162,7 +162,6 @@ "sigmoid": [["X"], ["Out"]], "elementwise_mul": [["X", "Y"], ["Out"]], "elementwise_pow": [["X", "Y"], ["Out"]], - "scale": [["X"], ["Out"]], "hard_swish": [["X"], ["Out"]], "hard_sigmoid": [["X"], ["Out"]], "gru": [["Input", "Weight"], ["Hidden"]], @@ -414,3 +413,27 @@ def calculate_quant_cos_error(orig_tensor, qdq_tensor): cos_sim = np.inner(orig_tensor.flatten(), qdq_tensor.flatten()) \ / (np.linalg.norm(orig_tensor.flatten()) * np.linalg.norm(qdq_tensor.flatten())) return cos_sim + + +class tqdm(object): + + def __init__(self, total, bar_format='Loading|{bar}', ncols=80): + self.total = total + self.bar_format = bar_format + self.ncols = ncols + self.n = 0 + + def update(self, n=1): + self.n += n + a = "=" * round((self.n / self.total) * self.ncols) + b = " " * (self.ncols - len(a)) + prefix = self.bar_format.split('|')[0] + sys.stderr.write("\r{}|{}=>{}| {}/{}".format(prefix, a, b, self.n, + self.total)) + sys.stderr.flush() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + sys.stderr.write('\n') diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 88dc33f581ad2..4a90ab2753142 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -102,7 +102,8 @@ function(inference_quant_int8_image_classification_test target quant_model_dir 0.1) endfunction() -# set batch_size 10 for UT only (avoid OOM). For whole dataset, use batch_size 25 +# set batch_size 10 for UT only (avoid OOM). +# For whole dataset, use batch_size 25 function(inference_quant2_int8_image_classification_test target quant_model_dir fp32_model_dir dataset_path) py_test( @@ -127,7 +128,8 @@ function(inference_quant2_int8_image_classification_test target quant_model_dir 0.1) endfunction() -# set batch_size 10 for UT only (avoid OOM). For whole dataset, use batch_size 20 +# set batch_size 10 for UT only (avoid OOM). 
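For context on the guarded import at the top of quantization_pass.py: the passes try the real tqdm first and fall back to the minimal stand-in added to slim/quantization/utils.py, so the progress-bar code inside the pass bodies is identical either way. A self-contained sketch of that pattern follows; the op list and the loop body are placeholders, not the real graph API.

import sys

try:
    from tqdm import tqdm  # preferred: the real progress bar, if installed
except ImportError:
    # Stand-in mirroring the fallback class in utils.py: same constructor
    # signature and the same update()/context-manager surface.
    class tqdm(object):

        def __init__(self, total, bar_format='Loading|{bar}', ncols=80):
            self.total = total
            self.bar_format = bar_format
            self.ncols = ncols
            self.n = 0

        def update(self, n=1):
            self.n += n
            done = "=" * round((self.n / self.total) * self.ncols)
            todo = " " * (self.ncols - len(done))
            prefix = self.bar_format.split('|')[0]
            sys.stderr.write("\r{}|{}=>{}| {}/{}".format(
                prefix, done, todo, self.n, self.total))
            sys.stderr.flush()

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            sys.stderr.write('\n')

# Usage as in the passes: wrap the op loop and call update() once per op.
ops = range(100)  # placeholder for graph.all_op_nodes()
with tqdm(total=len(ops),
          bar_format='Adding quant op with weight:|{bar}| {n_fmt}/{total_fmt}',
          ncols=80) as t:
    for op in ops:
        pass  # _transform_forward(graph, op) in the real pass
        t.update()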
+# For whole dataset, use batch_size 20 function( inference_quant2_int8_nlp_test target @@ -284,7 +286,10 @@ if(LINUX AND WITH_MKLDNN) download_quant_model( ${QUANT_RESNET101_MODEL_DIR} ${QUANT_RESNET101_MODEL_ARCHIVE} 95c6d01e3aeba31c13efb2ba8057d558) - # inference_quant_int8_image_classification_test(test_quant_int8_resnet101_mkldnn ${QUANT_RESNET101_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) + # inference_quant_int8_image_classification_test( \ + # test_quant_int8_resnet101_mkldnn \ + # ${QUANT_RESNET101_MODEL_DIR}/model \ + # ${IMAGENET_DATA_PATH}) # Quant GoogleNet set(QUANT_GOOGLENET_MODEL_DIR "${QUANT_INSTALL_DIR}/GoogleNet_quant") @@ -321,18 +326,24 @@ if(LINUX AND WITH_MKLDNN) set(QUANT_VGG16_MODEL_ARCHIVE "VGG16_qat_model.tar.gz") download_quant_model(${QUANT_VGG16_MODEL_DIR} ${QUANT_VGG16_MODEL_ARCHIVE} c37e63ca82a102f47be266f8068b0b55) - # inference_quant_int8_image_classification_test(test_quant_int8_vgg16_mkldnn ${QUANT_VGG16_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) + # inference_quant_int8_image_classification_test( \ + # test_quant_int8_vgg16_mkldnn \ + # ${QUANT_VGG16_MODEL_DIR}/model \ + # ${IMAGENET_DATA_PATH}) # Quant VGG19 set(QUANT_VGG19_MODEL_DIR "${QUANT_INSTALL_DIR}/VGG19_quant") set(QUANT_VGG19_MODEL_ARCHIVE "VGG19_qat_model.tar.gz") download_quant_model(${QUANT_VGG19_MODEL_DIR} ${QUANT_VGG19_MODEL_ARCHIVE} 62bcd4b6c3ca2af67e8251d1c96ea18f) - # inference_quant_int8_image_classification_test(test_quant_int8_vgg19_mkldnn ${QUANT_VGG19_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) + # inference_quant_int8_image_classification_test( \ + # test_quant_int8_vgg19_mkldnn ${QUANT_VGG19_MODEL_DIR}/model \ + # ${IMAGENET_DATA_PATH}) ### Quant2 for image classification - # Quant2 ResNet50 with input/output scales in `fake_quantize_moving_average_abs_max` operators, + # Quant2 ResNet50 with input/output scales in + # `fake_quantize_moving_average_abs_max` operators, # with weight scales in `fake_dequantize_max_abs` operators set(QUANT2_RESNET50_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2") set(QUANT2_RESNET50_MODEL_ARCHIVE "ResNet50_qat_perf.tar.gz") @@ -345,7 +356,8 @@ if(LINUX AND WITH_MKLDNN) ${QUANT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) - # Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes, + # Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` + # operators and the `out_threshold` attributes, # with weight scales in `fake_dequantize_max_abs` operators set(QUANT2_RESNET50_RANGE_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2_range") @@ -358,7 +370,8 @@ if(LINUX AND WITH_MKLDNN) ${QUANT2_RESNET50_RANGE_MODEL_DIR}/ResNet50_qat_range ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) - # Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes, + # Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` + # operators and the `out_threshold` attributes, # with weight scales in `fake_channel_wise_dequantize_max_abs` operators set(QUANT2_RESNET50_CHANNELWISE_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2_channelwise") diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 30cfb9f4b8591..876d4772462f5 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -49,7 +49,7 @@ def convert_dtype(dtype): return _PADDLE_DTYPE_2_NUMPY_DTYPE[dtype] elif isinstance(dtype, type): if dtype in 
[ - np.bool, np.float16, np.uint16, np.float32, np.float64, np.int8, + bool, np.float16, np.uint16, np.float32, np.float64, np.int8, np.int16, np.int32, np.int64, np.uint8, np.complex64, np.complex128 ]: diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index af60776a3f1c5..92fe3fb91549b 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -374,6 +374,8 @@ def __init__(self, loader): # see _try_put_indices self._thread_lock = threading.Lock() + self._base_seed = np.random.randint(low=0, high=sys.maxsize) + # init workers and indices queues and put 2 indices in each indices queue self._init_workers() for _ in range(self._outstanding_capacity): @@ -406,7 +408,8 @@ def _init_workers(self): self._data_queue, self._workers_done_event, self._auto_collate_batch, self._collate_fn, self._drop_last, self._worker_init_fn, i, - self._num_workers, self._use_shared_memory)) + self._num_workers, self._use_shared_memory, + self._base_seed)) worker.daemon = True worker.start() self._workers.append(worker) diff --git a/python/paddle/fluid/dataloader/worker.py b/python/paddle/fluid/dataloader/worker.py index 0c3ec898aadfd..06ea7ef9d72a3 100644 --- a/python/paddle/fluid/dataloader/worker.py +++ b/python/paddle/fluid/dataloader/worker.py @@ -257,7 +257,7 @@ def mix(x, y): def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, auto_collate_batch, collate_fn, drop_last, init_fn, worker_id, - num_workers, use_shared_memory): + num_workers, use_shared_memory, base_seed): try: # NOTE: [ mmap files clear ] When the child process exits unexpectedly, # some shared memory objects may have been applied for but have not yet @@ -272,15 +272,20 @@ def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, try: import numpy as np import time + import random except ImportError: pass else: - np.random.seed(_generate_states(int(time.time()), worker_id)) + seed = base_seed + worker_id + random.seed(seed) + paddle.seed(seed) + np.random.seed(_generate_states(base_seed, worker_id)) global _worker_info _worker_info = WorkerInfo(id=worker_id, num_workers=num_workers, - dataset=dataset) + dataset=dataset, + seed=base_seed) init_exception = None try: diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index 9da69b1e45e0b..e1ae4ad9bc5ed 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -129,11 +129,11 @@ def __init__(self, self._decr_count = 0 self._use_dynamic_loss_scaling = use_dynamic_loss_scaling - self._found_inf = to_variable(np.array([0]).astype(np.bool)) + self._found_inf = to_variable(np.array([0]).astype(np.bool_)) self._temp_found_inf_fp16 = to_variable( - np.array([0]).astype(np.bool)) + np.array([0]).astype(np.bool_)) self._temp_found_inf_fp32 = to_variable( - np.array([0]).astype(np.bool)) + np.array([0]).astype(np.bool_)) self._scale = to_variable( np.array([self._init_loss_scaling]).astype(np.float32)) self._cache_founf_inf = None diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py index de53a56468485..aa01945ac849e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py @@ -20,6 +20,7 @@ # See details in 
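The dataloader change above replaces the old time-based worker seeding: the loader now draws one _base_seed per iterator and every worker derives its own seed from it, so shuffling and NumPy-based augmentation become reproducible across runs while still differing between workers. A simplified, self-contained sketch of the scheme, assuming a hypothetical seed_worker helper; the real _worker_loop derives the NumPy state via _generate_states and also calls paddle.seed.

import random
import sys
import numpy as np

def seed_worker(base_seed, worker_id):
    # Each worker gets a deterministic seed derived from the shared base seed.
    seed = base_seed + worker_id
    random.seed(seed)
    # np.random.seed only accepts 32-bit values; the real code uses the
    # _generate_states() hash instead of this simple modulo.
    np.random.seed(seed % (2**32))
    # paddle.seed(seed) is called at this point in the real worker loop.
    return seed

base_seed = np.random.randint(low=0, high=sys.maxsize)  # drawn once in the parent
print([seed_worker(base_seed, wid) for wid in range(4)])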
https://github.com/serge-sans-paille/gast/ import os from paddle.utils import gast +from paddle.fluid.dygraph.dygraph_to_static.early_return_transformer import EarlyReturnTransformer from paddle.fluid.dygraph.dygraph_to_static.assert_transformer import AssertTransformer from paddle.fluid.dygraph.dygraph_to_static.basic_api_transformer import BasicApiTransformer from paddle.fluid.dygraph.dygraph_to_static.break_continue_transformer import BreakContinueTransformer @@ -87,6 +88,7 @@ def transfer_from_node_type(self, node_wrapper): self.visit(node_wrapper.node) transformers = [ + EarlyReturnTransformer, BasicApiTransformer, # Basic Api TensorShapeTransformer, # Tensor.shape -> layers.shape(Tensor) ListTransformer, # List used in control flow diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/early_return_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/early_return_transformer.py new file mode 100644 index 0000000000000..bef1efb0427cf --- /dev/null +++ b/python/paddle/fluid/dygraph/dygraph_to_static/early_return_transformer.py @@ -0,0 +1,88 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from paddle.utils import gast +from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper + + +class EarlyReturnTransformer(gast.NodeTransformer): + """ + Transform if/else return statement of Dygraph into Static Graph. + """ + + def __init__(self, wrapper_root): + assert isinstance( + wrapper_root, AstNodeWrapper + ), "Type of input node should be AstNodeWrapper, but received %s ." % type( + wrapper_root) + self.root = wrapper_root.node + + def transform(self): + """ + Main function to transform AST. + """ + self.visit(self.root) + + def is_define_return_in_if(self, node): + assert isinstance( + node, gast.If + ), "Type of input node should be gast.If, but received %s ." 
% type( + node) + for child in node.body: + if isinstance(child, gast.Return): + return True + return False + + def visit_block_nodes(self, nodes): + result_nodes = [] + destination_nodes = result_nodes + for node in nodes: + rewritten_node = self.visit(node) + + if isinstance(rewritten_node, (list, tuple)): + destination_nodes.extend(rewritten_node) + else: + destination_nodes.append(rewritten_node) + + # append other nodes to if.orelse even though if.orelse is not empty + if isinstance(node, gast.If) and self.is_define_return_in_if(node): + destination_nodes = node.orelse + # handle stmt like `if/elif/elif` + while len(destination_nodes) > 0 and \ + isinstance(destination_nodes[0], gast.If) and \ + self.is_define_return_in_if(destination_nodes[0]): + destination_nodes = destination_nodes[0].orelse + + return result_nodes + + def visit_If(self, node): + node.body = self.visit_block_nodes(node.body) + node.orelse = self.visit_block_nodes(node.orelse) + return node + + def visit_While(self, node): + node.body = self.visit_block_nodes(node.body) + node.orelse = self.visit_block_nodes(node.orelse) + return node + + def visit_For(self, node): + node.body = self.visit_block_nodes(node.body) + node.orelse = self.visit_block_nodes(node.orelse) + return node + + def visit_FunctionDef(self, node): + node.body = self.visit_block_nodes(node.body) + return node diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 0d4acf5fe6d86..860b4e3f558ff 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -483,7 +483,7 @@ def _as_lodtensor(data, place, dtype=None): data = np.array([data]).astype(dtype) elif isinstance(data, (list, tuple)): data = np.array(data) - if data.dtype == np.object: + if data.dtype == np.object_: raise TypeError( "\n\tFaild to convert input data to a regular ndarray :\n\t* Usually " "this means the input data contains nested lists with different lengths. " diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 44ef1ff5ae6db..2412e300a779f 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1109,7 +1109,7 @@ def convert_np_dtype_to_dtype_(np_dtype): return core.VarDesc.VarType.INT16 elif dtype == np.int64: return core.VarDesc.VarType.INT64 - elif dtype == np.bool: + elif dtype == np.bool_: return core.VarDesc.VarType.BOOL elif dtype == np.uint16: # since there is still no support for bfloat16 in NumPy, diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 47199fcd1adbe..f09097b57bd71 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -1164,10 +1164,11 @@ def calculate_gain(nonlinearity, param=None): Examples: .. 
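To make the intent of the new EarlyReturnTransformer concrete: it runs before the other dygraph-to-static transformers and moves the statements that follow a returning `if` into that if's else-branch, so the later passes never see an early return. An illustrative before/after pair, written as a conceptual sketch rather than the transformer's literal output:

def before(x):
    if x == 0:
        return x - 1
    y = x + 1          # statements after the returning `if` ...
    return y

def after(x):
    if x == 0:
        return x - 1
    else:              # ... end up in the else-branch after the rewrite
        y = x + 1
        return y

assert before(0) == after(0) and before(3) == after(3)

The while-loop over destination_nodes[0].orelse in visit_block_nodes applies the same move repeatedly, which is how if/elif/elif chains that return in each branch end up with the trailing statements in the innermost else.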
code-block:: python - + :name: code-example1 import paddle gain = paddle.nn.initializer.calculate_gain('tanh') # 5.0 / 3 gain = paddle.nn.initializer.calculate_gain('leaky_relu', param=1.0) # 1.0 = math.sqrt(2.0 / (1+param^2)) + initializer = paddle.nn.initializer.Orthogonal(gain) """ if param is None: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 2c3cb903d83ca..d7f0feb103c5f 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -12860,8 +12860,8 @@ def logical_or(x, y, out=None, name=None): import paddle import numpy as np - x_data = np.array([True, False], dtype=np.bool).reshape(2, 1) - y_data = np.array([True, False, True, False], dtype=np.bool).reshape(2, 2) + x_data = np.array([True, False], dtype=np.bool_).reshape(2, 1) + y_data = np.array([True, False, True, False], dtype=np.bool_).reshape(2, 2) x = paddle.to_tensor(x_data) y = paddle.to_tensor(y_data) res = paddle.logical_or(x, y) @@ -12905,8 +12905,8 @@ def logical_xor(x, y, out=None, name=None): import paddle import numpy as np - x_data = np.array([True, False], dtype=np.bool).reshape([2, 1]) - y_data = np.array([True, False, True, False], dtype=np.bool).reshape([2, 2]) + x_data = np.array([True, False], dtype=np.bool_).reshape([2, 1]) + y_data = np.array([True, False, True, False], dtype=np.bool_).reshape([2, 2]) x = paddle.to_tensor(x_data) y = paddle.to_tensor(y_data) res = paddle.logical_xor(x, y) diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index ff299bcca9ba3..c590d69a621de 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -145,7 +145,7 @@ def __next__(self): @classmethod def _check_input_array(cls, item): arr = np.asarray(item) - if arr.dtype == np.object: + if arr.dtype == np.object_: raise TypeError( "\n\tFaild to convert input data to a regular ndarray :\n\t* Usually " "this means the input data contains nested lists with different lengths. " diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 34971cf11941f..0bbb34434e843 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -367,7 +367,7 @@ if(APPLE) if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_desc_clone) list(REMOVE_ITEM TEST_OPS test_program_code) - endif(NOT WITH_DISTRIBUTE) + endif() message( WARNING "These tests has been disabled in OSX before being fixed:\n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext_*" @@ -683,7 +683,7 @@ endif() foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach(TEST_OP) +endforeach() set_tests_properties(test_logcumsumexp_op PROPERTIES TIMEOUT 30) py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4) @@ -873,8 +873,8 @@ if(WITH_DISTRIBUTE) test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS}) endif() - endif(NOT WIN32) - endif(NOT APPLE) + endif() + endif() if(WITH_DGC) # if with dgc, test all dgc tests. # NOTE. dist dgc tests is already in DIST_TEST_OPS @@ -938,7 +938,7 @@ if(WITH_DISTRIBUTE) message( FATAL_ERROR "available ports have been exhausted:${dist_ut_port}") endif() - endforeach(TEST_OP) + endforeach() # solve it later. 
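A note on the recurring dtype renames in this diff (np.bool to np.bool_, np.object to np.object_, and np.bool8 to np.bool_ in the FFT tests further down): newer NumPy releases deprecate and eventually remove the aliases that shadow Python builtins, so only the canonical names keep working. The swap is behavior-preserving for dtype checks, as this small self-contained check illustrates:

import numpy as np

# bool, np.bool_ (and the removed np.bool alias) all denote the same dtype.
a = np.array([True, False], dtype=np.bool_)
b = np.array([True, False], dtype=bool)
assert a.dtype == b.dtype == np.bool_

# Ragged inputs still come through as object arrays, so the checks in
# executor.py and reader.py only needed the canonical np.object_ name.
ragged = np.array([[1, 2], [3]], dtype=object)
assert ragged.dtype == np.object_
print(a.dtype, ragged.dtype)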
bash_test_modules( test_fleet_launch_ps @@ -974,7 +974,7 @@ if(WITH_DISTRIBUTE) "PADDLE_DIST_UT_PORT=${dist_ut_port}+20" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) endif() - endif(NOT APPLE) + endif() endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf) diff --git a/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt b/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt index 4fd16354e6c1a..b48b833b94602 100644 --- a/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt @@ -10,7 +10,7 @@ list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp_sharding") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach(TEST_OP) +endforeach() if(WITH_DISTRIBUTE) if(WITH_GPU diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py index f7a1a28aa91ca..ae69ee087686a 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py @@ -96,10 +96,11 @@ def forward(self, input): PP_MESH_1})(out)[0] out = self.dropout(out) out = self.linear2(out) + self.out = out return out -def train(): +def train(fetch): mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, dropout_ratio=0.1, @@ -118,7 +119,6 @@ def train(): dist_strategy.amp = False dist_strategy.pipeline = False dist_strategy.recompute = False - # init parallel optimizer dist_strategy.semi_auto = True fleet.init(is_collective=True, strategy=dist_strategy) @@ -129,20 +129,26 @@ def train(): strategy=dist_strategy) engine.prepare(optimizer, loss, metrics=paddle.metric.Accuracy()) + # fetch + if fetch: + fetches = {'out': mlp.out} + else: + fetches = None + # train train_dataset = MyDataset(batch_num * batch_size) engine.fit(train_dataset, batch_size=batch_size, steps_per_epoch=batch_num * batch_size, - fetch_list=['label']) + fetches=fetches) # eval eval_dataset = MyDataset(batch_size) - engine.evaluate(eval_dataset, batch_size, fetch_list=['label']) + engine.evaluate(eval_dataset, batch_size, fetches=fetches) # predict test_dataset = MyDataset(batch_size) - engine.predict(test_dataset, batch_size, fetch_list=['label']) + engine.predict(test_dataset, batch_size, fetches=fetches) # save temp_dir = tempfile.TemporaryDirectory() @@ -152,4 +158,5 @@ def train(): if __name__ == "__main__": - train() + train(fetch=True) + train(fetch=False) diff --git a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt index b5ebeb659a649..c2ccad7dd24f0 100644 --- a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt @@ -7,7 +7,7 @@ set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) -endforeach(TEST_OP) +endforeach() set_tests_properties(test_autograd_functional_dynamic PROPERTIES TIMEOUT 160) set_tests_properties(test_autograd_functional_static PROPERTIES TIMEOUT 160) diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt index 29e528edce914..51f298eccdbe2 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt @@ -27,4 +27,4 @@ foreach(TEST_OP 
${TEST_OPS}) list(APPEND DIST_TEST_OPS ${TEST_OP}) set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 120) set_tests_properties(${TEST_OP} PROPERTIES LABELS "RUN_TYPE=DIST") -endforeach(TEST_OP) +endforeach() diff --git a/python/paddle/fluid/tests/unittests/distribution/CMakeLists.txt b/python/paddle/fluid/tests/unittests/distribution/CMakeLists.txt index e3bf89c48821a..95739040ef4af 100644 --- a/python/paddle/fluid/tests/unittests/distribution/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/distribution/CMakeLists.txt @@ -6,4 +6,4 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach(TEST_OP) +endforeach() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index f9a1e83d381fd..1687b277ab5b5 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -32,8 +32,8 @@ set(TEST_EAGER_OPS test_simnet test_transformer) list(REMOVE_ITEM TEST_OPS test_lac) -# NOTE(Aurelius84): In case of Windows CI, if open ON_INFER, RWLOCK of Scope will -# be removed and will cause some random failed in multi-thread. +# NOTE(Aurelius84): In case of Windows CI, if open ON_INFER, RWLOCK of Scope +# will be removed and will cause some random failed in multi-thread. if(NOT ON_INFER) py_test_modules(test_lac MODULES test_lac ENVS FLAGS_enable_eager_mode=1) set_tests_properties(test_lac PROPERTIES TIMEOUT 120) @@ -51,7 +51,7 @@ foreach(TEST_OP ${TEST_OPS}) else() py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) endif() -endforeach(TEST_OP) +endforeach() set_tests_properties(test_se_resnet PROPERTIES TIMEOUT 900) set_tests_properties(test_yolov3 PROPERTIES TIMEOUT 900 LABELS diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py index 0c7d2903c3625..39565044e7fd1 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py @@ -100,6 +100,30 @@ def false_fn_0(q, x, y): return x +def dyfunc_with_if_else_early_return1(): + x = paddle.to_tensor([10]) + if x == 0: + a = paddle.zeros([2, 2]) + b = paddle.zeros([3, 3]) + return a, b + a = paddle.zeros([2, 2]) + 1 + return a + + +def dyfunc_with_if_else_early_return2(): + x = paddle.to_tensor([10]) + if x == 0: + a = paddle.zeros([2, 2]) + b = paddle.zeros([3, 3]) + return a, b + elif x == 1: + c = paddle.zeros([2, 2]) + 1 + d = paddle.zeros([3, 3]) + 1 + return c, d + e = paddle.zeros([2, 2]) + 3 + return e + + def dyfunc_with_if_else_with_list_geneator(x): if 10 > 5: y = paddle.add_n( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py index cbc6e3c540f9f..cf8be6640300e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py @@ -29,7 +29,7 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code import paddle.jit.dy2static as _jst -from ifelse_simple_func import dyfunc_with_if_else +from ifelse_simple_func import dyfunc_with_if_else, dyfunc_with_if_else_early_return1, 
dyfunc_with_if_else_early_return2 np.random.seed(0) @@ -83,34 +83,22 @@ def false_fn_0(x_v): x_v = _jst.convert_ifelse( fluid.layers.mean(x_v)[0] > 5, true_fn_0, false_fn_0, (x_v, ), (x_v, )) - __return_0 = _jst.create_bool_as_type(label is not None, False) - def true_fn_1(__return_0, __return_value_0, label, x_v): + def true_fn_1(__return_value_0, label, x_v): loss = fluid.layers.cross_entropy(x_v, label) __return_0 = _jst.create_bool_as_type(label is not None, True) __return_value_0 = loss - return __return_0, __return_value_0 - - def false_fn_1(__return_0, __return_value_0): - return __return_0, __return_value_0 - - __return_0, __return_value_0 = _jst.convert_ifelse( - label is not None, true_fn_1, false_fn_1, - (__return_0, __return_value_0, label, x_v), - (__return_0, __return_value_0)) - - def true_fn_2(__return_0, __return_value_0, x_v): - __return_1 = _jst.create_bool_as_type( - _jst.convert_logical_not(__return_0), True) - __return_value_0 = x_v return __return_value_0 - def false_fn_2(__return_value_0): + def false_fn_1(__return_value_0, label, x_v): + __return_1 = _jst.create_bool_as_type(label is not None, True) + __return_value_0 = x_v return __return_value_0 - __return_value_0 = _jst.convert_ifelse( - _jst.convert_logical_not(__return_0), true_fn_2, false_fn_2, - (__return_0, __return_value_0, x_v), (__return_value_0, )) + __return_value_0 = _jst.convert_ifelse(label is not None, true_fn_1, + false_fn_1, + (__return_value_0, label, x_v), + (__return_value_0, label, x_v)) return __return_value_0 @@ -123,45 +111,33 @@ def dyfunc_with_if_else(x_v, label=None): name='__return_value_init_1') __return_value_1 = __return_value_init_1 - def true_fn_3(x_v): + def true_fn_2(x_v): x_v = x_v - 1 return x_v - def false_fn_3(x_v): + def false_fn_2(x_v): x_v = x_v + 1 return x_v x_v = _jst.convert_ifelse( - fluid.layers.mean(x_v)[0] > 5, true_fn_3, false_fn_3, (x_v, ), + fluid.layers.mean(x_v)[0] > 5, true_fn_2, false_fn_2, (x_v, ), (x_v, )) - __return_2 = _jst.create_bool_as_type(label is not None, False) - def true_fn_4(__return_2, __return_value_1, label, x_v): + def true_fn_3(__return_value_1, label, x_v): loss = fluid.layers.cross_entropy(x_v, label) __return_2 = _jst.create_bool_as_type(label is not None, True) __return_value_1 = loss - return __return_2, __return_value_1 - - def false_fn_4(__return_2, __return_value_1): - return __return_2, __return_value_1 - - __return_2, __return_value_1 = _jst.convert_ifelse( - label is not None, true_fn_4, false_fn_4, - (__return_2, __return_value_1, label, x_v), - (__return_2, __return_value_1)) - - def true_fn_5(__return_2, __return_value_1, x_v): - __return_3 = _jst.create_bool_as_type( - _jst.convert_logical_not(__return_2), True) - __return_value_1 = x_v return __return_value_1 - def false_fn_5(__return_value_1): + def false_fn_3(__return_value_1, label, x_v): + __return_3 = _jst.create_bool_as_type(label is not None, True) + __return_value_1 = x_v return __return_value_1 - __return_value_1 = _jst.convert_ifelse( - _jst.convert_logical_not(__return_2), true_fn_5, false_fn_5, - (__return_2, __return_value_1, x_v), (__return_value_1, )) + __return_value_1 = _jst.convert_ifelse(label is not None, true_fn_3, + false_fn_3, + (__return_value_1, label, x_v), + (__return_value_1, label, x_v)) return __return_value_1 @@ -358,6 +334,21 @@ def test_raise_error(self): net.foo.train() +class TestIfElseEarlyReturn(unittest.TestCase): + + def test_ifelse_early_return1(self): + answer = np.zeros([2, 2]) + 1 + static_func = 
paddle.jit.to_static(dyfunc_with_if_else_early_return1) + out = static_func() + self.assertTrue(np.allclose(answer, out.numpy())) + + def test_ifelse_early_return2(self): + answer = np.zeros([2, 2]) + 3 + static_func = paddle.jit.to_static(dyfunc_with_if_else_early_return2) + out = static_func() + self.assertTrue(np.allclose(answer, out.numpy())) + + class TestRemoveCommentInDy2St(unittest.TestCase): def func_with_comment(self): diff --git a/python/paddle/fluid/tests/unittests/fft/CMakeLists.txt b/python/paddle/fluid/tests/unittests/fft/CMakeLists.txt index e3bf89c48821a..95739040ef4af 100644 --- a/python/paddle/fluid/tests/unittests/fft/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/fft/CMakeLists.txt @@ -6,4 +6,4 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach(TEST_OP) +endforeach() diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft.py b/python/paddle/fluid/tests/unittests/fft/test_fft.py index a3c62323c2c20..f386fdc9c3460 100644 --- a/python/paddle/fluid/tests/unittests/fft/test_fft.py +++ b/python/paddle/fluid/tests/unittests/fft/test_fft.py @@ -44,8 +44,8 @@ def rand_x(dims=1, complex=False): shape = [np.random.randint(min_dim_len, max_dim_len) for i in range(dims)] if complex: - return np.random.randn(*shape).astype( - dtype) + 1.j * np.random.randn(*shape).astype(dtype) + return np.random.randn( + *shape).astype(dtype) + 1.j * np.random.randn(*shape).astype(dtype) else: return np.random.randn(*shape).astype(dtype) @@ -473,7 +473,7 @@ def test_irfft2(self): @parameterize((TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [ ('test_bool_input', (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype( - np.bool8), None, -1, 'backward', NotImplementedError), + np.bool_), None, -1, 'backward', NotImplementedError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), -1, -1, 'backward', ValueError), ('test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4), 0, -1, @@ -543,7 +543,7 @@ def test_irfft(self): (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [('test_bool_input', (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype( - np.bool8), None, (-2, -1), 'backward', NotImplementedError), + np.bool_), None, (-2, -1), 'backward', NotImplementedError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), (-2, -1), 'backward', ValueError), @@ -625,7 +625,7 @@ def test_irfft2(self): (TEST_CASE_NAME, 'x', 'n', 'axis', 'norm', 'expect_exception'), [('test_bool_input', (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype( - np.bool8), None, (-2, -1), 'backward', NotImplementedError), + np.bool_), None, (-2, -1), 'backward', NotImplementedError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), (-2, -1), 'backward', ValueError), diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py b/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py index ce0a623aea076..ddf47065bb01d 100644 --- a/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py +++ b/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py @@ -370,7 +370,7 @@ def test_static_irfft2(self): 4), None, -1, 'backward', TypeError), ('test_bool_input', (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype( - np.bool8), None, -1, 'backward', TypeError), + np.bool_), None, -1, 
'backward', TypeError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), -1, -1, 'backward', ValueError), ('test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4), 0, -1, @@ -406,7 +406,7 @@ def test_static_hfft(self): 4), None, -1, 'backward', TypeError), ('test_bool_input', (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype( - np.bool8), None, -1, 'backward', TypeError), + np.bool_), None, -1, 'backward', TypeError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), -1, -1, 'backward', ValueError), ('test_n_zero', np.random.randn(4, 4) + 1j * np.random.randn(4, 4), 0, -1, @@ -444,7 +444,7 @@ def test_static_irfft(self): 4, 4, 4), None, None, 'backward', TypeError), ('test_bool_input', (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype( - np.bool8), None, (-2, -1), 'backward', TypeError), + np.bool_), None, (-2, -1), 'backward', TypeError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), (-2, -1), 'backward', ValueError), @@ -485,7 +485,7 @@ def test_static_hfft2(self): 4, 4, 4), None, None, 'backward', TypeError), ('test_bool_input', (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype( - np.bool8), None, (-2, -1), 'backward', TypeError), + np.bool_), None, (-2, -1), 'backward', TypeError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), (-2, -1), 'backward', ValueError), @@ -526,7 +526,7 @@ def test_static_irfft2(self): 4, 4, 4), None, None, 'backward', TypeError), ('test_bool_input', (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4)).astype( - np.bool8), None, (-2, -1), 'backward', TypeError), + np.bool_), None, (-2, -1), 'backward', TypeError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), (-2, -1), 'backward', ValueError), @@ -568,7 +568,7 @@ def test_static_hfftn(self): 4, 4, 4), None, None, 'backward', TypeError), # ('test_bool_input', # (np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4) - # ).astype(np.bool8), None, (-2, -1), 'backward', ValueError), + # ).astype(np.bool_), None, (-2, -1), 'backward', ValueError), ('test_n_nagative', np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), (-1, -2), (-2, -1), 'backward', ValueError), diff --git a/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt b/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt index 976a36b761568..c60a7511022b4 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt @@ -16,6 +16,7 @@ foreach(target ${TEST_INTERP_CASES}) FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=false FLAGS_eager_delete_tensor_gb=0) + py_test_modules( ${target}_non_eager_deletion MODULES @@ -25,6 +26,7 @@ foreach(target ${TEST_INTERP_CASES}) FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=false FLAGS_eager_delete_tensor_gb=0.000001) + py_test_modules( ${target}_fast_gc MODULES @@ -34,6 +36,7 @@ foreach(target ${TEST_INTERP_CASES}) FLAGS_use_stream_safe_cuda_allocator=true FLAGS_fast_eager_deletion_mode=true FLAGS_eager_delete_tensor_gb=0) + py_test_modules( ${target}_fast_gc_non_eager_deletion MODULES @@ -44,3 +47,11 @@ foreach(target ${TEST_INTERP_CASES}) FLAGS_fast_eager_deletion_mode=true FLAGS_eager_delete_tensor_gb=0.000001) endforeach() + +py_test_modules( + test_standalone_executor_sequential_run MODULES 
test_standalone_executor ENVS + FLAGS_new_executor_sequential_run=true) + +py_test_modules( + test_standalone_executor_serial_run MODULES test_standalone_executor ENVS + FLAGS_new_executor_serial_run=true) diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py index eeddcaa5bb534..5ce035097d01a 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py @@ -81,7 +81,9 @@ def _run(self, feed): return ret def run_raw_executor(self, feed): + os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '0' out = self._run(feed) + del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] print("GT:", out) return out diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py index 7faff7ec18193..9e375126550cc 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py @@ -231,10 +231,6 @@ def test_result(self): for gt, out in zip(ground_truths, res): self.assertEqual(gt[0], out[0]) - res_sequential = self.run_new_executor_sequential() - for gt, out in zip(ground_truths, res_sequential): - self.assertEqual(gt[0], out[0]) - def run_raw_executor(self): paddle.seed(2020) main_program, startup_program, fetch_list = build_program() @@ -264,12 +260,6 @@ def run_new_executor(self): np.array(inter_core.run({}, fetch_list)._move_to_list()[0])) return outs - def run_new_executor_sequential(self): - os.environ['FLAGS_new_executor_sequential_run'] = '1' - res = self.run_new_executor() - del os.environ['FLAGS_new_executor_sequential_run'] - return res - class SwitchExecutorInterfaceTestCase(MultiStreamModelTestCase): diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_multiply_write.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_multiply_write.py index 8006c59d2ba12..a4d18d29be44c 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_multiply_write.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_multiply_write.py @@ -36,8 +36,8 @@ def _get_feed(self): return None def build_program(self): - main_program = paddle.static.default_main_program() - startup_program = paddle.static.default_startup_program() + main_program = Program() + startup_program = Program() with paddle.static.program_guard(main_program, startup_program): out = paddle.full((1, ), 1) inp1 = paddle.full((1, ), 2) diff --git a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt index 6b709d85d75c3..0174274827358 100644 --- a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt @@ -9,7 +9,7 @@ if(WITH_IPU) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) # set all UTs timeout to 200s set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 200) - endforeach(TEST_OP) + endforeach() set_tests_properties(test_conv_op_ipu PROPERTIES TIMEOUT 300) set_tests_properties(test_elemetwise_x_op_ipu PROPERTIES TIMEOUT 300) diff --git a/python/paddle/fluid/tests/unittests/ipu/distributed/run_dist_ipu.sh b/python/paddle/fluid/tests/unittests/ipu/distributed/run_dist_ipu.sh new file mode 100644 index 0000000000000..a4221b37eb14f --- 
/dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/distributed/run_dist_ipu.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +partition_name=pod64 +vipu_server=10.137.96.62 +allclose_script=" +import sys +import numpy as np +data1 = np.loadtxt(\"ipu_res.txt\") +data2 = np.loadtxt(\"cpu_res.txt\") +if np.allclose(data1[::16], data2, atol=1e-6): + sys.exit(0) +else: + sys.exit(1) +" + +for opt in lamb sgd adam ; +do + for onchip in False True ; + do + for rts in False True ; + do + echo "Testcase: opt: ${opt}, onchip: ${onchip}, rts: ${rts}" + echo "paddle.distributed.fleet.launch test with IPUs..." + python3.7 -m paddle.distributed.launch \ + --devices=8 \ + ipu \ + --hosts=localhost \ + --nproc_per_host=2 \ + --ipus_per_replica=2 \ + --ipu_partition=${partition_name} \ + --vipu_server=${vipu_server} \ + test_dist_data_parallel_ipu.py ${opt} ipu_res.txt ${onchip} ${rts} > ipu.log + echo "paddle.distributed.fleet.launch test with IPUs...Done" + + echo "paddle normal test with CPU..." + export POPLAR_IPUMODEL=1 + python3.7 test_dist_data_parallel_ipu.py ${opt} cpu_res.txt > cpu.log + unset POPLAR_IPUMODEL + echo "paddle normal test with CPU...Done" + + echo "Compare results..." + python3.7 -c """${allclose_script}""" + if [ $? -eq 0 ];then + echo "Compare results...Done" + else + echo "Error occurs. Please check ipu.log, cpu.log, ipu_res.txt and cpu_res.txt" + exit 0 + fi + done + done +done + +if [ -f "ipu.log" ]; then + rm "ipu.log" +fi +if [ -f "cpu.log" ]; then + rm "cpu.log" +fi +if [ -f "ipu_res.txt" ]; then + rm "ipu_res.txt" +fi +if [ -f "cpu_res.txt" ]; then + rm "cpu_res.txt" +fi diff --git a/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_data_parallel_ipu.py b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_data_parallel_ipu.py new file mode 100644 index 0000000000000..891aa501c5079 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_data_parallel_ipu.py @@ -0,0 +1,193 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import sys +import os +import random +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + +mpi_comm = None + + +@unittest.skip('Disable distributed tests on auto CI.') +class TestBase(IPUOpTest): + + def set_attrs(self, enable_ipu, optimizer, log, onchip=False, rts=False): + self.ipu_options = { + "enable_pipelining": True, + "batches_per_step": 1, + "enable_gradient_accumulation": True, + "accumulation_factor": 4, + "enable_replicated_graphs": True, + "replicated_graph_count": 2, + "location_optimizer": { + "on_chip": onchip, + "use_replicated_tensor_sharding": rts + } + } + + self.cpu_bs = 16 + self.ipu_bs = 1 + self.optimizer = optimizer + self.log = log + self.enable_ipu = enable_ipu + + def test(self): + seed = 2021 + np.random.seed(seed) + random.seed(seed) + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = seed + startup_prog.random_seed = seed + + bs = self.ipu_bs if self.enable_ipu else self.cpu_bs + data = np.random.rand(1, 3, 10, 10).astype(np.float32) + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + image = paddle.static.data(name='image', + shape=[bs, 3, 10, 10], + dtype='float32') + with paddle.static.ipu_shard_guard(index=0, stage=0): + conv1 = paddle.static.nn.conv2d(image, + num_filters=3, + filter_size=3, + bias_attr=False) + with paddle.static.ipu_shard_guard(index=1, stage=1): + conv2 = paddle.static.nn.conv2d(conv1, + num_filters=3, + filter_size=3, + bias_attr=False) + # should consider influence of bs + loss = paddle.mean(conv2) + + if self.optimizer == 'sgd': + opt = paddle.optimizer.SGD(learning_rate=1e-2) + elif self.optimizer == 'adam': + opt = paddle.optimizer.Adam(learning_rate=1e-2) + elif self.optimizer == 'lamb': + opt = paddle.optimizer.Lamb(learning_rate=1e-2) + else: + raise Exception('optimizer must be sgd, adam or lamb') + + opt.minimize(loss) + + if self.enable_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + executor = paddle.static.Executor(place) + executor.run(startup_prog) + + if self.enable_ipu: + feed_list = [image.name] + fetch_list = [loss.name] + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config( + num_ipus=2 * self.ipu_options['replicated_graph_count'], + is_training=True, + enable_manual_shard=True) + ipu_strategy.set_options(self.ipu_options) + ipu_strategy.set_options({ + "enable_distribution": + True, + "enable_distributed_replicated_graphs": + True, + "global_replica_offset": + int(os.environ.get("PADDLE_TRAINER_ID")) * 2, + "global_replication_factor": + 4 + }) + program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy).compile( + feed_list, fetch_list) + feed = { + "image": + np.tile(data, [ + self.ipu_options['replicated_graph_count'] * + self.ipu_options['batches_per_step'] * + self.ipu_options['accumulation_factor'], 1, 1, 1 + ]) + } + + else: + program = main_prog + feed = {"image": np.tile(data, [self.cpu_bs, 1, 1, 1])} + + epoch = 10 + if not self.enable_ipu: + # global replication factor + epoch *= 4 + epoch *= self.ipu_options['batches_per_step'] + epoch *= self.ipu_options['accumulation_factor'] + epoch = epoch / (self.cpu_bs / self.ipu_bs) + + results = [] + for i in range(int(epoch)): + res = executor.run(program, feed=feed, fetch_list=[loss]) + if self.enable_ipu: + res = mpi_comm.gather(res, root=0) + 
results.append(res) + if self.enable_ipu: + if int(os.environ.get("PADDLE_TRAINER_ID")) == 0: + np.savetxt(self.log, np.array(results).flatten()) + else: + np.savetxt(self.log, np.array(results).flatten()) + + +if __name__ == "__main__": + paddle.enable_static() + # Run distributed tests + if len(sys.argv) == 5: + from mpi4py import MPI + + DISTRIBUTED_COMM = MPI.COMM_WORLD + + def _get_comm(): + global DISTRIBUTED_COMM + if DISTRIBUTED_COMM is None: + raise RuntimeError( + "Distributed Commumication not setup. Please run setup_comm(MPI.COMM_WORLD) first." + ) + return DISTRIBUTED_COMM + + mpi_comm = _get_comm() + + optimizer = sys.argv[1] + log = sys.argv[2] + onchip = True if sys.argv[3] == "True" else False + rts = True if sys.argv[4] == "True" else False + test = TestBase() + test.set_attrs(enable_ipu=True, + optimizer=optimizer, + log=log, + onchip=onchip, + rts=rts) + test.test() + # Run cpu tests for compare + elif len(sys.argv) == 3: + test = TestBase() + test.set_attrs(enable_ipu=False, optimizer=sys.argv[1], log=sys.argv[2]) + test.test() + else: + raise ValueError( + "Only support 3 or 5 args. 3 for cpu test, 5 for ipu distributed test" + ) diff --git a/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_pod128_sample.py b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_pod128_sample.py new file mode 100644 index 0000000000000..f81ed48f04ffd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_pod128_sample.py @@ -0,0 +1,115 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+''' +python3.7 -m paddle.distributed.launch \ +--devices=128 \ +ipu \ +--hosts=host1,host2 \ +--ipus_per_host=2 \ +--nproc_per_host=1 \ +--ipu_partition=pod128 \ +--vipu_server=lr17-1-ctrl \ +python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_pod128_ipu.py +Equal to: +poprun \ +--host=localhost,host2 \ +--num-instances=2 \ +--num-replicas=64 \ +--ipus-per-replica=2 \ +--print-topology=yes \ +--vipu-partition=pod128_bert \ +--vipu-server-host=lr17-1-ctrl \ +--update-partition=yes \ +python3.7 python/paddle/fluid/tests/unittests/ipu/disabled/test_dist_pod128_ipu.py +''' + +import os +import numpy as np +import paddle + + +def TestDistTraining(): + paddle.enable_static() + + attrs = {"size": [128, 16], "padding_idx": -1, "dtype": 'float32'} + + scope = paddle.fluid.core.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = 42 + startup_prog.random_seed = 42 + + np.random.seed(42) + input_data = np.random.uniform(0, 127, size=[128, 3, 2, 1]).astype(np.int32) + + with paddle.fluid.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data(name="x", shape=[3, 2, 1], dtype='int64') + with paddle.static.ipu_shard_guard(index=0, stage=0): + out = paddle.fluid.layers.embedding(x, **attrs) + with paddle.static.ipu_shard_guard(index=1, stage=1): + loss = paddle.mean(out) + opt = paddle.optimizer.Adam(learning_rate=1e-1) + opt.minimize(loss) + + feed_list = ["x"] + fetch_list = [loss.name] + + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(num_ipus=64, + is_training=True, + enable_manual_shard=True) + ipu_strategy.set_pipelining_config( + enable_pipelining=True, + batches_per_step=1, + enable_gradient_accumulation=True, + accumulation_factor=4) + ipu_strategy.set_options({ + "enable_distribution": + True, + "enable_replicated_graphs": + True, + "replicated_graph_count": + 32, + "enable_distributed_replicated_graphs": + True, + "global_replica_offset": + # Paddle : int(os.environ.get("PADDLE_TRAINER_ID")) * 32 + # PopRun : int(os.environ.get("POPDIST_REPLICA_INDEX_OFFSET")) + int(os.environ.get("PADDLE_TRAINER_ID")) * 32, + "global_replication_factor": + 64, + "location_optimizer": { + "on_chip": False, + "use_replicated_tensor_sharding": True + } + }) + + ipu_program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy) + program = ipu_program.compile(feed_list, fetch_list) + + for i in range(10): + res = exe.run(program, + feed={"x": input_data}, + fetch_list=fetch_list) + print("index: {}, result: {}".format(i, res)) + + +if __name__ == "__main__": + TestDistTraining() diff --git a/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py new file mode 100644 index 0000000000000..d42977b5962d3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py @@ -0,0 +1,178 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Single host: +python3.7 -m paddle.distributed.launch \ +--devices=4 \ +ipu \ +--hosts=localhost \ +--nproc_per_host=2 \ +--ipus_per_replica=1 \ +--ipu_partition=pod64 \ +--vipu_server=10.137.96.62 \ +python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py +Equal to: +poprun \ +--host=localhost \ +--num-instances=2 \ +--num-replicas=4 \ +--ipus-per-replica=1 \ +--print-topology=yes \ +python3.7 python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py +''' +''' +Multi hosts: +python3.7 -m paddle.distributed.launch \ +--devices=4 \ +ipu \ +--hosts=host1,host2 \ +--nproc_per_host=1 \ +--ipus_per_replica=1 \ +--ipu_partition=pod64 \ +--vipu_server=10.137.96.62 \ +python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py +Equal to: +poprun \ +--host=host1,host2 \ +--num-instances=2 \ +--num-replicas=4 \ +--ipus-per-replica=1 \ +--print-topology=yes \ +python3.7 python/paddle/fluid/tests/unittests/ipu/distributed/test_dist_sample.py +''' + +import os +import sys +import paddle +import numpy as np + +mpi_comm = None + + +def Test(use_dist, file_name): + paddle.enable_static() + + attrs = {"size": [128, 16], "padding_idx": -1, "dtype": 'float32'} + + scope = paddle.fluid.core.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = 42 + startup_prog.random_seed = 42 + + with paddle.fluid.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data(name="x", shape=[3, 2, 1], dtype='int64') + + out = paddle.fluid.layers.embedding(x, **attrs) + loss = paddle.mean(out) + opt = paddle.optimizer.Adam(learning_rate=1e-1) + opt.minimize(loss) + + feed_list = ["x"] + fetch_list = [loss.name] + + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + ipu_strategy = paddle.static.IpuStrategy() + if use_dist: + ipu_strategy.set_graph_config(num_ipus=2, is_training=True) + # Set distributed envs + ipu_strategy.set_options({ + "enable_distribution": + True, + "enable_replicated_graphs": + True, + "replicated_graph_count": + 2, + "enable_distributed_replicated_graphs": + True, + "global_replica_offset": + int(os.environ.get("PADDLE_TRAINER_ID")) * 2, + "global_replication_factor": + 4 + }) + else: + ipu_strategy.set_graph_config(num_ipus=4, is_training=True) + ipu_strategy.set_options({ + "enable_replicated_graphs": True, + "replicated_graph_count": 4, + }) + + ipu_program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy) + program = ipu_program.compile(feed_list, fetch_list) + + if use_dist: + if os.environ.get("PADDLE_TRAINER_ID") == "0": + input_data = np.concatenate([ + np.array([[[1], [3]], [[2], [4]], + [[4], [127]]]).astype(np.int32), + np.array([[[1], [3]], [[2], [4]], + [[4], [127]]]).astype(np.int32) + ]) + else: + input_data = np.concatenate([ + np.array([[[8], [60]], [[50], [77]], + [[90], [13]]]).astype(np.int32), + np.array([[[8], [60]], [[50], [77]], + [[90], [13]]]).astype(np.int32) + ]) + else: + input_data = np.concatenate([ + np.array([[[1], [3]], [[2], [4]], + [[4], 
[127]]]).astype(np.int32), + np.array([[[1], [3]], [[2], [4]], + [[4], [127]]]).astype(np.int32), + np.array([[[8], [60]], [[50], [77]], + [[90], [13]]]).astype(np.int32), + np.array([[[8], [60]], [[50], [77]], + [[90], [13]]]).astype(np.int32) + ]) + feed_data = {"x": input_data} + + for step in range(10): + res = exe.run(program, feed=feed_data, fetch_list=fetch_list) + + if use_dist: + res = mpi_comm.gather(res) + if os.getenv("PADDLE_TRAINER_ID") == "0": + np.savetxt(file_name, np.array(res).flatten()) + else: + np.savetxt(file_name, np.array(res).flatten()) + + +if __name__ == "__main__": + file_name = sys.argv[1] + + use_dist = False + if 'PADDLE_TRAINER_ID' in os.environ: + from mpi4py import MPI + + DISTRIBUTED_COMM = MPI.COMM_WORLD + + def _get_comm(): + global DISTRIBUTED_COMM + if DISTRIBUTED_COMM is None: + raise RuntimeError( + "Distributed Commumication not setup. Please run setup_comm(MPI.COMM_WORLD) first." + ) + return DISTRIBUTED_COMM + + mpi_comm = _get_comm() + use_dist = True + + Test(use_dist, file_name) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py index 3b2034ebe836c..af03480fbf698 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py @@ -86,7 +86,7 @@ def set_data_feed(self): self.feed_fp32 = {'in_0': data.astype(np.float32)} self.feed_fp16 = {'in_0': data.astype(np.float16)} data = np.random.choice([True, False], size=(2, 3, 1)) - self.assign_bool = data.astype(np.bool) + self.assign_bool = data.astype(np.bool_) @IPUOpTest.static_graph def build_model(self): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py index f82acb204f0a2..003c84c4c5ab0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py @@ -29,10 +29,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) ] - - for x in attrs[0]["decrease_axis"]: - if x < 0: - return False + out_shape = list(inputs['input_data'].shape) for x in range(len(attrs[0]["axes"])): start = 0 end = 0 @@ -48,15 +45,20 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: end = attrs[0]["ends"][x] start = max(0, start) end = max(0, end) + out_shape[attrs[0]["axes"][x]] = end - start if start >= end: return False - + for x in attrs[0]["decrease_axis"]: + if x < 0: + return False + if (out_shape[x] != 1): + return False return True def sample_program_configs(self): def generate_input1(attrs: List[Dict[str, Any]]): - return np.ones([6, 6, 64, 64]).astype(np.float32) + return np.random.random([6, 6, 64, 64]).astype(np.float32) for axes in [[0, 1], [1, 3], [2, 3]]: for starts in [[0, 1]]: diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py index 38ca6963e94b2..e8c283acc3b8f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py @@ -73,13 +73,13 @@ def sample_program_configs(self): def generate_input1(attrs: List[Dict[str, Any]], batch): if self.dims == 4: - return np.ones([batch, 3, 3, 
24]).astype(np.float32) + return np.random.random([batch, 3, 3, 24]).astype(np.float32) elif self.dims == 3: - return np.ones([batch, 3, 24]).astype(np.float32) + return np.random.random([batch, 3, 24]).astype(np.float32) elif self.dims == 2: - return np.ones([batch, 24]).astype(np.float32) + return np.random.random([batch, 24]).astype(np.float32) elif self.dims == 1: - return np.ones([24]).astype(np.float32) + return np.random.random([24]).astype(np.float32) def generate_AxisTensor(attrs: List[Dict[str, Any]]): return np.ones([1]).astype(np.int32) @@ -162,25 +162,33 @@ def sample_predictor_configs( def generate_dynamic_shape(attrs): if self.dims == 4: self.dynamic_shape.min_input_shape = { - "split_input": [1, 3, 3, 24] + "split_input": [1, 3 - 1, 3 - 1, 24 - 1] } self.dynamic_shape.max_input_shape = { - "split_input": [9, 3, 3, 24] + "split_input": [9, 3 + 1, 3 + 1, 24 + 1] } self.dynamic_shape.opt_input_shape = { "split_input": [1, 3, 3, 24] } elif self.dims == 3: - self.dynamic_shape.min_input_shape = {"split_input": [1, 3, 24]} - self.dynamic_shape.max_input_shape = {"split_input": [9, 3, 24]} + self.dynamic_shape.min_input_shape = { + "split_input": [1, 3 - 1, 24 - 1] + } + self.dynamic_shape.max_input_shape = { + "split_input": [9, 3 + 1, 24 + 1] + } self.dynamic_shape.opt_input_shape = {"split_input": [1, 3, 24]} elif self.dims == 2: - self.dynamic_shape.min_input_shape = {"split_input": [1, 24]} - self.dynamic_shape.max_input_shape = {"split_input": [9, 24]} + self.dynamic_shape.min_input_shape = { + "split_input": [1, 24 - 1] + } + self.dynamic_shape.max_input_shape = { + "split_input": [9, 24 + 1] + } self.dynamic_shape.opt_input_shape = {"split_input": [1, 24]} elif self.dims == 1: - self.dynamic_shape.min_input_shape = {"split_input": [24]} - self.dynamic_shape.max_input_shape = {"split_input": [24]} + self.dynamic_shape.min_input_shape = {"split_input": [24 - 1]} + self.dynamic_shape.max_input_shape = {"split_input": [24 + 1]} self.dynamic_shape.opt_input_shape = {"split_input": [24]} def clear_dynamic_shape(): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k.py new file mode 100644 index 0000000000000..28509d42ee30b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k.py @@ -0,0 +1,136 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import unittest +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertActivationTest(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): + if dims == 1: + return np.random.random([32]).astype(np.float32) + elif dims == 2: + return np.random.random([3, 32]).astype(np.float32) + elif dims == 3: + return np.random.random([3, 32, 32]).astype(np.float32) + else: + return np.random.random([batch, 3, 32, 32]).astype(np.float32) + + for dims in [2, 3, 4, 5]: + for batch in [1]: + for k in [1, 3]: + self.dims = dims + dics = [{"k": k}] + ops_config = [{ + "op_type": "top_k", + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": { + "Out": ["output_data"], + "Indices": ["indices_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": + TensorConfig(data_gen=partial( + generate_input1, dims, batch, dics)) + }, + outputs=["output_data", "indices_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(attrs): + if self.dims == 1: + self.dynamic_shape.min_input_shape = {"input_data": [1]} + self.dynamic_shape.max_input_shape = {"input_data": [64]} + self.dynamic_shape.opt_input_shape = {"input_data": [32]} + elif self.dims == 2: + self.dynamic_shape.min_input_shape = {"input_data": [1, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32]} + elif self.dims == 3: + self.dynamic_shape.min_input_shape = {"input_data": [1, 16, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32, 32]} + else: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 3, 16, 16] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 3, 32, 32] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 3, 32, 32] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if self.dims == 1: + return 0, 4 + return 1, 3 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + ## for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + + def test(self): + self.run_test() + 
+ +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k_v2.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k_v2.py new file mode 100644 index 0000000000000..651cc00d2cd7a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k_v2.py @@ -0,0 +1,153 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import unittest +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertActivationTest(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + inputs = program_config.inputs + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + if len(inputs['input_data'].shape) <= attrs[0]['axis']: + return False + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): + if dims == 1: + return np.random.random([3]).astype(np.float32) + elif dims == 2: + return np.random.random([3, 32]).astype(np.float32) + elif dims == 3: + return np.random.random([3, 32, 32]).astype(np.float32) + else: + return np.random.random([batch, 32, 32, 32]).astype(np.float32) + + for dims in [1, 2, 3, 4]: + for batch in [1, 4]: + for k in [1, 3]: + for axis in [-1, 1, 2, 3]: + for largest in [True, False]: + for sort in [True, False]: + self.dims = dims + self.sort = sort + dics = [{ + "k": k, + "axis": axis, + "largest": largest, + "sorted": sort + }] + ops_config = [{ + "op_type": "top_k_v2", + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": { + "Out": ["output_data"], + "Indices": ["indices_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": + TensorConfig(data_gen=partial( + generate_input1, dims, batch, dics)) + }, + outputs=["output_data", "indices_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(attrs): + if self.dims == 1: + self.dynamic_shape.min_input_shape = {"input_data": [1]} + self.dynamic_shape.max_input_shape = {"input_data": [64]} + self.dynamic_shape.opt_input_shape = {"input_data": [32]} + elif self.dims == 2: + self.dynamic_shape.min_input_shape = {"input_data": [1, 1]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 64]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 10]} + elif self.dims == 3: + self.dynamic_shape.min_input_shape = {"input_data": [1, 1, 1]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 64, 64]} + 
self.dynamic_shape.opt_input_shape = {"input_data": [3, 10, 10]} + else: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 3, 16, 16] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 32, 32, 32] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 3, 32, 32] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if self.dims == 1: + return 0, 4 + if self.sort == False: + return 0, 4 + return 1, 3 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt index 7ed1529ea4c6b..56ad5f710163a 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt @@ -6,7 +6,7 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach(TEST_OP) +endforeach() set_tests_properties(test_concat_mkldnn_op PROPERTIES TIMEOUT 120) set_tests_properties(test_conv3d_mkldnn_op PROPERTIES TIMEOUT 120) set_tests_properties(test_flags_mkldnn_ops_on_off PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py index 67d06e7b22c1b..04d6be1300154 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py @@ -64,7 +64,7 @@ def init_data(self): B_data = np.random.uniform(-127, 127, (5, 20)).astype(np.float32) - quant_B = np.round(B_data * self.scale_y[0]).astype(np.int) + quant_B = np.round(B_data * self.scale_y[0]).astype(np.int_) output = np.dot(A_data, quant_B) scale_output_shift = (self.scale_out) / \ @@ -136,7 +136,7 @@ def init_data(self): A_data_reshape = A_data.reshape(3 * 4, 4 * 3) B_data_reshape = B_data.reshape(2 * 6, 1 * 2 * 3) - quant_B = np.round(B_data_reshape * self.scale_y[0]).astype(np.int) + quant_B = np.round(B_data_reshape * self.scale_y[0]).astype(np.int_) output = np.dot(A_data_reshape, quant_B) scale_output_shift = (self.scale_out) / \ diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_bf16_mkldnn_op.py index ca61f961b7a0a..9986726b3a601 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_bf16_mkldnn_op.py @@ -47,7 +47,7 @@ 
def setUp(self): self.shape = self.get_x_shape() self.axis = self.get_axis() - x = np.random.uniform(0.1, 1, self.shape).astype(np.float) + x = np.random.uniform(0.1, 1, self.shape).astype(np.float64) out = convert_float_to_uint16( np.apply_along_axis(stable_softmax, self.axis, x)) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py new file mode 100644 index 0000000000000..b8c31578099e1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py @@ -0,0 +1,663 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys + +sys.path.append('..') +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.nn.functional import interpolate +import paddle + +paddle.enable_static() + + +def bilinear_interp_np(input, + out_h, + out_w, + scale_w=0, + scale_h=0, + out_size=None, + actual_shape=None, + align_corners=True, + align_mode=0, + data_layout='NCHW'): + """bilinear interpolation implement in shape [N, C, H, W]""" + if data_layout == "NHWC": + input = np.transpose(input, (0, 3, 1, 2)) # NHWC => NCHW + if out_size is not None: + out_h = out_size[0] + out_w = out_size[1] + if actual_shape is not None: + out_h = actual_shape[0] + out_w = actual_shape[1] + batch_size, channel, in_h, in_w = input.shape + + ratio_h = ratio_w = 0.0 + if out_h > 1: + if (align_corners): + ratio_h = (in_h - 1.0) / (out_h - 1.0) + else: + if scale_h > 0: + ratio_h = 1.0 / scale_h + else: + ratio_h = 1.0 * in_h / out_h + if out_w > 1: + if (align_corners): + ratio_w = (in_w - 1.0) / (out_w - 1.0) + else: + if scale_w > 0: + ratio_w = 1.0 / scale_w + else: + ratio_w = 1.0 * in_w / out_w + + out = np.zeros((batch_size, channel, out_h, out_w)) + + for i in range(out_h): + if (align_mode == 0 and not align_corners): + h = int(ratio_h * (i + 0.5) - 0.5) + else: + h = int(ratio_h * i) + + h = max(0, h) + hid = 1 if h < in_h - 1 else 0 + if (align_mode == 0 and not align_corners): + idx_src_h = max(ratio_h * (i + 0.5) - 0.5, 0) + h1lambda = idx_src_h - h + else: + h1lambda = ratio_h * i - h + h2lambda = 1.0 - h1lambda + for j in range(out_w): + if (align_mode == 0 and not align_corners): + w = int(ratio_w * (j + 0.5) - 0.5) + else: + w = int(ratio_w * j) + w = max(0, w) + wid = 1 if w < in_w - 1 else 0 + if (align_mode == 0 and not align_corners): + idx_src_w = max(ratio_w * (j + 0.5) - 0.5, 0) + w1lambda = idx_src_w - w + else: + w1lambda = ratio_w * j - w + w2lambda = 1.0 - w1lambda + + out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] + + w1lambda*input[:, :, h, w+wid]) + \ + h1lambda*(w2lambda*input[:, :, h+hid, w] + + w1lambda*input[:, :, h+hid, w+wid]) + + if data_layout == "NHWC": + out = np.transpose(out, (0, 2, 3, 1)) # NCHW => NHWC + + return out.astype(input.dtype) + + +class 
TestBilinearInterpOp(OpTest): + + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.out_size = None + self.actual_shape = None + self.data_layout = 'NCHW' + self.init_test_case() + self.dtype = "float32" + self.op_type = "bilinear_interp_v2" + input_np = np.random.random(self.input_shape).astype(self.dtype) + + if self.data_layout == "NCHW": + in_h = self.input_shape[2] + in_w = self.input_shape[3] + else: + in_h = self.input_shape[1] + in_w = self.input_shape[2] + scale_h = 0 + scale_w = 0 + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0.: + scale_h = scale_w = float(self.scale) + if isinstance(self.scale, list) and len(self.scale) == 1: + scale_w = scale_h = self.scale[0] + elif isinstance(self.scale, list) and len(self.scale) > 1: + scale_w = self.scale[1] + scale_h = self.scale[0] + out_h = int(in_h * scale_h) + out_w = int(in_w * scale_w) + else: + out_h = self.out_h + out_w = self.out_w + + output_np = bilinear_interp_np(input_np, out_h, out_w, 0, 0, + self.out_size, self.actual_shape, + self.align_corners, self.align_mode, + self.data_layout) + self.inputs = {'X': input_np} + if self.out_size is not None: + self.inputs['OutSize'] = self.out_size + if self.actual_shape is not None: + self.inputs['OutSize'] = self.actual_shape + + self.attrs = { + 'out_h': self.out_h, + 'out_w': self.out_w, + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + 'align_mode': self.align_mode, + 'data_layout': self.data_layout + } + + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0.: + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + self.scale = [self.scale[0], self.scale[0]] + self.attrs['scale'] = self.scale + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out', in_place=True) + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 5] + self.out_h = 2 + self.out_w = 2 + self.scale = 0. + self.out_size = np.array([3, 3]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpCase1(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [4, 1, 7, 8] + self.out_h = 1 + self.out_w = 1 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpCase2(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpCase3(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestBilinearInterpCase4(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [4, 1, 7, 8] + self.out_h = 1 + self.out_w = 1 + self.scale = 0. 
+ self.out_size = np.array([2, 2]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpCase5(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.scale = 0. + self.out_size = np.array([11, 11]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpCase6(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.out_size = np.array([65, 33]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpCase7(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.scale = [2.0, 0.5] + self.align_corners = False + self.align_mode = 1 + + +class TestBilinearInterpSame(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 32, 64] + self.out_h = 32 + self.out_w = 64 + self.scale = 0. + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpActualShape(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.scale = 0. + self.out_size = np.array([66, 40]).astype("int32") + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpDataLayout(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 5, 5, 3] + self.out_h = 2 + self.out_w = 2 + self.scale = 0. + self.out_size = np.array([3, 3]).astype("int32") + self.align_corners = True + self.align_mode = 1 + self.data_layout = "NHWC" + + +class TestBilinearInterpOtherMethod1(TestBilinearInterpOp): + + def set_align_mode(self): + self.align_corners = False + self.align_mode = 1 + + +class TestBilinearInterpWithMethod2(TestBilinearInterpOp): + + def set_align_mode(self): + self.align_corners = False + self.align_mode = 0 + + +class TestBilinearInterpWithMethod3(TestBilinearInterpOp): + + def set_align_mode(self): + self.align_corners = True + self.align_mode = 0 + + +class TestBilinearInterpScale1(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 7] + self.out_h = 60 + self.out_w = 25 + self.scale = 2. + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpScale2(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 7] + self.out_h = 60 + self.out_w = 25 + self.scale = 1. 
+ self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpScale3(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 7] + self.out_h = 60 + self.out_w = 25 + self.scale = 1.5 + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpScale4(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 7] + self.out_h = 60 + self.out_w = 25 + self.scale = [1.5, 0.5] + self.align_corners = True + self.align_mode = 1 + + +class TestBilinearInterpZero(TestBilinearInterpOp): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 7] + self.out_h = 60 + self.out_w = 25 + self.scale = 0.2 + self.align_corners = False + self.align_mode = 0 + + +class TestBilinearInterpOp_attr_tensor(OpTest): + + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.out_size = None + self.actual_shape = None + self.init_test_case() + self.op_type = "bilinear_interp_v2" + self.shape_by_1Dtensor = False + self.scale_by_1Dtensor = False + self.attrs = { + 'interp_method': self.interp_method, + 'align_corners': self.align_corners, + } + + input_np = np.random.random(self.input_shape).astype("float32") + self.inputs = {'X': input_np} + + if self.scale_by_1Dtensor: + self.inputs['Scale'] = np.array([self.scale]).astype("float32") + elif self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + scale_h = scale_w = float(self.scale) + if isinstance(self.scale, list) and len(self.scale) == 1: + scale_w = scale_h = self.scale[0] + elif isinstance(self.scale, list) and len(self.scale) > 1: + scale_w = self.scale[1] + scale_h = self.scale[0] + out_h = int(self.input_shape[2] * scale_h) + out_w = int(self.input_shape[3] * scale_w) + else: + out_h = self.out_h + out_w = self.out_w + + if self.shape_by_1Dtensor: + self.inputs['OutSize'] = self.out_size + elif self.out_size is not None: + size_tensor = [] + for index, ele in enumerate(self.out_size): + size_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + self.inputs['SizeTensor'] = size_tensor + + self.attrs['out_h'] = self.out_h + self.attrs['out_w'] = self.out_w + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + self.scale = [self.scale[0], self.scale[0]] + self.attrs['scale'] = self.scale + output_np = bilinear_interp_np(input_np, out_h, out_w, 0, 0, + self.out_size, self.actual_shape, + self.align_corners) + self.outputs = {'Out': output_np} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out', in_place=True) + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [2, 3, 5, 5] + self.out_h = 3 + self.out_w = 3 + self.scale = 0. + self.out_size = [3, 3] + self.align_corners = True + + +# out_size is a 1-D tensor +class TestBilinearInterp_attr_tensor_Case1(TestBilinearInterpOp_attr_tensor): + + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.scale = 0. 
+        self.out_size = [8, 12]
+        self.align_corners = True
+
+
+# scale is a 1-D tensor
+class TestBilinearInterp_attr_tensor_Case2(TestBilinearInterpOp_attr_tensor):
+
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+        self.shape_by_1Dtensor = True
+
+
+# scale is a 1-D tensor
+class TestBilinearInterp_attr_tensor_Case3(TestBilinearInterpOp_attr_tensor):
+
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 2.0
+        self.out_size = None
+        self.align_corners = True
+        self.scale_by_1Dtensor = True
+
+
+# TODO: comment this test out for now until bilinear_interp_op is added.
+# class TestBilinearInterpOpAPI(unittest.TestCase):
+#     def test_case(self):
+#         x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32")
+
+#         dim = fluid.data(name="dim", shape=[1], dtype="int32")
+#         shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32")
+#         actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32")
+#         scale_tensor = fluid.data(
+#             name="scale_tensor", shape=[1], dtype="float32")
+
+#         out1 = fluid.layers.resize_bilinear(x, out_shape=[12, 12])
+#         out2 = fluid.layers.resize_bilinear(x, out_shape=[12, dim])
+#         out3 = fluid.layers.resize_bilinear(x, out_shape=shape_tensor)
+#         out4 = fluid.layers.resize_bilinear(
+#             x, out_shape=[4, 4], actual_shape=actual_size)
+#         out5 = fluid.layers.resize_bilinear(x, scale=scale_tensor)
+
+#         x_data = np.random.random((2, 3, 6, 6)).astype("float32")
+#         dim_data = np.array([12]).astype("int32")
+#         shape_data = np.array([12, 12]).astype("int32")
+#         actual_size_data = np.array([12, 12]).astype("int32")
+#         scale_data = np.array([2.0]).astype("float32")
+
+#         if core.is_compiled_with_mlu():
+#             place = paddle.device.MLUPlace(0)
+#         else:
+#             place = core.CPUPlace()
+#         exe = fluid.Executor(place)
+#         exe.run(fluid.default_startup_program())
+#         results = exe.run(fluid.default_main_program(),
+#                           feed={
+#                               "x": x_data,
+#                               "dim": dim_data,
+#                               "shape_tensor": shape_data,
+#                               "actual_size": actual_size_data,
+#                               "scale_tensor": scale_data
+#                           },
+#                           fetch_list=[out1, out2, out3, out4, out5],
+#                           return_numpy=True)

+#         expect_res = bilinear_interp_np(
+#             x_data, out_h=12, out_w=12, align_corners=True)
+#         for res in results:
+#             self.assertTrue(np.allclose(res, expect_res))
+
+
+class TestBilinearInterpOpAPI_dy(unittest.TestCase):
+
+    def test_case(self):
+        import paddle
+        if core.is_compiled_with_mlu():
+            place = paddle.device.MLUPlace(0)
+        else:
+            place = core.CPUPlace()
+        with fluid.dygraph.guard(place):
+            input_data = np.random.random((2, 3, 6, 6)).astype("float32")
+            input_x = paddle.to_tensor(input_data)
+            expect_res = bilinear_interp_np(input_data,
+                                            out_h=12,
+                                            out_w=12,
+                                            align_corners=False)
+            out = interpolate(x=input_x,
+                              size=[12, 12],
+                              mode="bilinear",
+                              align_corners=False)
+            self.assertTrue(np.allclose(out.numpy(), expect_res))
+
+
+class TestBilinearInterpOpAPI_dy2(unittest.TestCase):
+
+    def test_case(self):
+        import paddle
+        if core.is_compiled_with_mlu():
+            place = paddle.device.MLUPlace(0)
+        else:
+            place = core.CPUPlace()
+        with fluid.dygraph.guard(place):
+            input_data = np.random.random((2, 3, 6, 6)).astype("float32")
+            size_np = np.array([12, 12]).astype("int64")
+            input_x = paddle.to_tensor(input_data)
+            
size = paddle.to_tensor(size_np) + expect_res = bilinear_interp_np(input_data, + out_h=12, + out_w=12, + align_corners=False) + out = interpolate(x=input_x, + size=size, + mode="bilinear", + align_corners=False) + self.assertTrue(np.allclose(out.numpy(), expect_res)) + + +class TestBilinearInterpOpAPI_dy3(unittest.TestCase): + + def test_case(self): + import paddle + if core.is_compiled_with_mlu(): + place = paddle.device.MLUPlace(0) + else: + place = core.CPUPlace() + with fluid.dygraph.guard(place): + input_data = np.random.random((2, 3, 6, 6)).astype("float32") + size_1 = np.array([12]).astype("int64") + input_x = paddle.to_tensor(input_data) + size = paddle.to_tensor(size_1) + expect_res = bilinear_interp_np(input_data, + out_h=12, + out_w=12, + align_corners=False) + out = interpolate(x=input_x, + size=[size, size], + mode="bilinear", + align_corners=False) + self.assertTrue(np.allclose(out.numpy(), expect_res)) + + +class TestBilinearInterpOpAPI_dy4(unittest.TestCase): + + def test_case(self): + import paddle + if core.is_compiled_with_mlu(): + place = paddle.device.MLUPlace(0) + else: + place = core.CPUPlace() + with fluid.dygraph.guard(place): + input_data = np.random.random((2, 3, 6, 6)).astype("float32") + scale_np = np.array([2, 2]).astype("int64") + input_x = paddle.to_tensor(input_data) + scale = paddle.to_tensor(scale_np) + expect_res = bilinear_interp_np(input_data, + out_h=12, + out_w=12, + align_corners=False) + out = interpolate(x=input_x, + scale_factor=scale, + mode="bilinear", + align_corners=False) + + self.assertTrue(np.allclose(out.numpy(), expect_res)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_conv2d_op_depthwise_conv_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_conv2d_op_depthwise_conv_mlu.py new file mode 100644 index 0000000000000..8d239732e7342 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_conv2d_op_depthwise_conv_mlu.py @@ -0,0 +1,238 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys + +sys.path.append("..") + +import paddle + +paddle.enable_static() +import paddle.fluid.core as core +import paddle.fluid as fluid +from op_test import OpTest +from paddle.fluid import Program, program_guard +from test_conv2d_op_mlu import TestConv2DOp, TestConv2DOp_v2, create_test_padding_SAME_class, create_test_padding_VALID_class, create_test_channel_last_class, create_test_fp16_class + +#----------------TestDepthwiseConv ----- + + +class TestDepthwiseConv(TestConv2DOp): + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConv2(TestConv2DOp): + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConv3(TestConv2DOp): + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConvandFuse(TestConv2DOp): + + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConv2andFuse(TestConv2DOp): + + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConv3andFuse(TestConv2DOp): + + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConv_AsyPadding(TestConv2DOp_v2): + + def init_test_case(self): + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): + self.pad = [1, 1, 0, 1] + self.padding_algorithm = "EXPLICIT" + + +class TestDepthwiseConv2_AsyPadding(TestConv2DOp_v2): + + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): 
+ self.pad = [0, 1, 0, 2] + self.padding_algorithm = "EXPLICIT" + + +class TestDepthwiseConv3_AsyPadding(TestConv2DOp_v2): + + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): + self.pad = [1, 1, 0, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestDepthwiseConvandFuse_AsyPadding(TestConv2DOp_v2): + + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): + self.pad = [2, 1, 2, 3] + self.padding_algorithm = "EXPLICIT" + + +class TestDepthwiseConv2andFuse_AsyPadding(TestConv2DOp_v2): + + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): + self.pad = [1, 1, 1, 2] + self.padding_algorithm = "EXPLICIT" + + +class TestDepthwiseConv3andFuse_AsyPadding(TestConv2DOp_v2): + + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): + self.pad = [1, 2, 0, 2] + self.padding_algorithm = "EXPLICIT" + + +# depthwise conv2d + +create_test_padding_SAME_class(TestDepthwiseConv_AsyPadding) +create_test_padding_SAME_class(TestDepthwiseConvandFuse_AsyPadding) + +create_test_padding_VALID_class(TestDepthwiseConv_AsyPadding) +create_test_padding_VALID_class(TestDepthwiseConvandFuse_AsyPadding) + +# channel last + +create_test_channel_last_class(TestDepthwiseConv_AsyPadding) +create_test_channel_last_class(TestDepthwiseConvandFuse_AsyPadding) + +create_test_fp16_class(TestDepthwiseConv_AsyPadding) +create_test_fp16_class(TestDepthwiseConvandFuse_AsyPadding) + +# TODO(MLU): Depthwise opration does not support dilation yet +# it will throw an error of CNNL_STATUS_NOT_SUPPORTED. + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_max_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_max_op_mlu.py new file mode 100644 index 0000000000000..b4c74a99d85b7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_max_op_mlu.py @@ -0,0 +1,241 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.core import ops
+
+paddle.enable_static()
+SEED = 2022
+
+
+class TestElementwiseMax(OpTest):
+
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_max"
+
+        self.init_dtype()
+        np.random.seed(SEED)
+        x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
+        y = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
+        out = np.maximum(x, y)
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(x),
+            'Y': OpTest.np_dtype_to_fluid_dtype(y)
+        }
+        self.attrs = {}
+        self.outputs = {'Out': out}
+
+    def set_mlu(self):
+        self.__class__.use_mlu = True
+        self.place = paddle.device.MLUPlace(0)
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+
+class TestElementwiseMaxFp16(OpTest):
+
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_max"
+
+        self.init_dtype()
+        np.random.seed(SEED)
+        x = np.random.uniform(1, 2, [3, 4]).astype(self.dtype)
+        y = np.random.uniform(1, 2, [3, 4]).astype(self.dtype)
+        out = np.maximum(x, y)
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(x),
+            'Y': OpTest.np_dtype_to_fluid_dtype(y)
+        }
+        self.attrs = {}
+        self.outputs = {'Out': out}
+
+    def set_mlu(self):
+        self.__class__.use_mlu = True
+        self.__class__.no_need_check_grad = True
+        self.place = paddle.device.MLUPlace(0)
+
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+
+class TestElementwiseMaxInt32(TestElementwiseMax):
+
+    def init_dtype(self):
+        self.dtype = np.int32
+
+
+class TestTestElementwiseMax_Vector(TestElementwiseMax):
+
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_max"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [100]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [100]).astype("float32")
+        }
+        self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])}
+
+
+class TestTestElementwiseMax_broadcast_0(TestElementwiseMax):
+
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_max"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [100, 3, 4]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [100]).astype("float32")
+        }
+
+        self.attrs = {'axis': 0}
+        self.outputs = {
+            'Out': np.maximum(self.inputs['X'],
+                              self.inputs['Y'].reshape(100, 1, 1))
+        }
+
+
+class TestTestElementwiseMax_broadcast_1(TestElementwiseMax):
+
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_max"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 100, 4]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [100]).astype("float32")
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': np.maximum(self.inputs['X'],
+                              self.inputs['Y'].reshape(1, 100, 1))
+        }
+
+
+class TestTestElementwiseMax_broadcast_2(TestElementwiseMax):
+
+    def setUp(self):
+        self.set_mlu()
+        self.op_type = "elementwise_max"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 3, 
100]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [100]).astype("float32") + } + + self.outputs = { + 'Out': np.maximum(self.inputs['X'], + self.inputs['Y'].reshape(1, 1, 100)) + } + + +class TestTestElementwiseMax_broadcast_3(TestElementwiseMax): + + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_max" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 10, 12, 5]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [10, 12]).astype("float32") + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': + np.maximum(self.inputs['X'], self.inputs['Y'].reshape(1, 10, 12, 1)) + } + + +class TestTestElementwiseMax_broadcast_4(TestElementwiseMax): + + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_max" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 50]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [2, 1, 50]).astype("float32") + } + self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])} + + +class TestTestElementwiseMax_broadcast_5(TestElementwiseMax): + + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_max" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 4, 20]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [2, 3, 1, 20]).astype("float32") + } + self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])} + + +class TestTestElementwiseMax_commonuse_1(TestElementwiseMax): + + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_max" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 100]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [1, 1, 100]).astype("float32"), + } + self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])} + + +class TestTestElementwiseMax_commonuse_2(TestElementwiseMax): + + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_max" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [30, 3, 1, 5]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [30, 1, 4, 1]).astype("float32"), + } + self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])} + + +class TestTestElementwiseMax_xsize_lessthan_ysize(TestElementwiseMax): + + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_max" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [10, 12]).astype("float32"), + 'Y': np.random.uniform(0.1, 1, [2, 3, 10, 12]).astype("float32"), + } + + self.attrs = {'axis': 2} + + self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_log_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_log_op_mlu.py new file mode 100644 index 0000000000000..82aeb577205d5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_log_op_mlu.py @@ -0,0 +1,216 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys + +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +class TestActivation(OpTest): + + def setUp(self): + self.set_mlu() + self.op_type = "exp" + self.init_dtype() + self.init_kernel_type() + self.python_api = paddle.exp + + np.random.seed(2049) + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.exp(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def init_dtype(self): + self.dtype = np.float32 + + def init_kernel_type(self): + pass + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + __class__.no_need_check_grad = True + + +class TestLog(TestActivation): + + def setUp(self): + self.set_mlu() + self.op_type = "log" + self.python_api = paddle.log + self.init_dtype() + + np.random.seed(1024) + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.log(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_error(self): + in1 = fluid.layers.data(name="in1", + shape=[11, 17], + append_batch_size=False, + dtype="int32") + in2 = fluid.layers.data(name="in2", + shape=[11, 17], + append_batch_size=False, + dtype="int64") + + self.assertRaises(TypeError, fluid.layers.log, in1) + self.assertRaises(TypeError, fluid.layers.log, in2) + + +class TestLog2(TestActivation): + + def setUp(self): + self.set_mlu() + self.op_type = "log2" + self.python_api = paddle.log2 + self.init_dtype() + + x = np.random.uniform(1, 10, [11, 17]).astype(self.dtype) + out = np.log2(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_error(self): + in1 = paddle.static.data(name="in1", shape=[11, 17], dtype="int32") + in2 = paddle.static.data(name="in2", shape=[11, 17], dtype="int64") + + self.assertRaises(TypeError, paddle.log2, in1) + self.assertRaises(TypeError, paddle.log2, in2) + + def test_api(self): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + input_x = np.random.uniform(0.1, 1, [11, 17]).astype("float32") + data_x = paddle.static.data(name="data_x", + shape=[11, 17], + dtype="float32") + + out1 = paddle.log2(data_x) + exe = paddle.static.Executor(place=fluid.CPUPlace()) + exe.run(paddle.static.default_startup_program()) + res1 = exe.run(paddle.static.default_main_program(), + feed={"data_x": input_x}, + fetch_list=[out1]) + expected_res = np.log2(input_x) + self.assertTrue(np.allclose(res1, expected_res)) + + # dygraph + with fluid.dygraph.guard(): + np_x = np.random.uniform(0.1, 1, [11, 17]).astype("float32") + data_x = paddle.to_tensor(np_x) + z = paddle.log2(data_x) + np_z = z.numpy() + z_expected = np.array(np.log2(np_x)) + np.savetxt("np_z.txt", np_z.flatten(), fmt="%.4f") + np.savetxt("z_expected.txt", z_expected.flatten(), fmt="%.4f") + self.assertTrue(np.allclose(np_z, z_expected, atol=1e-6)) + + +class TestLog10(TestActivation): + + def setUp(self): + self.set_mlu() + self.op_type = "log10" + self.python_api = paddle.log10 + self.init_dtype() + + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.log10(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_error(self): + in1 = paddle.static.data(name="in1", shape=[11, 17], 
dtype="int32") + in2 = paddle.static.data(name="in2", shape=[11, 17], dtype="int64") + + self.assertRaises(TypeError, paddle.log10, in1) + self.assertRaises(TypeError, paddle.log10, in2) + + def test_api(self): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + input_x = np.random.uniform(0.1, 1, [11, 17]).astype("float32") + data_x = paddle.static.data(name="data_x", + shape=[11, 17], + dtype="float32") + + out1 = paddle.log10(data_x) + exe = paddle.static.Executor(place=paddle.CPUPlace()) + exe.run(paddle.static.default_startup_program()) + res1 = exe.run(paddle.static.default_main_program(), + feed={"data_x": input_x}, + fetch_list=[out1]) + expected_res = np.log10(input_x) + self.assertTrue(np.allclose(res1, expected_res)) + + # dygraph + with fluid.dygraph.guard(): + np_x = np.random.uniform(0.1, 1, [11, 17]).astype("float32") + data_x = paddle.to_tensor(np_x) + z = paddle.log10(data_x) + np_z = z.numpy() + z_expected = np.array(np.log10(np_x)) + self.assertTrue(np.allclose(np_z, z_expected)) + + +class TestLogHalf(TestLog): + + def init_dtype(self): + self.dtype = np.float16 + + def test_api(self): + pass + + +class TestLog2Half(TestLog2): + + def init_dtype(self): + self.dtype = np.float16 + + def test_api(self): + pass + + +class TestLog10Half(TestLog10): + + def init_dtype(self): + self.dtype = np.float16 + + def test_api(self): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py index 17ef85dd2bd8a..2efa8823fdaf5 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_lookup_table_v2_op_mlu.py @@ -97,7 +97,6 @@ def init_dtype(self): def set_mlu(self): self.__class__.use_mlu = True self.place = paddle.device.MLUPlace(0) - self.__class__.no_need_check_grad = True class TestLookupTableV2Dim32(TestLookupTableV2): @@ -126,7 +125,6 @@ def init_dims(self): def set_mlu(self): self.__class__.use_mlu = True self.place = paddle.device.MLUPlace(0) - self.__class__.no_need_check_grad = True class TestLookupTableV2WithPadding(TestLookupTableV2): diff --git a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt index 7498fa72194d9..57e52206653c8 100644 --- a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt @@ -7,12 +7,13 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") if(WITH_ASCEND_CL) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) - endforeach(TEST_OP) + endforeach() # NOTE: NPU `get_float_status` read the value from register, During the test, - # it is found that this register will be overwritten by any program on the card. - # In order to prevent the interference of nan/inf in the other unittests, we - # need to set the unittests related to `float_status` to exclusive. + # it is found that this register will be overwritten by any program on the + # card. In order to prevent the interference of nan/inf in the other + # unittests, we need to set the unittests related to `float_status` to + # exclusive. 
set_tests_properties(test_amp_check_finite_and_scale_op_npu PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_flags_check_nan_inf_npu diff --git a/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py index 7febcaba45cb4..d8f442c84411a 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py @@ -193,7 +193,7 @@ def set_attrs(self): } if self.use_variance: self.attrs['variance'] = self.prior_box_var.astype( - np.float).flatten() + np.float64).flatten() if self.axis != 0: self.attrs['axis'] = self.axis diff --git a/python/paddle/fluid/tests/unittests/npu/test_density_prior_box_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_density_prior_box_op_npu.py index 7271644ce8294..14aec76af8b19 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_density_prior_box_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_density_prior_box_op_npu.py @@ -84,7 +84,7 @@ def init_test_params(self): self.batch_size = 10 self.variances = [0.1, 0.1, 0.2, 0.2] - self.variances = np.array(self.variances, dtype=np.float).flatten() + self.variances = np.array(self.variances, dtype=np.float64).flatten() self.clip = True self.num_priors = 0 diff --git a/python/paddle/fluid/tests/unittests/npu/test_fill_zeros_like_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_fill_zeros_like_op_npu.py index f9f338a731079..dc7c6f0096bdf 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_fill_zeros_like_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_fill_zeros_like_op_npu.py @@ -48,7 +48,7 @@ def test_check_output(self): class TestFillZerosLikeOpBool(TestFillZerosLikeOp): def init_dtype(self): - self.dtype = np.bool + self.dtype = np.bool_ class TestFillZerosLikeOpFp16(TestFillZerosLikeOp): diff --git a/python/paddle/fluid/tests/unittests/npu/test_prior_box_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_prior_box_op_npu.py index cfd78c2b05b36..5290ad1c0d020 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_prior_box_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_prior_box_op_npu.py @@ -95,9 +95,9 @@ def init_test_params(self): self.set_min_max_aspect_ratios_order() self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0] self.aspect_ratios = np.array(self.aspect_ratios, - dtype=np.float).flatten() + dtype=np.float64).flatten() self.variances = [0.1, 0.1, 0.2, 0.2] - self.variances = np.array(self.variances, dtype=np.float).flatten() + self.variances = np.array(self.variances, dtype=np.float64).flatten() self.clip = True self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes) diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py index 64f66476542da..5cedd90d2685e 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py @@ -106,7 +106,7 @@ def setUp(self): } self.outputs = { 'Out': - self.inputs['X'].max(axis=tuple(self.attrs['dim'])).astype(np.bool) + self.inputs['X'].max(axis=tuple(self.attrs['dim'])).astype(np.bool_) } diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_min_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_min_op_npu.py index 85d1fe9478140..10aeea4dee5bf 100644 --- 
a/python/paddle/fluid/tests/unittests/npu/test_reduce_min_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_min_op_npu.py @@ -106,7 +106,7 @@ def setUp(self): } self.outputs = { 'Out': - self.inputs['X'].min(axis=tuple(self.attrs['dim'])).astype(np.bool) + self.inputs['X'].min(axis=tuple(self.attrs['dim'])).astype(np.bool_) } diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_prod_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_prod_op_npu.py index c32e105b02ade..9e3bc365cee2e 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_reduce_prod_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_prod_op_npu.py @@ -129,7 +129,8 @@ def setUp(self): self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.BOOL)} self.outputs = { 'Out': - self.inputs['X'].prod(axis=tuple(self.attrs['dim'])).astype(np.bool) + self.inputs['X'].prod(axis=tuple(self.attrs['dim'])).astype( + np.bool_) } diff --git a/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py index 76fc5846534ac..e5fd042674204 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_size_op_npu.py @@ -72,7 +72,7 @@ class TestSizeOp4(TestSizeOp): def config(self): self.shape = [2**10] - self.dtype = np.bool + self.dtype = np.bool_ class TestSizeOp5(TestSizeOp): diff --git a/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py index b3d5fa9a6b5c9..bdc68d43a2241 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py @@ -199,7 +199,7 @@ def test_check_output(self): self.check_output_with_place(self.place) def init_dtype(self): - self.dtype = np.bool + self.dtype = np.bool_ def initTestCase(self): self.real_op_type = np.random.choice(['triu', 'tril']) diff --git a/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_min_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_min_op_npu.py index 21be9e295d2e1..48df4ad454aad 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_min_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_min_op_npu.py @@ -37,7 +37,7 @@ def setUp(self): self.init() fluid.core.globals()['FLAGS_min_loss_scaling'] = 1639 - found_inf = np.array([True], dtype=np.bool) + found_inf = np.array([True], dtype=np.bool_) x = np.random.random((1024, 1024)).astype(self.dtype) i = np.random.randint(0, 1024, 1) j = np.random.randint(0, 1024, 1) diff --git a/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_op_npu.py index 5299369ff1743..678e50247afc8 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_op_npu.py @@ -34,7 +34,7 @@ def setUp(self): self.place = paddle.NPUPlace(0) self.init() - found_inf = np.array([False], dtype=np.bool) + found_inf = np.array([False], dtype=np.bool_) x = np.random.random((1024, 1024)).astype(self.dtype) self.inputs = { @@ -82,7 +82,7 @@ def setUp(self): self.place = paddle.NPUPlace(0) self.init() - found_inf = np.array([True], dtype=np.bool) + found_inf = np.array([True], dtype=np.bool_) x = np.random.random((1024, 1024)).astype(self.dtype) i = 
np.random.randint(0, 1024, 1) j = np.random.randint(0, 1024, 1) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index ded9f188472dd..ba694f5353083 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -471,7 +471,7 @@ def infer_dtype(numpy_dict, dtype_set): np.dtype(np.int16), np.dtype(np.int8), np.dtype(np.uint8), - np.dtype(np.bool) + np.dtype(np.bool_) ] # check the dtype in dtype_list in order, select the first dtype that in dtype_set for dtype in dtype_list: diff --git a/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt index ab985d73d5387..cb566a41aaaab 100755 --- a/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt @@ -8,4 +8,4 @@ foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) list(APPEND TEST_OPS ${TEST_OP}) set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 50) -endforeach(TEST_OP) +endforeach() diff --git a/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt index 35a95749880bd..04773499b3591 100644 --- a/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt @@ -6,7 +6,7 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach(TEST_OP) +endforeach() if(NOT WIN32) set_tests_properties(test_rnn_nets_static PROPERTIES TIMEOUT 120) set_tests_properties(test_rnn_nets PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/sequence/CMakeLists.txt b/python/paddle/fluid/tests/unittests/sequence/CMakeLists.txt index 5c13f56d44646..a3f2059881bb8 100644 --- a/python/paddle/fluid/tests/unittests/sequence/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/sequence/CMakeLists.txt @@ -6,7 +6,7 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach(TEST_OP) +endforeach() set_tests_properties(test_sequence_conv PROPERTIES TIMEOUT 120) set_tests_properties(test_sequence_concat PROPERTIES TIMEOUT 120) set_tests_properties(test_sequence_pool PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py index 908af43e00825..e34da4c45a7c3 100644 --- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py +++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py @@ -20,156 +20,11 @@ import paddle.fluid as fluid from test_dist_base import TestDistRunnerBase, runtime_main import paddle.distributed.fleet as fleet -import paddle.incubate.nn.functional as incubate_f - -from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype -from paddle.fluid.dygraph.layers import Layer -from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid import core -from paddle.nn.initializer import Constant +from paddle.incubate.nn import FusedMultiHeadAttention paddle.enable_static() -def _set_var_distributed(var): - if var is None: - return - - var.is_distributed = True - - # NOTE: use current_block and find_var_recursive to support while_loop - startup_block = paddle.static.default_startup_program().current_block() - main_block = 
paddle.static.default_main_program().current_block() - startup_block._find_var_recursive(var.name).is_distributed = True - main_block._find_var_recursive(var.name).is_distributed = True - - -class ParallelFusedMultiHeadAttention(Layer): - - def __init__(self, - embed_dim, - num_heads, - dropout_rate=0.5, - attn_dropout_rate=0.5, - kdim=None, - vdim=None, - normalize_before=False, - need_weights=False, - qkv_weight_attr=None, - qkv_bias_attr=None, - linear_weight_attr=None, - linear_bias_attr=None, - pre_ln_scale_attr=None, - pre_ln_bias_attr=None, - ln_scale_attr=None, - ln_bias_attr=None, - epsilon=1e-5, - nranks=1, - ring_id=-1, - name=None): - super(ParallelFusedMultiHeadAttention, self).__init__() - - assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " - "but received {}".format(embed_dim)) - assert num_heads > 0, ("Expected nhead to be greater than 0, " - "but received {}".format(num_heads)) - - self.normalize_before = normalize_before - self._dtype = self._helper.get_default_dtype() - self._epsilon = epsilon - self._ring_id = ring_id - - self.embed_dim = embed_dim - self.num_heads = num_heads - self.head_dim = embed_dim // num_heads - self.kdim = kdim - self.vdim = vdim - self.need_weights = need_weights - assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" - assert need_weights == False, "Only support need_weight is False now." - - # tensor model parallel - assert num_heads % nranks == 0 - num_heads = num_heads // nranks - - self.qkv_weight = self.create_parameter( - shape=[3, num_heads, self.head_dim, embed_dim], - attr=qkv_weight_attr, - dtype=self._dtype, - is_bias=False) - self.qkv_bias = self.create_parameter( - shape=[3, num_heads, self.head_dim], - attr=qkv_bias_attr, - dtype=self._dtype, - is_bias=True) - self.linear_weight = self.create_parameter( - shape=[num_heads * self.head_dim, embed_dim], - attr=linear_weight_attr, - dtype=self._dtype, - is_bias=False) - self.linear_bias = self.create_parameter(shape=[embed_dim], - attr=linear_bias_attr, - dtype=self._dtype, - is_bias=True) - - # tensor model parallel - if nranks > 1: - assert ring_id != -1 - # column parallel - _set_var_distributed(self.qkv_weight) - _set_var_distributed(self.qkv_bias) - # row parallel - _set_var_distributed(self.linear_weight) - - if normalize_before: - self.pre_ln_scale = self.create_parameter( - attr=pre_ln_scale_attr, - shape=[embed_dim], - default_initializer=Constant(value=1.0)) - self.pre_ln_bias = self.create_parameter(attr=pre_ln_bias_attr, - shape=[embed_dim], - is_bias=True) - self.ln_scale = None - self.ln_bias = None - else: - self.pre_ln_scale = None - self.pre_ln_bias = None - self.ln_scale = self.create_parameter( - attr=ln_scale_attr, - shape=[embed_dim], - default_initializer=Constant(value=1.0)) - self.ln_bias = self.create_parameter(attr=ln_bias_attr, - shape=[embed_dim], - is_bias=True) - - self.dropout_rate = dropout_rate - self.attn_dropout_rate = attn_dropout_rate - - self.name = name - - def forward(self, query, key=None, value=None, attn_mask=None, cache=None): - out = incubate_f.fused_multi_head_attention( - x=query, - qkv_weight=self.qkv_weight, - linear_weight=self.linear_weight, - pre_layer_norm=self.normalize_before, - pre_ln_scale=self.pre_ln_scale, - pre_ln_bias=self.pre_ln_bias, - ln_scale=self.ln_scale, - ln_bias=self.ln_bias, - pre_ln_epsilon=self._epsilon, - qkv_bias=self.qkv_bias, - linear_bias=self.linear_bias, - attn_mask=attn_mask, - dropout_rate=self.dropout_rate, - attn_dropout_rate=self.attn_dropout_rate, 
- ln_epsilon=self._epsilon, - training=self.training, - ring_id=self._ring_id, - name=self.name) - return out - - def get_param_attr(weight, bias): weight_attr = paddle.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer(weight)) @@ -208,40 +63,40 @@ def create_model(data, rank): qkv_w_attr, qkv_b_attr = get_param_attr(col_qkv_w, col_qkv_b) linear_w_attr, linear_b_attr = get_param_attr(row_linear_w, linear_b) - attn = ParallelFusedMultiHeadAttention(hidden, - n_head, - dropout_rate=0.0, - attn_dropout_rate=0.0, - normalize_before=False, - qkv_weight_attr=qkv_w_attr, - qkv_bias_attr=qkv_b_attr, - linear_weight_attr=linear_w_attr, - linear_bias_attr=linear_b_attr, - pre_ln_scale_attr=pre_ln_w_attr, - pre_ln_bias_attr=pre_ln_b_attr, - ln_scale_attr=pre_ln_w_attr, - ln_bias_attr=pre_ln_b_attr, - nranks=MODEL_PARALLEL_SIZE, - ring_id=0) + attn = FusedMultiHeadAttention(hidden, + n_head, + dropout_rate=0.0, + attn_dropout_rate=0.0, + normalize_before=False, + qkv_weight_attr=qkv_w_attr, + qkv_bias_attr=qkv_b_attr, + linear_weight_attr=linear_w_attr, + linear_bias_attr=linear_b_attr, + pre_ln_scale_attr=pre_ln_w_attr, + pre_ln_bias_attr=pre_ln_b_attr, + ln_scale_attr=pre_ln_w_attr, + ln_bias_attr=pre_ln_b_attr, + nranks=MODEL_PARALLEL_SIZE, + ring_id=0) result = attn(data) else: pre_ln_w_attr, pre_ln_b_attr = get_param_attr(pre_ln_w, pre_ln_b) qkv_w_attr, qkv_b_attr = get_param_attr(qkv_w, qkv_b) linear_w_attr, linear_b_attr = get_param_attr(linear_w, linear_b) - attn = ParallelFusedMultiHeadAttention(hidden, - n_head, - dropout_rate=0.0, - attn_dropout_rate=0.0, - normalize_before=False, - qkv_weight_attr=qkv_w_attr, - qkv_bias_attr=qkv_b_attr, - linear_weight_attr=linear_w_attr, - linear_bias_attr=linear_b_attr, - pre_ln_scale_attr=pre_ln_w_attr, - pre_ln_bias_attr=pre_ln_b_attr, - ln_scale_attr=pre_ln_w_attr, - ln_bias_attr=pre_ln_b_attr) + attn = FusedMultiHeadAttention(hidden, + n_head, + dropout_rate=0.0, + attn_dropout_rate=0.0, + normalize_before=False, + qkv_weight_attr=qkv_w_attr, + qkv_bias_attr=qkv_b_attr, + linear_weight_attr=linear_w_attr, + linear_bias_attr=linear_b_attr, + pre_ln_scale_attr=pre_ln_w_attr, + pre_ln_bias_attr=pre_ln_b_attr, + ln_scale_attr=pre_ln_w_attr, + ln_bias_attr=pre_ln_b_attr) result = attn(data) predict = paddle.sum(result) diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py index a5af3cd877c53..d2144e201dc22 100644 --- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py +++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py @@ -20,11 +20,7 @@ import paddle.fluid as fluid from test_dist_base import TestDistRunnerBase, runtime_main import paddle.distributed.fleet as fleet - -from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype -from paddle.fluid.dygraph.layers import Layer -from paddle.fluid.layer_helper import LayerHelper -from paddle.nn.initializer import Constant +from paddle.incubate.nn import FusedFeedForward paddle.enable_static() @@ -34,239 +30,6 @@ OUT_SIZE = 2 * MODEL_PARALLEL_SIZE -def fused_feedforward(x, - linear1_weight, - linear2_weight, - linear1_bias=None, - linear2_bias=None, - ln1_scale=None, - ln1_bias=None, - ln2_scale=None, - ln2_bias=None, - dropout1_rate=0.5, - dropout2_rate=0.5, - activation="relu", - ln1_epsilon=1e-5, - ln2_epsilon=1e-5, - pre_layer_norm=False, - training=True, - mode='upscale_in_train', - ring_id=-1, - 
name=None): - seed = None - if mode not in ('downscale_in_infer', 'upscale_in_train'): - raise ValueError( - "mode argument should be 'downscale_in_infer' or 'upscale_in_train'" - ) - mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer - - helper = LayerHelper("fused_feedforward") - dtype = x.dtype - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'fused_feedforward') - check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], - 'fused_feedforward') - - out = helper.create_variable_for_type_inference(x.dtype) - dropout1_mask = helper.create_variable_for_type_inference( - 'uint8', stop_gradient=True) - dropout2_mask = helper.create_variable_for_type_inference( - 'uint8', stop_gradient=True) - ln1_mean = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - ln1_variance = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - ln2_mean = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - ln2_variance = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - linear1_out = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - ln1_out = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - dropout1_out = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - dropout2_out = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - - if (seed is None or seed == 0) and helper.main_program.random_seed != 0: - seed = helper.main_program.random_seed - - helper.append_op(type='fused_feedforward', - inputs={ - 'X': x, - 'Linear1Weight': linear1_weight, - 'Linear1Bias': linear1_bias, - 'Linear2Weight': linear2_weight, - 'Linear2Bias': linear2_bias, - 'Ln1Scale': ln1_scale, - 'Ln1Bias': ln1_bias, - 'Ln2Scale': ln2_scale, - 'Ln2Bias': ln2_bias, - }, - outputs={ - 'Out': out, - 'Dropout1Mask': dropout1_mask, - 'Dropout2Mask': dropout2_mask, - 'Ln1Mean': ln1_mean, - 'Ln1Variance': ln1_variance, - 'Ln2Mean': ln2_mean, - 'Ln2Variance': ln2_variance, - 'Linear1Out': linear1_out, - 'Ln1Out': ln1_out, - 'Dropout1Out': dropout1_out, - 'Dropout2Out': dropout2_out, - }, - attrs={ - 'dropout1_rate': dropout1_rate, - 'dropout2_rate': dropout2_rate, - 'act_method': activation, - 'pre_layer_norm': pre_layer_norm, - 'ln1_epsilon': ln1_epsilon, - 'ln2_epsilon': ln2_epsilon, - 'dropout1_is_test': not training, - 'dropout2_is_test': not training, - 'dropout1_fix_seed': seed is not None, - 'dropout2_fix_seed': seed is not None, - 'dropout1_seed': seed if seed is not None else 0, - 'dropout2_seed': seed if seed is not None else 0, - 'dropout1_implementation': mode, - 'dropout2_implementation': mode, - 'ring_id': ring_id, - }) - return out - - -def _set_var_distributed(var): - if var is None: - return - - var.is_distributed = True - - # NOTE: use current_block and find_var_recursive to support while_loop - startup_block = paddle.static.default_startup_program().current_block() - main_block = paddle.static.default_main_program().current_block() - startup_block._find_var_recursive(var.name).is_distributed = True - main_block._find_var_recursive(var.name).is_distributed = True - - -class ParallelFusedFeedForward(Layer): - - def __init__(self, - d_model, - dim_feedforward, - dropout_rate=0.1, - epsilon=1e-05, - activation="relu", - act_dropout_rate=None, - normalize_before=False, - linear1_weight_attr=None, - linear1_bias_attr=None, - linear2_weight_attr=None, - linear2_bias_attr=None, - ln1_scale_attr=None, 
- ln1_bias_attr=None, - ln2_scale_attr=None, - ln2_bias_attr=None, - nranks=1, - ring_id=-1, - name=None): - super(ParallelFusedFeedForward, self).__init__() - assert d_model > 0, ( - "Expected d_model to be greater than 0, but received {}".format( - d_model)) - assert dim_feedforward > 0, ( - "Expected dim_feedforward to be greater than 0, but received {}". - format(dim_feedforward)) - - self._dtype = self._helper.get_default_dtype() - self._d_model = d_model - - assert dim_feedforward % nranks == 0 - dim_feedforward = dim_feedforward // nranks - self._dim_feedforward = dim_feedforward - self._dropout_rate = dropout_rate - self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate - self._act_method = activation - self._normalize_before = normalize_before - self._epsilon = epsilon - self._ring_id = ring_id - - self._linear1_weight = self.create_parameter( - shape=[d_model, dim_feedforward], - attr=linear1_weight_attr, - dtype=self._dtype, - is_bias=False) - self._linear1_bias = self.create_parameter(shape=[dim_feedforward], - attr=linear1_bias_attr, - dtype=self._dtype, - is_bias=True) - - self._linear2_weight = self.create_parameter( - shape=[dim_feedforward, d_model], - attr=linear2_weight_attr, - dtype=self._dtype, - is_bias=False) - - self._linear2_bias = self.create_parameter(shape=[d_model], - attr=linear2_bias_attr, - dtype=self._dtype, - is_bias=True) - - if nranks > 1: - assert ring_id != -1 - # column parallel - _set_var_distributed(self._linear1_weight) - _set_var_distributed(self._linear1_bias) - _set_var_distributed(self._linear2_weight) - - if normalize_before: - self._ln1_scale = self.create_parameter( - shape=[d_model], - attr=ln1_scale_attr, - is_bias=False, - default_initializer=Constant(1.0)) - self._ln1_bias = self.create_parameter(shape=[d_model], - attr=ln1_bias_attr, - is_bias=True) - self._ln2_scale = None - self._ln2_bias = None - else: - self._ln1_bias = None - self._ln2_bias = None - self._ln2_scale = self.create_parameter( - shape=[d_model], - attr=ln2_scale_attr, - is_bias=False, - default_initializer=Constant(1.0)) - self._ln2_bias = self.create_parameter(shape=[d_model], - attr=ln2_bias_attr, - is_bias=True) - - self.name = name - - def forward(self, src, cache=None): - out = fused_feedforward(src, - self._linear1_weight, - self._linear2_weight, - self._linear1_bias, - self._linear2_bias, - self._ln1_scale, - self._ln1_bias, - self._ln2_scale, - self._ln2_bias, - dropout1_rate=self._act_dropout_rate, - dropout2_rate=self._dropout_rate, - activation=self._act_method, - ln1_epsilon=self._epsilon, - ln2_epsilon=self._epsilon, - pre_layer_norm=self._normalize_before, - training=self.training, - ring_id=self._ring_id, - name=self.name) - return out - - def get_param_attr(weight, bias): weight_attr = paddle.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer(weight)) @@ -295,19 +58,19 @@ def create_model(data, rank): w0_attr, b0_attr = get_param_attr(col_w0, col_b0) w1_attr, b1_attr = get_param_attr(row_w1, b1) - ffn = ParallelFusedFeedForward(IN_SIZE, - OUT_SIZE, - dropout_rate=0.0, - activation='gelu', - normalize_before=True, - linear1_weight_attr=w0_attr, - linear1_bias_attr=b0_attr, - linear2_weight_attr=w1_attr, - linear2_bias_attr=b1_attr, - ln1_scale_attr=ln_w_attr, - ln1_bias_attr=ln_b_attr, - nranks=MODEL_PARALLEL_SIZE, - ring_id=0) + ffn = FusedFeedForward(IN_SIZE, + OUT_SIZE, + dropout_rate=0.0, + activation='gelu', + normalize_before=True, + linear1_weight_attr=w0_attr, + linear1_bias_attr=b0_attr, + 
linear2_weight_attr=w1_attr, + linear2_bias_attr=b1_attr, + ln1_scale_attr=ln_w_attr, + ln1_bias_attr=ln_b_attr, + nranks=MODEL_PARALLEL_SIZE, + ring_id=0) #ffn.eval() result = ffn(data) else: @@ -315,17 +78,17 @@ def create_model(data, rank): w0_attr, b0_attr = get_param_attr(w0, b0) w1_attr, b1_attr = get_param_attr(w1, b1) - ffn = ParallelFusedFeedForward(IN_SIZE, - OUT_SIZE, - dropout_rate=0.0, - activation='gelu', - normalize_before=True, - linear1_weight_attr=w0_attr, - linear1_bias_attr=b0_attr, - linear2_weight_attr=w1_attr, - linear2_bias_attr=b1_attr, - ln1_scale_attr=ln_w_attr, - ln1_bias_attr=ln_b_attr) + ffn = FusedFeedForward(IN_SIZE, + OUT_SIZE, + dropout_rate=0.0, + activation='gelu', + normalize_before=True, + linear1_weight_attr=w0_attr, + linear1_bias_attr=b0_attr, + linear2_weight_attr=w1_attr, + linear2_bias_attr=b1_attr, + ln1_scale_attr=ln_w_attr, + ln1_bias_attr=ln_b_attr) #ffn.eval() result = ffn(data) diff --git a/python/paddle/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py index c35d7940a8a1c..31afb85750e8c 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_op.py @@ -145,7 +145,7 @@ def test_assign_LoDTensorArray(self): def test_assign_NumpyArray(self): with fluid.dygraph.guard(): - array = np.random.random(size=(100, 10)).astype(np.bool) + array = np.random.random(size=(100, 10)).astype(np.bool_) result1 = paddle.zeros(shape=[3, 3], dtype='float32') paddle.assign(array, result1) self.assertTrue(np.allclose(result1.numpy(), array)) diff --git a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py index b99892c65e19f..873684576326a 100644 --- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py +++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py @@ -35,7 +35,7 @@ def bipartite_match(distance, match_indices, match_dist): match_sorted = sorted(match_pair, key=lambda tup: tup[2], reverse=True) - row_indices = -1 * np.ones((row, ), dtype=np.int) + row_indices = -1 * np.ones((row, ), dtype=np.int_) idx = 0 for i, j, dist in match_sorted: @@ -69,7 +69,7 @@ def batch_bipartite_match(distance, lod, match_type=None, dist_threshold=None): """ n = len(lod) m = distance.shape[1] - match_indices = -1 * np.ones((n, m), dtype=np.int) + match_indices = -1 * np.ones((n, m), dtype=np.int_) match_dist = np.zeros((n, m), dtype=np.float32) cur_offset = 0 for i in range(n): diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py index 63df37f912259..ee064963b2f22 100644 --- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py +++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py @@ -235,7 +235,7 @@ def setUp(self): self.attrs = { 'code_type': 'decode_center_size', 'box_normalized': False, - 'variance': prior_box_var.astype(np.float).flatten(), + 'variance': prior_box_var.astype(np.float64).flatten(), 'axis': axis } self.outputs = {'OutputBox': output_box} diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py index 06432e4b00720..a893b65f5a421 100755 --- a/python/paddle/fluid/tests/unittests/test_compare_op.py +++ b/python/paddle/fluid/tests/unittests/test_compare_op.py @@ -249,8 +249,8 @@ def test_bool_api_4(self): op = eval("paddle.%s" % (self.op_type)) out = op(x, y) exe = paddle.static.Executor(self.place) - 
input_x = np.array([True, False, True]).astype(np.bool) - input_y = np.array([True, True, False]).astype(np.bool) + input_x = np.array([True, False, True]).astype(np.bool_) + input_y = np.array([True, True, False]).astype(np.bool_) real_result = callback(input_x, input_y) res, = exe.run(feed={ "x": input_x, @@ -267,8 +267,8 @@ def test_bool_broadcast_api_4(self): op = eval("paddle.%s" % (self.op_type)) out = op(x, y) exe = paddle.static.Executor(self.place) - input_x = np.array([True, False, True]).astype(np.bool) - input_y = np.array([True]).astype(np.bool) + input_x = np.array([True, False, True]).astype(np.bool_) + input_y = np.array([True]).astype(np.bool_) real_result = callback(input_x, input_y) res, = exe.run(feed={ "x": input_x, diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static_run.py b/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static_run.py new file mode 100644 index 0000000000000..445211d35a1d9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static_run.py @@ -0,0 +1,127 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import unittest +import numpy as np +from paddle.device.cuda.graphs import wrap_cuda_graph, is_cuda_graph_supported, cuda_graph_transform + +paddle.enable_static() + + +class SimpleModel(nn.Layer): + + def __init__(self, in_size, out_size): + super(SimpleModel, self).__init__() + self.linear = nn.Linear(in_size, out_size) + self.dropout_1 = paddle.nn.Dropout(0.1) + self.relu = nn.ReLU() + self.dropout_2 = paddle.nn.Dropout(0.5) + self.gelu = nn.GELU() + + def forward(self, x): + x = self.linear(x) + x = self.dropout_1(x) + x = self.relu(x) + x = self.dropout_2(x) + x = self.gelu(x) + return x + + +class TestCudaGraphAttrAll(unittest.TestCase): + + def setUp(self): + paddle.set_flags({'FLAGS_eager_delete_tensor_gb': 0.0}) + + def get_model(self, use_cuda_graph=False): + x = paddle.static.data(shape=[3, 10], dtype='float32', name='x') + + model_start = SimpleModel(10, 20) + if use_cuda_graph: + model_start = wrap_cuda_graph(model_start) + + model_inter = SimpleModel(20, 20) + + model_end = SimpleModel(20, 10) + if use_cuda_graph: + model_end = wrap_cuda_graph(model_end, memory_pool='new') + + start_out = model_start(x) + inter_out = model_inter(start_out) + end_out = model_end(inter_out) + loss = paddle.mean(end_out) + + opt = paddle.optimizer.SGD() + opt.minimize(loss) + + return loss + + def run_with_cuda_graph(self, x_data): + # run with cuda graph + paddle.seed(1024) + + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + + with paddle.static.program_guard(main_prog, start_prog): + loss = self.get_model(use_cuda_graph=True) + + section_programs = cuda_graph_transform(main_prog) + assert len(section_programs) == 4 + + block = main_prog.global_block() + run_program_op_num = 0 + for op in block.ops: + if op.type == 'run_program': + 
run_program_op_num += 1 + assert run_program_op_num == 4 + + exe = paddle.static.Executor(paddle.CUDAPlace(0)) + exe.run(start_prog) + + for i in range(10): + rst = exe.run(main_prog, feed={'x': x_data}, fetch_list=[loss]) + + return rst + + def normal_run(self, x_data): + # run without cuda graph + paddle.seed(1024) + + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + + with paddle.static.program_guard(main_prog, start_prog): + loss = self.get_model() + + exe = paddle.static.Executor(paddle.CUDAPlace(0)) + exe.run(start_prog) + + for i in range(10): + rst = exe.run(main_prog, feed={'x': x_data}, fetch_list=[loss]) + + return rst + + def test_static_mode_cuda_graph(self): + if not is_cuda_graph_supported(): + return + x_data = np.random.random((3, 10)).astype('float32') + cuda_graph_rst = self.run_with_cuda_graph(x_data) + normal_run_rst = self.normal_run(x_data) + assert np.array_equal(cuda_graph_rst, normal_run_rst) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py index c00e75882943f..4b5d283aa512a 100644 --- a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py @@ -70,7 +70,7 @@ def init_test_params(self): self.batch_size = 10 self.variances = [0.1, 0.1, 0.2, 0.2] - self.variances = np.array(self.variances, dtype=np.float).flatten() + self.variances = np.array(self.variances, dtype=np.float64).flatten() self.clip = True self.num_priors = 0 diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py index 727fcb28cc211..4cafd19d913b3 100644 --- a/python/paddle/fluid/tests/unittests/test_directory_migration.py +++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py @@ -17,14 +17,22 @@ import os import sys import time +import tempfile import subprocess import unittest + import numpy as np import paddle class TestDirectory(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def get_import_command(self, module): paths = module.split('.') if len(paths) == 1: @@ -77,7 +85,7 @@ def test_new_directory(self): 'paddle.static.nn.spectral_norm', 'paddle.static.nn.embedding' ] - import_file = 'run_import_modules.py' + import_file = os.path.join(self.temp_dir.name, 'run_import_modules.py') with open(import_file, "w") as wb: for module in new_directory: @@ -137,7 +145,8 @@ def test_old_directory(self): 'paddle.declarative.spectral_norm', 'paddle.declarative.embedding' ] - import_file = 'run_old_import_modules.py' + import_file = os.path.join(self.temp_dir.name, + 'run_old_import_modules.py') with open(import_file, "w") as wb: cmd_context_count = """ diff --git a/python/paddle/fluid/tests/unittests/test_einsum_op.py b/python/paddle/fluid/tests/unittests/test_einsum_op.py index c36950b6922fe..e34d04be927cc 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum_op.py +++ b/python/paddle/fluid/tests/unittests/test_einsum_op.py @@ -39,7 +39,9 @@ def setUp(self): 'Out': out, "InnerCache": [('cache_' + str(i), np.array([1.0])) - for i in range(len(self.operands))] + for i in range(len(self.operands))], + "XShape": [('xshape_' + str(i), np.array([1.0])) + for i in range(len(self.operands))], } def init_input(self): @@ -48,14 +50,13 @@ def init_input(self): 
self.inputs.append(np.random.random(s).astype(t)) def set_mandatory(self): - self.disable = False self.shapes = [(10, 10, 20), (20, 6)] self.types = [np.float64, np.float64] self.equation = "mij,jk->ki" def test_check_output(self): if not self.disable: - self.check_output(no_check_set=["InnerCache"]) + self.check_output(no_check_set=["InnerCache", "XShape"]) def test_grad(self): if not self.disable: diff --git a/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py index 29bfca4dd786b..2482ab0c549db 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py @@ -27,7 +27,7 @@ def compare(ref, res, atol, rtol): ref = np.array(ref).flatten() res = np.array(res).flatten() - tmp_ref = ref.astype(np.float) + tmp_ref = ref.astype(np.float64) tol = atol + rtol * abs(tmp_ref) diff = abs(res - ref) diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py index 89689942a0274..bf8983eee842f 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py +++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op_api.py @@ -83,7 +83,7 @@ def compute_reference(pre_layer_norm, query, attn_mask, ln_scale, ln_bias, if ln_bias is None: has_bias = False - if (pre_layer_norm): + if pre_layer_norm: ln_out = layer_norm(query, True, has_bias, ln_scale, ln_bias) num_head = qkv_weight.shape[1] @@ -97,7 +97,7 @@ def compute_reference(pre_layer_norm, query, attn_mask, ln_scale, ln_bias, if qkv_bias is not None: qkv_bias = qkv_bias.reshape(qkv_bias.shape[0] * qkv_bias.shape[1] * qkv_bias.shape[2]) - if (pre_layer_norm): + if pre_layer_norm: ln_out = ln_out.reshape(batch_size * seq_len, embed_dim) qkv = fc(ln_out, qkv_weight) if qkv_bias is not None: @@ -239,12 +239,12 @@ def run_imperative(self): attn_mask_tensor = paddle.to_tensor(self.attn_mask) else: attn_mask_tensor = None - fused_attn = FusedMultiHeadAttention(self.embed_dim, self.num_heads, - self.dropout_prob, - self.attn_dropout_prob, self.kdim, - self.vdim, self.pre_layer_norm, - self.need_weight, self.weight_attr, - self.bias_attr) + fused_attn = FusedMultiHeadAttention( + self.embed_dim, self.num_heads, self.dropout_prob, + self.attn_dropout_prob, self.kdim, self.vdim, self.pre_layer_norm, + self.need_weight, self.weight_attr, self.bias_attr, + self.weight_attr, self.bias_attr, self.weight_attr, self.bias_attr, + self.weight_attr, self.bias_attr) if self.bias_attr is not False: qkv_bias = np.random.random( fused_attn.qkv_bias.shape).astype('float32') @@ -260,13 +260,19 @@ def run_imperative(self): if self.bias_attr is not False: fused_attn_qkv_bias = fused_attn.qkv_bias.numpy() fused_attn_linear_bias = fused_attn.linear_bias.numpy() - fused_attn_pre_ln_bias = fused_attn.pre_ln_bias.numpy() - fused_attn_ln_bias = fused_attn.ln_bias.numpy() + if self.pre_layer_norm: + fused_attn_pre_ln_bias = fused_attn.pre_ln_bias.numpy() + fused_attn_ln_bias = None + else: + fused_attn_pre_ln_bias = None + fused_attn_ln_bias = fused_attn.ln_bias.numpy() ref_out = compute_reference( self.pre_layer_norm, self.query, self.attn_mask, - fused_attn.pre_ln_scale.numpy(), fused_attn_pre_ln_bias, - fused_attn.ln_scale.numpy(), fused_attn_ln_bias, + fused_attn.pre_ln_scale.numpy() if self.pre_layer_norm else None, + fused_attn_pre_ln_bias, + fused_attn.ln_scale.numpy() if not 
self.pre_layer_norm else None, + fused_attn_ln_bias, fused_attn.qkv_weight.numpy(), fused_attn_qkv_bias, fused_attn.linear_weight.numpy(), fused_attn_linear_bias) np.testing.assert_allclose(ref_out, @@ -275,12 +281,12 @@ def run_imperative(self): atol=self.atol) def run_static(self): - fused_attn = FusedMultiHeadAttention(self.embed_dim, self.num_heads, - self.dropout_prob, - self.attn_dropout_prob, self.kdim, - self.vdim, self.pre_layer_norm, - self.need_weight, self.weight_attr, - self.bias_attr) + fused_attn = FusedMultiHeadAttention( + self.embed_dim, self.num_heads, self.dropout_prob, + self.attn_dropout_prob, self.kdim, self.vdim, self.pre_layer_norm, + self.need_weight, self.weight_attr, self.bias_attr, + self.weight_attr, self.bias_attr, self.weight_attr, self.bias_attr, + self.weight_attr, self.bias_attr) x = paddle.static.data( name='X', @@ -304,58 +310,118 @@ def run_static(self): qkv_bias = None linear_bias = None + ln_scale = None + ln_2_scale = None ln_bias = None ln_2_bias = None if self.has_attn_mask: if self.bias_attr is False: - out, qkv_weight, out_linear_weight, ln_scale, ln_2_scale = exe.run( - paddle.static.default_main_program(), - feed={ - "X": self.query, - "SrcMask": self.attn_mask - }, - fetch_list=[ - final_out, fused_attn.qkv_weight, - fused_attn.linear_weight, fused_attn.pre_ln_scale, - fused_attn.ln_scale - ]) + if self.pre_layer_norm: + out, qkv_weight, out_linear_weight, ln_scale = exe.run( + paddle.static.default_main_program(), + feed={ + "X": self.query, + "SrcMask": self.attn_mask + }, + fetch_list=[ + final_out, + fused_attn.qkv_weight, + fused_attn.linear_weight, + fused_attn.pre_ln_scale, + ]) + else: + out, qkv_weight, out_linear_weight, ln_2_scale = exe.run( + paddle.static.default_main_program(), + feed={ + "X": self.query, + "SrcMask": self.attn_mask + }, + fetch_list=[ + final_out, fused_attn.qkv_weight, + fused_attn.linear_weight, fused_attn.ln_scale + ]) else: - out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias = exe.run( - paddle.static.default_main_program(), - feed={ - "X": self.query, - "SrcMask": self.attn_mask - }, - fetch_list=[ - final_out, fused_attn.qkv_weight, fused_attn.qkv_bias, - fused_attn.linear_weight, fused_attn.linear_bias, - fused_attn.pre_ln_scale, fused_attn.pre_ln_bias, - fused_attn.ln_scale, fused_attn.ln_bias - ]) + if self.pre_layer_norm: + out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias = exe.run( + paddle.static.default_main_program(), + feed={ + "X": self.query, + "SrcMask": self.attn_mask + }, + fetch_list=[ + final_out, + fused_attn.qkv_weight, + fused_attn.qkv_bias, + fused_attn.linear_weight, + fused_attn.linear_bias, + fused_attn.pre_ln_scale, + fused_attn.pre_ln_bias, + ]) + else: + out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_2_scale, ln_2_bias = exe.run( + paddle.static.default_main_program(), + feed={ + "X": self.query, + "SrcMask": self.attn_mask + }, + fetch_list=[ + final_out, fused_attn.qkv_weight, + fused_attn.qkv_bias, fused_attn.linear_weight, + fused_attn.linear_bias, fused_attn.ln_scale, + fused_attn.ln_bias + ]) else: if self.bias_attr is False: - out, qkv_weight, out_linear_weight, ln_scale, ln_2_scale = exe.run( - paddle.static.default_main_program(), - feed={ - "X": self.query, - }, - fetch_list=[ - final_out, fused_attn.qkv_weight, - fused_attn.linear_weight, fused_attn.pre_ln_scale, - fused_attn.ln_scale - ]) + if self.pre_layer_norm: + out, qkv_weight, out_linear_weight, ln_scale = exe.run( + 
paddle.static.default_main_program(), + feed={ + "X": self.query, + }, + fetch_list=[ + final_out, + fused_attn.qkv_weight, + fused_attn.linear_weight, + fused_attn.pre_ln_scale, + ]) + else: + out, qkv_weight, out_linear_weight, ln_2_scale = exe.run( + paddle.static.default_main_program(), + feed={ + "X": self.query, + }, + fetch_list=[ + final_out, fused_attn.qkv_weight, + fused_attn.linear_weight, fused_attn.ln_scale + ]) else: - out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias = exe.run( - paddle.static.default_main_program(), - feed={ - "X": self.query, - }, - fetch_list=[ - final_out, fused_attn.qkv_weight, fused_attn.qkv_bias, - fused_attn.linear_weight, fused_attn.linear_bias, - fused_attn.pre_ln_scale, fused_attn.pre_ln_bias, - fused_attn.ln_scale, fused_attn.ln_bias - ]) + if self.pre_layer_norm: + out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias = exe.run( + paddle.static.default_main_program(), + feed={ + "X": self.query, + }, + fetch_list=[ + final_out, + fused_attn.qkv_weight, + fused_attn.qkv_bias, + fused_attn.linear_weight, + fused_attn.linear_bias, + fused_attn.pre_ln_scale, + fused_attn.pre_ln_bias, + ]) + else: + out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_2_scale, ln_2_bias = exe.run( + paddle.static.default_main_program(), + feed={ + "X": self.query, + }, + fetch_list=[ + final_out, fused_attn.qkv_weight, + fused_attn.qkv_bias, fused_attn.linear_weight, + fused_attn.linear_bias, fused_attn.ln_scale, + fused_attn.ln_bias + ]) return out, qkv_weight, qkv_bias, out_linear_weight, linear_bias, ln_scale, ln_bias, ln_2_scale, ln_2_bias def test_static_api(self): diff --git a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py index ffe6fa8d41aa0..ecfc8a5bc292c 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_op.py @@ -164,7 +164,7 @@ def generate_input_data(self): self.attn_mask = (self.attn_mask - 1.0) * 1e4 else: self.attn_mask = (np.tril(self.attn_mask) - 1.0) * 1e4 - elif self.attn_mask_type == np.bool: + elif self.attn_mask_type == np.bool_: if self.has_cache_kv and not self.gen_cache_kv: self.attn_mask[:, :, :, -2] = 0 else: @@ -395,7 +395,7 @@ def GetFusedMultiTransformerOut(self): epsilon = 1e-05 ln2_epsilon = 1e-05 - if attn_mask is not None and self.attn_mask_type != np.bool: + if attn_mask is not None and self.attn_mask_type != np.bool_: attn_mask = _convert_attention_mask(attn_mask, x.dtype) qkv_weights, qkv_biases = [], [] diff --git a/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py index 8414cd941c207..a86fb0fc4596c 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py @@ -79,8 +79,8 @@ def poly2mask(xy, k, h, w): u.extend([int(xs + s * t + .5) for t in ts]) k = len(u) - x = np.zeros((k), np.int) - y = np.zeros((k), np.int) + x = np.zeros((k), np.int_) + y = np.zeros((k), np.int_) m = 0 for j in six.moves.xrange(1, k): if u[j] != u[j - 1]: @@ -116,7 +116,7 @@ def poly2mask(xy, k, h, w): b[m - 1] += a[j] j += 1 mask = decode(b, m) - mask = np.array(mask, dtype=np.int).reshape((w, h)) + mask = np.array(mask, dtype=np.int_).reshape((w, h)) mask = mask.transpose((1, 0)) return mask diff 
--git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py index 822a0fcc449dd..3da576045c587 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py @@ -26,13 +26,6 @@ from paddle.fluid.dygraph import Linear from paddle.fluid.framework import _test_eager_guard -# Can use Amusic dataset as the DeepCF describes. -DATA_PATH = os.environ.get('DATA_PATH', '') - -BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 128)) -NUM_BATCHES = int(os.environ.get('NUM_BATCHES', 5)) -NUM_EPOCHES = int(os.environ.get('NUM_EPOCHES', 1)) - class DMF(fluid.Layer): @@ -129,84 +122,90 @@ def forward(self, users, items): return prediction -def get_data(): - user_ids = [] - item_ids = [] - labels = [] - NUM_USERS = 100 - NUM_ITEMS = 1000 - matrix = np.zeros([NUM_USERS, NUM_ITEMS], dtype=np.float32) +class TestDygraphDeepCF(unittest.TestCase): - for uid in range(NUM_USERS): - for iid in range(NUM_ITEMS): - label = float(random.randint(1, 6) == 1) + def setUp(self): + # Can use Amusic dataset as the DeepCF describes. + self.data_path = os.environ.get('DATA_PATH', '') + + self.batch_size = int(os.environ.get('BATCH_SIZE', 128)) + self.num_batches = int(os.environ.get('NUM_BATCHES', 5)) + self.num_epoches = int(os.environ.get('NUM_EPOCHES', 1)) + + def get_data(self): + user_ids = [] + item_ids = [] + labels = [] + NUM_USERS = 100 + NUM_ITEMS = 1000 + matrix = np.zeros([NUM_USERS, NUM_ITEMS], dtype=np.float32) + + for uid in range(NUM_USERS): + for iid in range(NUM_ITEMS): + label = float(random.randint(1, 6) == 1) + user_ids.append(uid) + item_ids.append(iid) + labels.append(label) + matrix[uid, iid] = label + indices = np.arange(len(user_ids)) + np.random.shuffle(indices) + users_np = np.array(user_ids, dtype=np.int32)[indices] + items_np = np.array(item_ids, dtype=np.int32)[indices] + labels_np = np.array(labels, dtype=np.float32)[indices] + return np.expand_dims(users_np, -1), \ + np.expand_dims(items_np, -1), \ + np.expand_dims(labels_np, -1), NUM_USERS, NUM_ITEMS, matrix + + def load_data(self): + sys.stderr.write('loading from %s\n' % self.data_path) + likes = dict() + num_users = -1 + num_items = -1 + with open(self.data_path, 'r') as f: + for l in f.readlines(): + uid, iid, rating = [int(v) for v in l.split('\t')] + num_users = max(num_users, uid + 1) + num_items = max(num_items, iid + 1) + if float(rating) > 0.0: + likes[(uid, iid)] = 1.0 + + user_ids = [] + item_ids = [] + labels = [] + matrix = np.zeros([num_users, num_items], dtype=np.float32) + for uid, iid in likes.keys(): user_ids.append(uid) item_ids.append(iid) - labels.append(label) - matrix[uid, iid] = label - indices = np.arange(len(user_ids)) - np.random.shuffle(indices) - users_np = np.array(user_ids, dtype=np.int32)[indices] - items_np = np.array(item_ids, dtype=np.int32)[indices] - labels_np = np.array(labels, dtype=np.float32)[indices] - return np.expand_dims(users_np, -1), \ - np.expand_dims(items_np, -1), \ - np.expand_dims(labels_np, -1), NUM_USERS, NUM_ITEMS, matrix - - -def load_data(DATA_PATH): - sys.stderr.write('loading from %s\n' % DATA_PATH) - likes = dict() - num_users = -1 - num_items = -1 - with open(DATA_PATH, 'r') as f: - for l in f.readlines(): - uid, iid, rating = [int(v) for v in l.split('\t')] - num_users = max(num_users, uid + 1) - num_items = max(num_items, iid + 1) - if float(rating) > 0.0: - likes[(uid, iid)] = 1.0 - - user_ids = [] - item_ids = [] - labels = [] - 
matrix = np.zeros([num_users, num_items], dtype=np.float32) - for uid, iid in likes.keys(): - user_ids.append(uid) - item_ids.append(iid) - labels.append(1.0) - matrix[uid, iid] = 1.0 - - negative = 0 - while negative < 3: - nuid = random.randint(0, num_users - 1) - niid = random.randint(0, num_items - 1) - if (nuid, niid) not in likes: - negative += 1 - user_ids.append(nuid) - item_ids.append(niid) - labels.append(0.0) - - indices = np.arange(len(user_ids)) - np.random.shuffle(indices) - users_np = np.array(user_ids, dtype=np.int32)[indices] - items_np = np.array(item_ids, dtype=np.int32)[indices] - labels_np = np.array(labels, dtype=np.float32)[indices] - return np.expand_dims(users_np, -1), \ - np.expand_dims(items_np, -1), \ - np.expand_dims(labels_np, -1), num_users, num_items, matrix - - -class TestDygraphDeepCF(unittest.TestCase): + labels.append(1.0) + matrix[uid, iid] = 1.0 + + negative = 0 + while negative < 3: + nuid = random.randint(0, num_users - 1) + niid = random.randint(0, num_items - 1) + if (nuid, niid) not in likes: + negative += 1 + user_ids.append(nuid) + item_ids.append(niid) + labels.append(0.0) + + indices = np.arange(len(user_ids)) + np.random.shuffle(indices) + users_np = np.array(user_ids, dtype=np.int32)[indices] + items_np = np.array(item_ids, dtype=np.int32)[indices] + labels_np = np.array(labels, dtype=np.float32)[indices] + return np.expand_dims(users_np, -1), \ + np.expand_dims(items_np, -1), \ + np.expand_dims(labels_np, -1), num_users, num_items, matrix def test_deefcf(self): seed = 90 - if DATA_PATH: + if self.data_path: (users_np, items_np, labels_np, num_users, num_items, - matrix) = load_data(DATA_PATH) + matrix) = self.load_data() else: (users_np, items_np, labels_np, num_users, num_items, - matrix) = get_data() + matrix) = self.get_data() paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) startup = fluid.Program() @@ -228,17 +227,19 @@ def test_deefcf(self): exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) exe.run(startup) - for e in range(NUM_EPOCHES): + for e in range(self.num_epoches): sys.stderr.write('epoch %d\n' % e) - for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE): - if slice + BATCH_SIZE >= users_np.shape[0]: + for slice in range(0, self.batch_size * self.num_batches, + self.batch_size): + if slice + self.batch_size >= users_np.shape[0]: break static_loss = exe.run( main, feed={ - users.name: users_np[slice:slice + BATCH_SIZE], - items.name: items_np[slice:slice + BATCH_SIZE], - labels.name: labels_np[slice:slice + BATCH_SIZE] + users.name: users_np[slice:slice + self.batch_size], + items.name: items_np[slice:slice + self.batch_size], + labels.name: + labels_np[slice:slice + self.batch_size] }, fetch_list=[loss])[0] sys.stderr.write('static loss %s\n' % static_loss) @@ -250,18 +251,20 @@ def test_deefcf(self): deepcf = DeepCF(num_users, num_items, matrix) adam = fluid.optimizer.AdamOptimizer( 0.01, parameter_list=deepcf.parameters()) - for e in range(NUM_EPOCHES): + for e in range(self.num_epoches): sys.stderr.write('epoch %d\n' % e) - for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE): - if slice + BATCH_SIZE >= users_np.shape[0]: + for slice in range(0, self.batch_size * self.num_batches, + self.batch_size): + if slice + self.batch_size >= users_np.shape[0]: break prediction = deepcf( - to_variable(users_np[slice:slice + BATCH_SIZE]), - to_variable(items_np[slice:slice + BATCH_SIZE])) + to_variable(users_np[slice:slice + self.batch_size]), + 
to_variable(items_np[slice:slice + self.batch_size])) loss = fluid.layers.reduce_sum( fluid.layers.log_loss( prediction, - to_variable(labels_np[slice:slice + BATCH_SIZE]))) + to_variable(labels_np[slice:slice + + self.batch_size]))) loss.backward() adam.minimize(loss) deepcf.clear_gradients() @@ -276,18 +279,20 @@ def test_deefcf(self): adam2 = fluid.optimizer.AdamOptimizer( 0.01, parameter_list=deepcf2.parameters()) fluid.set_flags({'FLAGS_sort_sum_gradient': True}) - for e in range(NUM_EPOCHES): + for e in range(self.num_epoches): sys.stderr.write('epoch %d\n' % e) - for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE): - if slice + BATCH_SIZE >= users_np.shape[0]: + for slice in range(0, self.batch_size * self.num_batches, + self.batch_size): + if slice + self.batch_size >= users_np.shape[0]: break prediction2 = deepcf2( - to_variable(users_np[slice:slice + BATCH_SIZE]), - to_variable(items_np[slice:slice + BATCH_SIZE])) + to_variable(users_np[slice:slice + self.batch_size]), + to_variable(items_np[slice:slice + self.batch_size])) loss2 = fluid.layers.reduce_sum( fluid.layers.log_loss( prediction2, - to_variable(labels_np[slice:slice + BATCH_SIZE]))) + to_variable(labels_np[slice:slice + + self.batch_size]))) loss2.backward() adam2.minimize(loss2) deepcf2.clear_gradients() @@ -306,19 +311,22 @@ def test_deefcf(self): adam = fluid.optimizer.AdamOptimizer( 0.01, parameter_list=deepcf.parameters()) - for e in range(NUM_EPOCHES): + for e in range(self.num_epoches): sys.stderr.write('epoch %d\n' % e) - for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE): - if slice + BATCH_SIZE >= users_np.shape[0]: + for slice in range(0, self.batch_size * self.num_batches, + self.batch_size): + if slice + self.batch_size >= users_np.shape[0]: break prediction = deepcf( - to_variable(users_np[slice:slice + BATCH_SIZE]), - to_variable(items_np[slice:slice + BATCH_SIZE])) + to_variable(users_np[slice:slice + + self.batch_size]), + to_variable(items_np[slice:slice + + self.batch_size])) loss = fluid.layers.reduce_sum( fluid.layers.log_loss( prediction, to_variable(labels_np[slice:slice + - BATCH_SIZE]))) + self.batch_size]))) loss.backward() adam.minimize(loss) deepcf.clear_gradients() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py index 6acab36221fa2..28d24f4b5b703 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py @@ -177,4 +177,5 @@ def test_gnn_float32(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_input_spec.py b/python/paddle/fluid/tests/unittests/test_input_spec.py index f8f04229a4de8..a076b69cc0020 100644 --- a/python/paddle/fluid/tests/unittests/test_input_spec.py +++ b/python/paddle/fluid/tests/unittests/test_input_spec.py @@ -12,8 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
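Several tests touched by this diff (test_directory_migration above, test_input_spec below) stop writing scratch files into the source checkout and route them through a per-test temporary directory instead. A minimal sketch of that pattern, with hypothetical names rather than code from the diff:

```python
import os
import tempfile
import unittest


class ExampleTempDirTest(unittest.TestCase):
    """Hypothetical test showing the TemporaryDirectory setUp/tearDown pattern."""

    def setUp(self):
        self.temp_dir = tempfile.TemporaryDirectory()

    def tearDown(self):
        # Deletes the directory and every file the test wrote into it.
        self.temp_dir.cleanup()

    def test_write_scratch_file(self):
        path = os.path.join(self.temp_dir.name, "net_example.txt")
        with open(path, "w") as f:
            f.write("scratch output stays out of the repository checkout")
        self.assertTrue(os.path.exists(path))
```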
+import os import unittest +import tempfile import numpy as np + import paddle import paddle.fluid as fluid from paddle.static import InputSpec @@ -160,6 +163,10 @@ def setUp(self): self.out_num = 16 self.x_spec = paddle.static.InputSpec([-1, 16], name='x') self.x = paddle.randn([4, 16]) + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() @classmethod def setUpClass(cls): @@ -182,7 +189,7 @@ def test_non_tensor_list(self): self.check_result(specs, 'list') def check_result(self, specs, path): - path = './net_non_tensor_' + path + path = os.path.join(self.temp_dir.name, './net_non_tensor_', path) net = NetWithNonTensorSpec(self.in_num, self.out_num) net.eval() @@ -218,7 +225,7 @@ def test_spec_compatible(self): net = paddle.jit.to_static(net, input_spec=specs) net.eval() - path = './net_twice' + path = os.path.join(self.temp_dir.name, './net_twice') # NOTE: check input_specs_compatible new_specs = [self.x_spec, True, "bn", 10] @@ -264,6 +271,7 @@ def setUp(self): self.y_spec = paddle.static.InputSpec([16], name='y') self.x = paddle.randn([4, 16]) self.y = paddle.randn([16]) + self.temp_dir = tempfile.TemporaryDirectory() @classmethod def setUpClass(cls): @@ -271,7 +279,7 @@ def setUpClass(cls): def test_non_tensor_with_prune(self): specs = [self.x_spec, self.y_spec, True] - path = './net_non_tensor_prune_' + path = os.path.join(self.temp_dir.name, './net_non_tensor_prune_') net = NetWithNonTensorSpecPrune(self.in_num, self.out_num) net.eval() diff --git a/python/paddle/fluid/tests/unittests/test_kron_op.py b/python/paddle/fluid/tests/unittests/test_kron_op.py index 61b5b92c007e9..51a9cbc63d60f 100644 --- a/python/paddle/fluid/tests/unittests/test_kron_op.py +++ b/python/paddle/fluid/tests/unittests/test_kron_op.py @@ -151,7 +151,7 @@ def init_grad_input_output(self): self.grad_y = self.get_grad_y_by_numpy() def get_grad_x_by_numpy(self): - grad_x = np.zeros(self.x_shape, np.complex) + grad_x = np.zeros(self.x_shape, np.complex128) for x_i in range(self.x_shape[0]): for x_j in range(self.x_shape[1]): for i in range(self.y_shape[0]): @@ -163,7 +163,7 @@ def get_grad_x_by_numpy(self): return grad_x def get_grad_y_by_numpy(self): - grad_y = np.zeros(self.y_shape, np.complex) + grad_y = np.zeros(self.y_shape, np.complex128) for y_i in range(self.y_shape[0]): for y_j in range(self.y_shape[1]): for x_i in range(self.x_shape[0]): diff --git a/python/paddle/fluid/tests/unittests/test_layout_autotune.py b/python/paddle/fluid/tests/unittests/test_layout_autotune.py index d29f47c8ab11d..f17bffe3b86ee 100644 --- a/python/paddle/fluid/tests/unittests/test_layout_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_layout_autotune.py @@ -12,14 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
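Note: the test_input_spec.py hunks above follow a pattern applied throughout this patch: files written by a test go into a per-test temporary directory created in setUp and removed in tearDown, so nothing is left behind in the working directory. A self-contained sketch of the pattern, with a plain file write standing in for paddle.jit.save:

.. code-block:: python

    import os
    import tempfile
    import unittest

    class TestWithTempDir(unittest.TestCase):

        def setUp(self):
            self.temp_dir = tempfile.TemporaryDirectory()

        def tearDown(self):
            self.temp_dir.cleanup()

        def test_save(self):
            # Stand-in for paddle.jit.save(net, path) used by the real tests.
            path = os.path.join(self.temp_dir.name, 'net_non_tensor_')
            with open(path, 'w') as f:
                f.write('dummy')
            self.assertTrue(os.path.exists(path))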
-import paddle +import os +import json +import tempfile import unittest +import warnings import numpy + +import paddle import paddle.nn.functional as F -import tempfile -import warnings -import json -import os from paddle.fluid.framework import _enable_legacy_dygraph _enable_legacy_dygraph() diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py index 2d6cdac4854f7..e2ed2d8003a46 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py @@ -181,10 +181,11 @@ def _collate_fn(sample_list): for i in range(10): indices_queue.put([i, i + 10]) indices_queue.put(None) + base_seed = 1234 _worker_loop(loader._dataset, 0, indices_queue, loader._data_queue, loader._workers_done_event, True, _collate_fn, True, _init_fn, 0, 1, - loader._use_shared_memory) + loader._use_shared_memory, base_seed) self.assertTrue(False) except AssertionError: pass @@ -223,10 +224,11 @@ def _collate_fn(sample_list): indices_queue.put([i, i + 10]) indices_queue.put(None) loader._workers_done_event.set() + base_seed = 1234 _worker_loop(loader._dataset, 0, indices_queue, loader._data_queue, loader._workers_done_event, True, _collate_fn, True, _init_fn, 0, 1, - loader._use_shared_memory) + loader._use_shared_memory, base_seed) self.assertTrue(True) except AssertionError: pass diff --git a/python/paddle/fluid/tests/unittests/test_ones_like.py b/python/paddle/fluid/tests/unittests/test_ones_like.py index 0c6e2476be324..31e28fe478707 100644 --- a/python/paddle/fluid/tests/unittests/test_ones_like.py +++ b/python/paddle/fluid/tests/unittests/test_ones_like.py @@ -41,7 +41,7 @@ def test_api(self): # 'bool', 'float32', 'float64', 'int32', 'int64' out1 = ones_like(x) - out2 = ones_like(x, np.bool) + out2 = ones_like(x, np.bool_) out3 = ones_like(x, 'float64') out4 = ones_like(x, 'int32') out5 = ones_like(x, 'int64') @@ -54,7 +54,7 @@ def test_api(self): fetch_list=[out1, out2, out3, out4, out5]) for i, dtype in enumerate( - [np.float32, np.bool, np.float64, np.int32, np.int64]): + [np.float32, np.bool_, np.float64, np.int32, np.int64]): self.assertEqual(outs[i].dtype, dtype) self.assertEqual((outs[i] == np.ones(shape, dtype)).all(), True) @@ -67,7 +67,7 @@ def test_out(self): 0) if core.is_compiled_with_cuda() else fluid.CPUPlace() paddle.disable_static(place) x = paddle.to_tensor(np.ones(shape)) - for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]: + for dtype in [np.bool_, np.float32, np.float64, np.int32, np.int64]: out = ones_like(x, dtype) self.assertEqual((out.numpy() == np.ones(shape, dtype)).all(), True) diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index b70b69ca97c3d..490167a8ff796 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -14,6 +14,8 @@ from __future__ import print_function +import os +import tempfile import unittest import paddle.fluid as fluid @@ -29,8 +31,6 @@ from paddle.io import Dataset import numpy -paddle.enable_static() - class TestOptimizer(unittest.TestCase): @@ -1279,6 +1279,12 @@ class TestMasterWeightSaveForFP16(unittest.TestCase): Master weights will be saved by optimizer::state_dict. 
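Note: most dtype edits in this patch replace NumPy's deprecated Python-builtin aliases (np.bool, np.float, np.int, np.complex, np.object), which have emitted DeprecationWarning since NumPy 1.20 and are removed in NumPy 1.24, with the explicit dtype names used in the added lines:

.. code-block:: python

    import numpy as np

    found_inf = np.array([False], dtype=np.bool_)        # was: np.bool
    variances = np.array([0.1, 0.2], dtype=np.float64)   # was: np.float
    grad_x = np.zeros((2, 3), dtype=np.complex128)       # was: np.complex
    is_object = np.array([1, 2]).dtype == np.object_     # was: np.object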
''' + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def check_with_opt_state_dict(self, use_save_load=True): paddle.seed(100) numpy.random.seed(100) @@ -1340,10 +1346,12 @@ def __len__(self): optimizer.clear_grad(set_to_zero=False) if use_save_load and i == 5: - paddle.save(model.state_dict(), "model.pdparams") - paddle.save(optimizer.state_dict(), "opt.pdopt") - model.set_state_dict(paddle.load("model.pdparams")) - optimizer.set_state_dict(paddle.load("opt.pdopt")) + model_path = os.path.join(self.temp_dir.name, "model.pdparams") + optimizer_path = os.path.join(self.temp_dir.name, "opt.pdopt") + paddle.save(model.state_dict(), model_path) + paddle.save(optimizer.state_dict(), optimizer_path) + model.set_state_dict(paddle.load(model_path)) + optimizer.set_state_dict(paddle.load(optimizer_path)) return loss.numpy() @@ -1359,4 +1367,5 @@ def test_with_state_dict(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_prior_box_op.py index 2e18f8b748efd..b0aaaec246f67 100644 --- a/python/paddle/fluid/tests/unittests/test_prior_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_prior_box_op.py @@ -81,9 +81,9 @@ def init_test_params(self): self.set_min_max_aspect_ratios_order() self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0] self.aspect_ratios = np.array(self.aspect_ratios, - dtype=np.float).flatten() + dtype=np.float64).flatten() self.variances = [0.1, 0.1, 0.2, 0.2] - self.variances = np.array(self.variances, dtype=np.float).flatten() + self.variances = np.array(self.variances, dtype=np.float64).flatten() self.clip = True self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes) diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index d6fabb44b4fe2..8d0fcc7ae22ab 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -965,7 +965,7 @@ def test_dygraph(self): paddle.disable_static() for place in self.places: with fluid.dygraph.guard(place): - np_x = np.random.randint(0, 2, (12, 10)).astype(np.bool) + np_x = np.random.randint(0, 2, (12, 10)).astype(np.bool_) x = fluid.layers.assign(np_x) x = fluid.layers.cast(x, 'bool') @@ -1021,7 +1021,7 @@ def test_dygraph(self): paddle.disable_static() for place in self.places: with fluid.dygraph.guard(place): - np_x = np.random.randint(0, 2, (12, 10)).astype(np.bool) + np_x = np.random.randint(0, 2, (12, 10)).astype(np.bool_) x = fluid.layers.assign(np_x) x = fluid.layers.cast(x, 'bool') diff --git a/python/paddle/fluid/tests/unittests/test_signal.py b/python/paddle/fluid/tests/unittests/test_signal.py index 8257630cf2071..670b3aa40df4d 100644 --- a/python/paddle/fluid/tests/unittests/test_signal.py +++ b/python/paddle/fluid/tests/unittests/test_signal.py @@ -81,7 +81,7 @@ def normalize(S, norm=np.inf, axis=0, threshold=None, fill=None): raise Exception("Input must be finite") # All norms only depend on magnitude, let's do that first - mag = np.abs(S).astype(np.float) + mag = np.abs(S).astype(np.float64) # For max/min norms, filling with 1 works fill_norm = 1 @@ -598,8 +598,8 @@ def rand_x(dims=1, np.random.randint(min_dim_len, max_dim_len) for i in range(dims) ] if complex: - return np.random.randn(*shape).astype( - dtype) + 1.j * np.random.randn(*shape).astype(dtype) + return 
np.random.randn( + *shape).astype(dtype) + 1.j * np.random.randn(*shape).astype(dtype) else: return np.random.randn(*shape).astype(dtype) diff --git a/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py b/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py index a2ccfa925ed81..59b652b1d3d67 100644 --- a/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py +++ b/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py @@ -13,12 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import numpy as np +import tempfile +import unittest + import paddle import paddle.fluid as fluid -import unittest import paddle.nn as nn -import os class SimpleFCLayer(nn.Layer): @@ -54,6 +56,10 @@ def setUp(self): self.fc_size = 2 self.layer = self._train_simple_net() self.type_str = 'class' + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() def test_trace_err(self): if fluid.framework.in_dygraph_mode(): @@ -122,7 +128,7 @@ def test_save_inference_model_err(self): dygraph_out, traced_layer = fluid.dygraph.TracedLayer.trace( self.layer, [in_x]) - path = './traced_layer_err_msg' + path = os.path.join(self.temp_dir.name, './traced_layer_err_msg') with self.assertRaises(TypeError) as e: traced_layer.save_inference_model([0]) self.assertEqual( @@ -193,11 +199,15 @@ class TestTracedLayerSaveInferenceModel(unittest.TestCase): """test save_inference_model will automaticlly create non-exist dir""" def setUp(self): - self.save_path = "./nonexist_dir/fc" + self.temp_dir = tempfile.TemporaryDirectory() + self.save_path = os.path.join(self.temp_dir.name, "./nonexist_dir/fc") import shutil if os.path.exists(os.path.dirname(self.save_path)): shutil.rmtree(os.path.dirname(self.save_path)) + def tearDown(self): + self.temp_dir.cleanup() + def test_mkdir_when_input_path_non_exist(self): if fluid.framework.in_dygraph_mode(): return diff --git a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py index c1294628a4e71..0d31dad81997e 100644 --- a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py +++ b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py @@ -24,7 +24,7 @@ class TestUpdateLossScalingOp(OpTest): def setUp(self): self.op_type = "update_loss_scaling" self.init() - found_inf = np.array([False], dtype=np.bool) + found_inf = np.array([False], dtype=np.bool_) x = np.random.random((1024, 1024)).astype(self.dtype) self.inputs = { @@ -66,7 +66,7 @@ class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp): def setUp(self): self.op_type = "update_loss_scaling" self.init() - found_inf = np.array([True], dtype=np.bool) + found_inf = np.array([True], dtype=np.bool_) x = np.random.random((1024, 1024)).astype(self.dtype) i = np.random.randint(0, 1024, 1) j = np.random.randint(0, 1024, 1) diff --git a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py index 3be1fb85565f7..13911dff01601 100644 --- a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py @@ -43,7 +43,7 @@ def test_api(self): with program_guard(train_program, startup_program): x = paddle.fluid.data('X', shape) out1 = zeros_like(x) - out2 = zeros_like(x, np.bool) + out2 = zeros_like(x, np.bool_) out3 = zeros_like(x, 'float64') out4 = zeros_like(x, 'int32') out5 = 
zeros_like(x, 'int64') @@ -54,7 +54,7 @@ def test_api(self): feed={'X': np.ones(shape).astype('float32')}, fetch_list=[out1, out2, out3, out4, out5]) for (i, dtype) in enumerate( - [np.float32, np.bool, np.float64, np.int32, np.int64]): + [np.float32, np.bool_, np.float64, np.int32, np.int64]): self.assertEqual(outs[i].dtype, dtype) self.assertEqual((outs[i] == np.zeros(shape, dtype)).all(), True) @@ -71,7 +71,7 @@ def test_out(self): if core.is_compiled_with_cuda() else fluid.CPUPlace()) paddle.disable_static(place) x = paddle.to_tensor(np.ones(shape)) - for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]: + for dtype in [np.bool_, np.float32, np.float64, np.int32, np.int64]: out = zeros_like(x, dtype) self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(), True) diff --git a/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt index 233c4e6143615..6267526f33c12 100644 --- a/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt @@ -21,11 +21,11 @@ list(REMOVE_ITEM TEST_OPS test_mean_op_xpu) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach(TEST_OP) +endforeach() foreach(TEST_OP ${DIST_TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach(TEST_OP) +endforeach() set_tests_properties(test_mul_op_xpu PROPERTIES TIMEOUT 120) set_tests_properties(test_conv2d_op_xpu PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py index a4175ec25cf1b..f33da83bae7a1 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py @@ -240,8 +240,8 @@ def test_bool_api_4(self): op = eval("paddle.%s" % (self.op_type)) out = op(x, y) exe = paddle.static.Executor(self.place) - input_x = np.array([True, False, True]).astype(np.bool) - input_y = np.array([True, True, False]).astype(np.bool) + input_x = np.array([True, False, True]).astype(np.bool_) + input_y = np.array([True, True, False]).astype(np.bool_) real_result = callback(input_x, input_y) res, = exe.run(feed={ "x": input_x, @@ -258,8 +258,8 @@ def test_bool_broadcast_api_4(self): op = eval("paddle.%s" % (self.op_type)) out = op(x, y) exe = paddle.static.Executor(self.place) - input_x = np.array([True, False, True]).astype(np.bool) - input_y = np.array([True]).astype(np.bool) + input_x = np.array([True, False, True]).astype(np.bool_) + input_y = np.array([True]).astype(np.bool_) real_result = callback(input_x, input_y) res, = exe.run(feed={ "x": input_x, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py index c8fcffbd3d33d..32dd28f73851d 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py @@ -98,9 +98,10 @@ def init_test_params(self): self.set_min_max_aspect_ratios_order() self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0] self.aspect_ratios = np.array(self.aspect_ratios, - dtype=np.float).flatten() + dtype=np.float64).flatten() self.variances = [0.1, 0.1, 0.2, 0.2] - self.variances = np.array(self.variances, dtype=np.float).flatten() + self.variances = np.array(self.variances, + dtype=np.float64).flatten() self.clip = True self.num_priors = len(self.real_aspect_ratios) * 
len(self.min_sizes) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py index 0aecc48fe3506..5ed10d159ae05 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py @@ -31,7 +31,7 @@ class TestUpdateLossScalingOp(XPUOpTest): def setUp(self): self.op_type = "update_loss_scaling" self.init() - found_inf = np.array([False], dtype=np.bool) + found_inf = np.array([False], dtype=np.bool_) x = np.random.random((1024, 1024)).astype(self.dtype) self.inputs = { @@ -75,7 +75,7 @@ class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp): def setUp(self): self.op_type = "update_loss_scaling" self.init() - found_inf = np.array([True], dtype=np.bool) + found_inf = np.array([True], dtype=np.bool_) x = np.random.random((1024, 1024)).astype(self.dtype) i = np.random.randint(0, 1024, 1) j = np.random.randint(0, 1024, 1) diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index ab7e135adc6c4..87c38a46692de 100644 --- a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -45,6 +45,7 @@ def fused_feedforward(x, pre_layer_norm=False, training=True, mode='upscale_in_train', + ring_id=-1, name=None): r""" This is a fusion operator to compute feed forward layer in transformer model architecture. @@ -88,6 +89,7 @@ def fused_feedforward(x, - train: out = input * mask - inference: out = input * (1.0 - p) + ring_id (int, optional): For distributed forward in tensor model parallel, only support NCCL. Default is -1, means not using tensor parallel. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -132,7 +134,8 @@ def fused_feedforward(x, "dropout1_fix_seed", seed is not None, "dropout2_fix_seed", seed is not None, "dropout1_seed", seed if seed is not None else 0, "dropout2_seed", seed if seed is not None else 0, - 'dropout1_implementation', mode, 'dropout2_implementation', mode) + 'dropout1_implementation', mode, 'dropout2_implementation', mode, + 'ring_id', ring_id) return out helper = LayerHelper("fused_feedforward") @@ -206,7 +209,8 @@ def fused_feedforward(x, 'dropout1_seed': seed if seed is not None else 0, 'dropout2_seed': seed if seed is not None else 0, 'dropout1_implementation': mode, - 'dropout2_implementation': mode + 'dropout2_implementation': mode, + 'ring_id': ring_id, }) return out diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py index 595b1d27fea8b..f52cbd2cd3ef4 100644 --- a/python/paddle/incubate/nn/layer/fused_transformer.py +++ b/python/paddle/incubate/nn/layer/fused_transformer.py @@ -101,12 +101,12 @@ def forward(self, x, residual): Applies fused_bias_dropout_residual_layer_norm operation. Parameters: - x (Tensor): The input tensor. It is a tensor with shape - `[batch_size, seq_len, embed_dim]`. The data type should be - float32 or float64. - residual (Tensor, optional): The residual tensor. It is a tensor - with shape `[batch_size, value_length, vdim]`. The data type - should be float32 or float64. + x (Tensor): The input tensor. It is a tensor with shape + `[batch_size, seq_len, embed_dim]`. The data type should be + float32 or float64. 
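Note: the new ring_id argument to fused_feedforward selects the NCCL ring of the tensor-model-parallel group; the default of -1 keeps single-card behaviour. A rough usage sketch, assuming a CUDA build of Paddle and abbreviating the argument list to the input and the two linear weights (the bias, layer-norm, and dropout arguments keep their defaults):

.. code-block:: python

    import paddle
    import paddle.incubate.nn.functional as incubate_f

    x = paddle.randn([2, 8, 64])    # [batch_size, seq_len, d_model]
    w1 = paddle.randn([64, 256])    # d_model -> dim_feedforward
    w2 = paddle.randn([256, 64])    # dim_feedforward -> d_model

    # ring_id=-1 (the default) means no tensor parallelism; a model-parallel
    # caller passes the ring id of its communication group instead.
    out = incubate_f.fused_feedforward(x, w1, w2, ring_id=-1)
    print(out.shape)  # [2, 8, 64]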
+ residual (Tensor, optional): The residual tensor. It is a tensor + with shape `[batch_size, value_length, vdim]`. The data type + should be float32 or float64. Returns: Tensor|tuple: It is a tensor that has the same shape and data type \ @@ -158,15 +158,39 @@ class FusedMultiHeadAttention(Layer): (True) or post_layer_norm architecture (False). Default False. need_weights (bool, optional): Indicate whether to return the attention weights. Now, only False is supported. Default False. - weight_attr(ParamAttr, optional): To specify the weight parameter property. - Default: None, which means the default weight parameter property is used. - See usage for details in :code:`ParamAttr`. - bias_attr (ParamAttr|bool, optional): To specify the bias parameter property. - Default: None, which means the default bias parameter property is used. - If it is set to False, this layer will not have trainable bias parameter. - See usage for details in :code:`ParamAttr`. + qkv_weight_attr(ParamAttr, optional): To specify the weight parameter property + for QKV projection computation. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + qkv_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property + for QKV projection computation. The `False` value means the corresponding layer + would not have trainable bias parameter. Default: None, which means the + default bias parameter property is used. See usage for details in :code:`ParamAttr`. + linear_weight_attr(ParamAttr, optional): To specify the weight parameter property + for linear projection computation. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + linear_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property + for linear projection computation. The `False` value means the corresponding layer would + not have trainable bias parameter. Default: None, which means the default bias + parameter property is used. See usage for details in :code:`ParamAttr`. + pre_ln_scale_attr(ParamAttr, optional): To specify the weight parameter property + for pre_layer_norm computation. Otherwise, all layers both use it as + `attr` to create parameters. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + pre_ln_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property + for pre_layer_norm computation. The `False` value means the corresponding layer would + not have trainable bias parameter. Default: None, which means the default bias + parameter property is used. See usage for details in :code:`ParamAttr`. + ln_scale_attr(ParamAttr, optional): To specify the weight parameter property + for post_layer_norm computation. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + ln_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property + for post_layer_norm computation. The `False` value means the corresponding layer would + not have trainable bias parameter. Default: None, which means the default bias + parameter property is used. See usage for details in :code:`ParamAttr`. epsilon (float, optional): The small value added to the variance to prevent division by zero. Default: 1e-05. + nranks (int, optional): Distributed tensor model parallel nranks. Default is 1, means not using tensor parallel. 
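Note: with the attribute split documented above, every internal parameter of FusedMultiHeadAttention can be given its own ParamAttr, and tensor model parallelism is configured through nranks and ring_id. A construction sketch (unspecified attributes fall back to their defaults; running the layer requires a CUDA build):

.. code-block:: python

    import paddle
    from paddle.incubate.nn import FusedMultiHeadAttention

    init = paddle.nn.initializer.XavierUniform()
    attn = FusedMultiHeadAttention(
        embed_dim=128,
        num_heads=8,
        qkv_weight_attr=paddle.ParamAttr(initializer=init),
        linear_weight_attr=paddle.ParamAttr(initializer=init),
        nranks=1,    # >1 splits num_heads across the tensor-parallel group
        ring_id=-1)  # NCCL ring id of that group; -1 means no parallelism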
+ ring_id (int, optional): For distributed tensor model parallel. Default is -1, means not using tensor parallel. Examples: @@ -191,9 +215,17 @@ def __init__(self, vdim=None, normalize_before=False, need_weights=False, - weight_attr=None, - bias_attr=None, + qkv_weight_attr=None, + qkv_bias_attr=None, + linear_weight_attr=None, + linear_bias_attr=None, + pre_ln_scale_attr=None, + pre_ln_bias_attr=None, + ln_scale_attr=None, + ln_bias_attr=None, epsilon=1e-5, + nranks=1, + ring_id=-1, name=None): super(FusedMultiHeadAttention, self).__init__() @@ -204,9 +236,8 @@ def __init__(self, self.normalize_before = normalize_before self._dtype = self._helper.get_default_dtype() - self._weight_attr = weight_attr - self._bias_attr = bias_attr self._epsilon = epsilon + self._ring_id = ring_id self.embed_dim = embed_dim self.num_heads = num_heads @@ -215,41 +246,61 @@ def __init__(self, self.vdim = vdim self.need_weights = need_weights assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" - assert need_weights == False, "Only support need_weight is False now." + assert need_weights is False, "Only support need_weight is False now." + + # tensor model parallel + assert num_heads % nranks == 0 + num_heads = num_heads // nranks self.qkv_weight = self.create_parameter( shape=[3, num_heads, self.head_dim, embed_dim], - attr=self._weight_attr, + attr=qkv_weight_attr, dtype=self._dtype, is_bias=False) self.qkv_bias = self.create_parameter( shape=[3, num_heads, self.head_dim], - attr=self._bias_attr, + attr=qkv_bias_attr, dtype=self._dtype, is_bias=True) - self.linear_weight = self.create_parameter(shape=[embed_dim, embed_dim], - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False) + self.linear_weight = self.create_parameter( + shape=[num_heads * self.head_dim, embed_dim], + attr=linear_weight_attr, + dtype=self._dtype, + is_bias=False) self.linear_bias = self.create_parameter(shape=[embed_dim], - attr=self._bias_attr, + attr=linear_bias_attr, dtype=self._dtype, is_bias=True) - self.pre_ln_scale = self.create_parameter( - attr=self._weight_attr, - shape=[embed_dim], - default_initializer=Constant(value=1.0)) - self.pre_ln_bias = self.create_parameter(attr=self._bias_attr, + # tensor model parallel + if nranks > 1: + assert ring_id != -1 + # column parallel + _set_var_distributed(self.qkv_weight) + _set_var_distributed(self.qkv_bias) + # row parallel + _set_var_distributed(self.linear_weight) + + if normalize_before: + self.pre_ln_scale = self.create_parameter( + attr=pre_ln_scale_attr, + shape=[embed_dim], + default_initializer=Constant(value=1.0)) + self.pre_ln_bias = self.create_parameter(attr=pre_ln_bias_attr, + shape=[embed_dim], + is_bias=True) + self.ln_scale = None + self.ln_bias = None + else: + self.pre_ln_scale = None + self.pre_ln_bias = None + self.ln_scale = self.create_parameter( + attr=ln_scale_attr, + shape=[embed_dim], + default_initializer=Constant(value=1.0)) + self.ln_bias = self.create_parameter(attr=ln_bias_attr, shape=[embed_dim], is_bias=True) - self.ln_scale = self.create_parameter( - attr=self._weight_attr, - shape=[embed_dim], - default_initializer=Constant(value=1.0)) - self.ln_bias = self.create_parameter(attr=self._bias_attr, - shape=[embed_dim], - is_bias=True) self.dropout_rate = dropout_rate self.attn_dropout_rate = attn_dropout_rate @@ -294,8 +345,6 @@ def forward(self, query, key=None, value=None, attn_mask=None, cache=None): # Support bool or int mask attn_mask = _convert_attention_mask(attn_mask, query.dtype) - assert cache == 
None, "Only support cache is None now." - out = incubate_f.fused_multi_head_attention( x=query, qkv_weight=self.qkv_weight, @@ -308,11 +357,13 @@ def forward(self, query, key=None, value=None, attn_mask=None, cache=None): pre_ln_epsilon=self._epsilon, qkv_bias=self.qkv_bias, linear_bias=self.linear_bias, + cache_kv=cache, attn_mask=attn_mask, dropout_rate=self.dropout_rate, attn_dropout_rate=self.attn_dropout_rate, ln_epsilon=self._epsilon, training=self.training, + ring_id=self._ring_id, name=self.name) return out @@ -338,14 +389,38 @@ class FusedFeedForward(Layer): If None, use the value of `dropout_rate`. Default None normalize_before (bool, optional): Indicate whether to put layer normalization into, preprocessing or postprocessing. Default False - weight_attr (ParamAttr, optional): The attribute for the learnable weight of this layer. - The default value is None and the weight will be initialized to zero. For detailed - information, please refer to paddle.ParamAttr. - bias_attr (ParamAttr|bool, optional): The attribute for the learnable bias of thi layer. - If it is set to False, no bias will be added to the output. If it is set to None or one - kind of ParamAttr, a bias parameter will be created according to ParamAttr. For detailed - information, please refer to paddle.ParamAttr. The default value is None and the bias - will be initialized to zero. + linear1_weight_attr(ParamAttr, optional): To specify the weight parameter property + for FFN first linear. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + linear1_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property + for FFN first linear. The `False` value means the corresponding layer would + not have trainable bias parameter. Default: None, which means the default bias + parameter property is used. See usage for details in :code:`ParamAttr`. + linear2_weight_attr(ParamAttr, optional): To specify the weight parameter property + for FFN second linear. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + linear2_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property + for FFN second linear. The `False` value means the corresponding layer would + not have trainable bias parameter. Default: None, which means the default bias + parameter property is used. See usage for details in :code:`ParamAttr`. + ln1_scale_attr(ParamAttr, optional): To specify the weight parameter property + for FFN pre_layer_norm. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + ln1_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property + for FFN pre_layer_norm. The `False` value means the corresponding layer would + not have trainable bias parameter. Default: None, which means the default bias + parameter property is used. See usage for details in :code:`ParamAttr`. + ln2_scale_attr(ParamAttr, optional): To specify the weight parameter property + for FFN post_layer_norm. Default: None, which means the default weight + parameter property is used. See usage for details in :code:`ParamAttr`. + ln2_bias_attr(ParamAttr|bool, optional): To specify the bias parameter property + for FFN layer_norm. The `False` value means the corresponding layer would + not have trainable bias parameter. Default: None, which means the default bias + parameter property is used. 
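Note: FusedFeedForward adopts the same scheme as the attention layer: per-parameter attributes plus nranks and ring_id for tensor model parallelism (when nranks > 1, dim_feedforward is divided across ranks, as the __init__ further below shows). A construction sketch under the same assumptions:

.. code-block:: python

    import paddle
    from paddle.incubate.nn import FusedFeedForward

    ffn = FusedFeedForward(
        d_model=128,
        dim_feedforward=512,
        linear1_weight_attr=paddle.ParamAttr(),
        linear2_weight_attr=paddle.ParamAttr(),
        nranks=1,    # >1 splits dim_feedforward across the tensor-parallel group
        ring_id=-1)  # NCCL ring id of that group; -1 means no parallelism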
See usage for details in :code:`ParamAttr`. + nranks (int, optional): Distributed tensor model parallel nranks. Default is 1, means not using tensor parallel. + ring_id (int, optional): For distributed tensor model parallel. Default is -1, means not using tensor parallel. + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. Examples: .. code-block:: python @@ -369,8 +444,16 @@ def __init__(self, activation="relu", act_dropout_rate=None, normalize_before=False, - weight_attr=None, - bias_attr=None, + linear1_weight_attr=None, + linear1_bias_attr=None, + linear2_weight_attr=None, + linear2_bias_attr=None, + ln1_scale_attr=None, + ln1_bias_attr=None, + ln2_scale_attr=None, + ln2_bias_attr=None, + nranks=1, + ring_id=-1, name=None): super(FusedFeedForward, self).__init__() @@ -383,51 +466,68 @@ def __init__(self, self._dtype = self._helper.get_default_dtype() self._d_model = d_model + + assert dim_feedforward % nranks == 0 + dim_feedforward = dim_feedforward // nranks self._dim_feedforward = dim_feedforward self._dropout_rate = dropout_rate self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate self._act_method = activation self._normalize_before = normalize_before self._epsilon = epsilon + self._ring_id = ring_id self._linear1_weight = self.create_parameter( shape=[d_model, dim_feedforward], - attr=weight_attr, + attr=linear1_weight_attr, dtype=self._dtype, is_bias=False) self._linear1_bias = self.create_parameter(shape=[dim_feedforward], - attr=bias_attr, + attr=linear1_bias_attr, dtype=self._dtype, is_bias=True) self._linear2_weight = self.create_parameter( shape=[dim_feedforward, d_model], - attr=weight_attr, + attr=linear2_weight_attr, dtype=self._dtype, is_bias=False) self._linear2_bias = self.create_parameter(shape=[d_model], - attr=bias_attr, + attr=linear2_bias_attr, dtype=self._dtype, is_bias=True) - self._ln1_scale = self.create_parameter( - shape=[d_model], - attr=None, - is_bias=False, - default_initializer=Constant(1.0)) - self._ln1_bias = self.create_parameter(shape=[d_model], - attr=None, - is_bias=True) - - self._ln2_scale = self.create_parameter( - shape=[d_model], - attr=None, - is_bias=False, - default_initializer=Constant(1.0)) - self._ln2_bias = self.create_parameter(shape=[d_model], - attr=None, - is_bias=True) + if nranks > 1: + assert ring_id != -1 + # column parallel + _set_var_distributed(self._linear1_weight) + _set_var_distributed(self._linear1_bias) + _set_var_distributed(self._linear2_weight) + + if normalize_before: + self._ln1_scale = self.create_parameter( + shape=[d_model], + attr=ln1_scale_attr, + is_bias=False, + default_initializer=Constant(1.0)) + self._ln1_bias = self.create_parameter(shape=[d_model], + attr=ln1_bias_attr, + is_bias=True) + self._ln2_scale = None + self._ln2_bias = None + else: + self._ln1_scale = None + self._ln1_bias = None + self._ln2_scale = self.create_parameter( + shape=[d_model], + attr=ln2_scale_attr, + is_bias=False, + default_initializer=Constant(1.0)) + self._ln2_bias = self.create_parameter(shape=[d_model], + attr=ln2_bias_attr, + is_bias=True) + self.name = name def forward(self, src, cache=None): @@ -448,6 +548,7 @@ def forward(self, src, cache=None): ln2_epsilon=self._epsilon, pre_layer_norm=self._normalize_before, training=self.training, + ring_id=self._ring_id, name=self.name) return out @@ -553,8 +654,14 @@ def __init__(self, dropout_rate=dropout_rate, 
attn_dropout_rate=attn_dropout_rate, normalize_before=self.normalize_before, - weight_attr=weight_attrs[0], - bias_attr=bias_attrs[0]) + qkv_weight_attr=weight_attrs[0], + qkv_bias_attr=bias_attrs[0], + linear_weight_attr=weight_attrs[0], + linear_bias_attr=bias_attrs[0], + pre_ln_scale_attr=weight_attrs[0], + pre_ln_bias_attr=bias_attrs[0], + ln_scale_attr=weight_attrs[0], + ln_bias_attr=bias_attrs[0]) self.ffn = FusedFeedForward(d_model, dim_feedforward, @@ -562,8 +669,10 @@ def __init__(self, activation=activation, act_dropout_rate=act_dropout_rate, normalize_before=self.normalize_before, - weight_attr=weight_attrs[1], - bias_attr=bias_attrs[1]) + linear1_weight_attr=weight_attrs[1], + linear1_bias_attr=bias_attrs[1], + linear2_weight_attr=weight_attrs[1], + linear2_bias_attr=bias_attrs[1]) def forward(self, src, src_mask=None, cache=None): """ diff --git a/python/paddle/nn/initializer/constant.py b/python/paddle/nn/initializer/constant.py index 292eaff362b40..66818dab451a3 100644 --- a/python/paddle/nn/initializer/constant.py +++ b/python/paddle/nn/initializer/constant.py @@ -22,11 +22,11 @@ class Constant(ConstantInitializer): """Implement the constant initializer. Args: - value (float32): constant value to initialize the parameter + value (float32|float64, optional): constant value to initialize the parameter. Default: 0.0. Examples: .. code-block:: python - + :name: code-example1 import paddle import paddle.nn as nn diff --git a/python/paddle/nn/initializer/kaiming.py b/python/paddle/nn/initializer/kaiming.py index b8ed7febb6bc7..456496571924e 100644 --- a/python/paddle/nn/initializer/kaiming.py +++ b/python/paddle/nn/initializer/kaiming.py @@ -36,7 +36,7 @@ class KaimingNormal(MSRAInitializer): \sqrt{\frac{2.0}{fan\_in}} Args: - fan_in (float32|None): fan_in for Kaiming normal Initializer. If None, it is\ + fan_in (float32|None, optional): fan_in for Kaiming normal Initializer. If None, it is inferred from the variable. default is None. Note: @@ -44,7 +44,7 @@ class KaimingNormal(MSRAInitializer): Examples: .. code-block:: python - + :name: code-example1 import paddle import paddle.nn as nn @@ -79,7 +79,7 @@ class KaimingUniform(MSRAInitializer): x = \sqrt{\frac{6.0}{fan\_in}} Args: - fan_in (float32|None): fan_in for Kaiming uniform Initializer. If None, it is\ + fan_in (float32|None, optional): fan_in for Kaiming uniform Initializer. If None, it is inferred from the variable. default is None. Note: @@ -87,7 +87,7 @@ class KaimingUniform(MSRAInitializer): Examples: .. 
code-block:: python - + :name: code-example1 import paddle import paddle.nn as nn diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index f724f7cfee52c..4ef987eccf2a4 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -76,7 +76,7 @@ def __init__(self, format(valid_padding_modes, padding_mode)) if padding_mode in {'reflect', 'replicate', 'circular' - } and not isinstance(padding, np.int): + } and not isinstance(padding, int): raise TypeError( "when padding_mode in ['reflect', 'replicate', 'circular'], type of padding must be int" ) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 521839af902b5..9971a4d5a3e18 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -352,7 +352,7 @@ def _handle_dtype(data, dtype): data = np.array([data]) elif isinstance(data, (list, tuple)): data = np.array(data) - if data.dtype == np.object: + if data.dtype == np.object_: raise ValueError( "\n\tFaild to convert input data to a regular ndarray :\n\t - Usually " "this means the input data contains nested lists with different lengths. " @@ -962,7 +962,7 @@ def tril(x, diagonal=0, name=None): def triu(x, diagonal=0, name=None): r""" - This op returns the upper triangular part of a matrix (2-D tensor) or batch of matrices + Return the upper triangular part of a matrix (2-D tensor) or batch of matrices :attr:`x`, the other elements of the result tensor are set to 0. The upper triangular part of the matrix is defined as the elements on and above the diagonal. diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index 0cdced2cf9b84..34a1ead2cb497 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -807,9 +807,9 @@ def gen_einsum_op(equation, *operands): if _in_legacy_dygraph(): # dygraph - return _C_ops.einsum(operands, len(operands), 'equation', equation)[0] + return _C_ops.einsum(operands, len(operands), len(operands), 'equation', + equation)[0] - # static graph for inp in operands: check_variable_and_dtype(inp, 'dtype', ['float32', 'float64'], 'einsum') check_type(equation, 'equation', str, 'einsum') @@ -821,11 +821,16 @@ def gen_einsum_op(equation, *operands): helper.create_variable_for_type_inference(dtype=operands[0].dtype) for i in range(len(operands)) ] + xshape = [ + helper.create_variable_for_type_inference(dtype=operands[0].dtype) + for i in range(len(operands)) + ] helper.append_op(type='einsum', inputs={'Operands': operands}, outputs={ 'Out': out, - "InnerCache": caches + "InnerCache": caches, + "XShape": xshape }, attrs=attrs) return out diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 0089ef21dc98a..137c85ac98938 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -430,7 +430,7 @@ def inf_norm(input, reduce_all = True if axis == None or axis == [] or asvector == True else False axis = axis if axis != None and axis != [] else [0] - reduce_type = 'reduce_max' if porder == np.float( + reduce_type = 'reduce_max' if porder == np.float64( 'inf') else 'reduce_min' helper.append_op(type=reduce_type, inputs={'X': out}, diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index c4b4c552c670d..8834ae1d400f1 100755 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -146,8 +146,8 @@ def logical_or(x, y, out=None, name=None): import paddle import numpy as np - x_data = np.array([True, False], dtype=np.bool).reshape(2, 1) - 
y_data = np.array([True, False, True, False], dtype=np.bool).reshape(2, 2) + x_data = np.array([True, False], dtype=np.bool_).reshape(2, 1) + y_data = np.array([True, False, True, False], dtype=np.bool_).reshape(2, 2) x = paddle.to_tensor(x_data) y = paddle.to_tensor(y_data) res = paddle.logical_or(x, y) @@ -191,8 +191,8 @@ def logical_xor(x, y, out=None, name=None): import paddle import numpy as np - x_data = np.array([True, False], dtype=np.bool).reshape([2, 1]) - y_data = np.array([True, False, True, False], dtype=np.bool).reshape([2, 2]) + x_data = np.array([True, False], dtype=np.bool_).reshape([2, 1]) + y_data = np.array([True, False, True, False], dtype=np.bool_).reshape([2, 2]) x = paddle.to_tensor(x_data) y = paddle.to_tensor(y_data) res = paddle.logical_xor(x, y) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 96d24a7f915ee..c445402412e16 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -1887,7 +1887,7 @@ def _get_SectionsTensorList(one_list): def squeeze(x, axis=None, name=None): """ - This OP will squeeze the dimension(s) of size 1 of input tensor x's shape. + Squeeze the dimension(s) of size 1 of input tensor x's shape. Note that the output Tensor will share data with origin Tensor and doesn't have a Tensor copy in ``dygraph`` mode. If you want to use the Tensor copy version, @@ -1944,7 +1944,7 @@ def squeeze(x, axis=None, name=None): Examples: .. code-block:: python - + :name: code-example1 import paddle x = paddle.rand([5, 1, 10]) @@ -2139,13 +2139,13 @@ def unique(x, :ref:`api_guide_Name`. Default: None. Returns: - tuple: (out, indices, inverse, counts). `out` is the unique tensor for `x`. `indices` is \ + tuple (out, indices, inverse, counts). `out` is the unique tensor for `x`. `indices` is \ provided only if `return_index` is True. `inverse` is provided only if `return_inverse` \ is True. `counts` is provided only if `return_counts` is True. Examples: .. code-block:: python - + :name: code-example1 import paddle x = paddle.to_tensor([2, 3, 3, 1, 5, 3]) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index ffca233ff16bf..1cb350f4d7288 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1319,7 +1319,7 @@ def nanmean(x, axis=None, keepdim=False, name=None): @templatedoc(op_type="sum") def add_n(inputs, name=None): """ - This OP is used to sum one or more Tensor of the input. + Sum one or more Tensor of the input. For example: @@ -1365,7 +1365,7 @@ def add_n(inputs, name=None): Examples: .. code-block:: python - + :name: code-example1 import paddle input0 = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], dtype='float32') diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index f43bda1129589..990b20a26772c 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -631,13 +631,13 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): If ``high`` is None (the default), the range is [0, ``low``). Args: - low (int): The lower bound on the range of random values to generate. + low (int, optional): The lower bound on the range of random values to generate. The ``low`` is included in the range. If ``high`` is None, the range is [0, ``low``). Default is 0. high (int, optional): The upper bound on the range of random values to generate, the ``high`` is excluded in the range. Default is None (see above for behavior if high = None). Default is None. 
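Note: the reworded randint docstring marks low and shape as optional without changing the semantics: when high is None, the single bound is treated as the exclusive upper limit. For example:

.. code-block:: python

    import paddle

    a = paddle.randint(10, shape=[2, 3])           # values drawn from [0, 10)
    b = paddle.randint(low=2, high=6, shape=[4])   # values drawn from [2, 6)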
- shape (list|tuple|Tensor): The shape of the output Tensor. If ``shape`` + shape (list|tuple|Tensor, optional): The shape of the output Tensor. If ``shape`` is a list or tuple, the elements of it should be integers or Tensors (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 94a05294aaa63..f46b53a3b71f4 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -207,7 +207,7 @@ def argmax(x, axis=None, keepdim=False, dtype="int64", name=None): def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): """ - This OP computes the indices of the min elements of the input tensor's + Computing the indices of the min elements of the input tensor's element along the provided axis. Args: @@ -217,7 +217,7 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): is [-R, R), where R is x.ndim. when axis < 0, it works the same way as axis + R. Default is None, the input `x` will be into the flatten tensor, and selecting the min value index. keepdim(bool, optional): Whether to keep the given axis in output. If it is True, the dimensions will be same as input x and with size one in the axis. Otherwise the output dimentions is one fewer than x since the axis is squeezed. Default is False. - dtype(str): Data type of the output tensor which can + dtype(str, optional): Data type of the output tensor which can be int32, int64. The default value is 'int64', and it will return the int64 indices. name(str, optional): The default value is None. Normally there is no @@ -225,11 +225,11 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): refer to :ref:`api_guide_Name`. Returns: - Tensor, return the tensor of `int32` if set :attr:`dtype` is `int32`, otherwise return the tensor of `int64` + Tensor, return the tensor of `int32` if set :attr:`dtype` is `int32`, otherwise return the tensor of `int64`. Examples: .. code-block:: python - + :name: code-example1 import paddle x = paddle.to_tensor([[5,8,9,5], @@ -834,7 +834,7 @@ def masked_select(x, mask, name=None): def topk(x, k, axis=None, largest=True, sorted=True, name=None): """ - This OP is used to find values and indices of the k largest or smallest at the optional axis. + Return values and indices of the k largest or smallest at the optional axis. If the input is a 1-D Tensor, finds the k largest or smallest values and indices. If the input is a Tensor with higher rank, this operator computes the top k values and indices along the :attr:`axis`. @@ -856,35 +856,27 @@ def topk(x, k, axis=None, largest=True, sorted=True, name=None): Examples: .. 
code-block:: python + :name: code-example1 + import paddle - import paddle + data_1 = paddle.to_tensor([1, 4, 5, 7]) + value_1, indices_1 = paddle.topk(data_1, k=1) + print(value_1) # [7] + print(indices_1) # [3] + + data_2 = paddle.to_tensor([[1, 4, 5, 7], [2, 6, 2, 5]]) + value_2, indices_2 = paddle.topk(data_2, k=1) + print(value_2) # [[7], [6]] + print(indices_2) # [[3], [1]] + + value_3, indices_3 = paddle.topk(data_2, k=1, axis=-1) + print(value_3) # [[7], [6]] + print(indices_3) # [[3], [1]] + + value_4, indices_4 = paddle.topk(data_2, k=1, axis=0) + print(value_4) # [[2, 6, 5, 7]] + print(indices_4) # [[1, 1, 0, 0]] - tensor_1 = paddle.to_tensor([1, 4, 5, 7]) - value_1, indices_1 = paddle.topk(tensor_1, k=1) - print(value_1) - # [7] - print(indices_1) - # [3] - tensor_2 = paddle.to_tensor([[1, 4, 5, 7], [2, 6, 2, 5]]) - value_2, indices_2 = paddle.topk(tensor_2, k=1) - print(value_2) - # [[7] - # [6]] - print(indices_2) - # [[3] - # [1]] - value_3, indices_3 = paddle.topk(tensor_2, k=1, axis=-1) - print(value_3) - # [[7] - # [6]] - print(indices_3) - # [[3] - # [1]] - value_4, indices_4 = paddle.topk(tensor_2, k=1, axis=0) - print(value_4) - # [[2 6 5 7]] - print(indices_4) - # [[1 1 0 0]] """ diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 2b8cff3543e76..58c9ea6e5d2e8 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -603,7 +603,7 @@ - api : einsum args : (Tensor[] x, str equation) - output : Tensor, Tensor[]{x.size()} + output : Tensor, Tensor[]{x.size()}, Tensor[]{x.size()} infer_meta : func : EinsumInferMeta param : [x, equation] diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 8e20b05110e71..2cdf22beeed96 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -1,3 +1,14 @@ +#- backward_api : einsum_grad + + #forward : einsum (Tensor[] x, str equation) -> Tensor(out), Tensor[](inner_cache) + #args : (Tensor[] x, Tensor[] inner_cache, Tensor out_grad, str equation) + #output : Tensor[](x_grad){x.size()} + #infer_meta : + #func : UnchangedMultiInferMeta + #param : [x] + #kernel : + #func : einsum_grad + - backward_api : abs_double_grad forward : abs_grad (Tensor x, Tensor grad_out) -> Tensor(grad_x) args : (Tensor x, Tensor grad_x_grad) @@ -616,12 +627,12 @@ skip_transform : out_w, out_w_grad - backward_api : einsum_grad - forward : einsum (Tensor[] x, str equation) -> Tensor(out), Tensor[](inner_cache) - args : (Tensor[] x, Tensor[] inner_cache, Tensor out_grad, str equation) + forward : einsum (Tensor[] x, str equation) -> Tensor(out), Tensor[](inner_cache), Tensor[](x_shape) + args : (Tensor[] x_shape, Tensor[] inner_cache, Tensor out_grad, str equation) output : Tensor[](x_grad){x.size()} infer_meta : func : UnchangedMultiInferMeta - param : [x] + param : [x_shape] kernel : func : einsum_grad diff --git a/tools/codestyle/pre_commit.sh b/tools/codestyle/pre_commit.sh new file mode 100755 index 0000000000000..7ea8a1658da26 --- /dev/null +++ b/tools/codestyle/pre_commit.sh @@ -0,0 +1,44 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
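Note: the api.yaml and backward.yaml changes above give einsum an extra XShape output, and einsum_grad now takes x_shape in place of the original operands; the user-facing Python API is unchanged. For reference:

.. code-block:: python

    import paddle

    x = paddle.randn([2, 3])
    y = paddle.randn([3, 4])
    out = paddle.einsum('ij,jk->ik', x, y)   # equivalent to paddle.matmul(x, y)
    print(out.shape)  # [2, 4]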
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set +x + +# use pre-commit 2.17 +if ! [[ $(pre-commit --version) == *"2.17.0"* ]]; then + pip install pre-commit==2.17.0 1>nul +fi + +diff_files=$(git diff --numstat ${BRANCH} | awk '{print $NF}') +echo -e "diff files between pr and ${BRANCH}:\n${diff_files}" + +echo "Checking code style by pre-commit ..." +pre-commit run --files ${diff_files};check_error=$? + +if test ! -z "$(git diff)"; then + echo -e '\n************************************************************************************' + echo -e "These files have been formated by code format hook. You should use pre-commit to \ +format them before git push." + echo -e '************************************************************************************\n' + git diff 2>&1 +fi + +echo -e '\n***********************************' +if [ ${check_error} != 0 ];then + echo "Your PR code style check failed." +else + echo "Your PR code style check passed." +fi +echo -e '***********************************\n' + +exit ${check_error} diff --git a/tools/infer_prune_patches/analysis_predictor.cc.patch b/tools/infer_prune_patches/analysis_predictor.cc.patch new file mode 100644 index 0000000000000..21fa24dd3d957 --- /dev/null +++ b/tools/infer_prune_patches/analysis_predictor.cc.patch @@ -0,0 +1,31 @@ +diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc +index 0645af611b..6b05a7fffb 100644 +--- a/paddle/fluid/inference/api/analysis_predictor.cc ++++ b/paddle/fluid/inference/api/analysis_predictor.cc +@@ -1923,7 +1923,7 @@ USE_TRT_CONVERTER(shuffle_channel); + USE_TRT_CONVERTER(swish); + USE_TRT_CONVERTER(group_norm); + USE_TRT_CONVERTER(instance_norm); +-USE_TRT_CONVERTER(layer_norm); ++//USE_TRT_CONVERTER(layer_norm); + USE_TRT_CONVERTER(gelu); + USE_TRT_CONVERTER(multihead_matmul); + USE_TRT_CONVERTER(fused_embedding_eltwise_layernorm); +@@ -1933,13 +1933,13 @@ USE_TRT_CONVERTER(scale); + USE_TRT_CONVERTER(stack); + USE_TRT_CONVERTER(clip); + USE_TRT_CONVERTER(gather); +-USE_TRT_CONVERTER(anchor_generator); ++//USE_TRT_CONVERTER(anchor_generator); + USE_TRT_CONVERTER(yolo_box); + USE_TRT_CONVERTER(yolo_box_head); + USE_TRT_CONVERTER(arg_max); +-USE_TRT_CONVERTER(roi_align); +-USE_TRT_CONVERTER(affine_channel); +-USE_TRT_CONVERTER(multiclass_nms); ++//USE_TRT_CONVERTER(roi_align); ++//USE_TRT_CONVERTER(affine_channel); ++//USE_TRT_CONVERTER(multiclass_nms); + USE_TRT_CONVERTER(multiclass_nms3); + USE_TRT_CONVERTER(nearest_interp); + USE_TRT_CONVERTER(nearest_interp_v2); diff --git a/tools/infer_prune_patches/analyzer.cc.patch b/tools/infer_prune_patches/analyzer.cc.patch new file mode 100644 index 0000000000000..59a7b4d6b8c2e --- /dev/null +++ b/tools/infer_prune_patches/analyzer.cc.patch @@ -0,0 +1,14 @@ +diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc +index be7d6ab868..498e09cb4d 100644 +--- a/paddle/fluid/inference/analysis/analyzer.cc ++++ b/paddle/fluid/inference/analysis/analyzer.cc +@@ -32,6 +32,9 @@ void Analyzer::RunAnalysis(Argument *argument) { + "analsis_passes is not valid in the argument.")); + const bool 
disable_logs = argument->disable_logs(); + for (auto &pass : argument->analysis_passes()) { ++ if (pass == "ir_params_sync_among_devices_pass") { ++ continue; ++ } + if (!disable_logs) { + string::PrettyLogH1("--- Running analysis [%s]", pass); + } diff --git a/tools/infer_prune_patches/device_context.cc.patch b/tools/infer_prune_patches/device_context.cc.patch new file mode 100644 index 0000000000000..75be9a0d1d997 --- /dev/null +++ b/tools/infer_prune_patches/device_context.cc.patch @@ -0,0 +1,46 @@ +diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc +index 904e4854ba..4f8c955d8c 100644 +--- a/paddle/fluid/platform/device_context.cc ++++ b/paddle/fluid/platform/device_context.cc +@@ -466,15 +466,15 @@ CUDAContext::CUDAContext(const CUDAPlace& place, + place_ = place; + CUDADeviceGuard guard(place_.device); + stream_.reset(new stream::CUDAStream(place, priority, flag)); +- InitEigenContext(); +- InitCuBlasContext(); +- InitCuDNNContext(); ++ //InitEigenContext(); ++ //InitCuBlasContext(); ++ //InitCuDNNContext(); + #ifndef PADDLE_WITH_HIP + #if CUDA_VERSION >= 11060 +- InitCuBlasLtContext(); ++ //InitCuBlasLtContext(); + #endif +- InitCuSparseContext(); +- InitCuSolverContext(); ++ //InitCuSparseContext(); ++ //InitCuSolverContext(); + #endif + } + +@@ -506,14 +506,14 @@ void CUDAContext::SetStream(gpuStream_t stream) { + + CUDAContext::~CUDAContext() { + CUDADeviceGuard guard(place_.device); +- DestoryCuDNNContext(); +- DestoryCuBlasContext(); ++ //DestoryCuDNNContext(); ++ //DestoryCuBlasContext(); + #ifndef PADDLE_WITH_HIP + #if CUDA_VERSION >= 11060 +- InitCuBlasLtContext(); ++ //InitCuBlasLtContext(); + #endif +- DestoryCuSparseContext(); +- DestoryCuSolverContext(); ++ //DestoryCuSparseContext(); ++ //DestoryCuSolverContext(); + #endif + } + diff --git a/tools/infer_prune_patches/jitcode.h.patch b/tools/infer_prune_patches/jitcode.h.patch new file mode 100644 index 0000000000000..9022b459db51c --- /dev/null +++ b/tools/infer_prune_patches/jitcode.h.patch @@ -0,0 +1,15 @@ +diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h +index 23650c8efc..24466e4327 100644 +--- a/paddle/fluid/operators/jit/gen/jitcode.h ++++ b/paddle/fluid/operators/jit/gen/jitcode.h +@@ -97,8 +97,8 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator { + } + ret(); + } +- void L(const char* label) { Xbyak::CodeGenerator::L(label); } +- void L(Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); } // NOLINT ++ void L(const char* label) { } ++ void L(Xbyak::Label& label) { } // NOLINT + // Enhanced vector extension + Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, int offt, + bool bcast = false) { diff --git a/tools/infer_prune_patches/op_registry.h.patch b/tools/infer_prune_patches/op_registry.h.patch new file mode 100644 index 0000000000000..a1d2a66347cc4 --- /dev/null +++ b/tools/infer_prune_patches/op_registry.h.patch @@ -0,0 +1,215 @@ +diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h +index a1f07f9f25..179df3b981 100644 +--- a/paddle/fluid/framework/op_registry.h ++++ b/paddle/fluid/framework/op_registry.h +@@ -178,9 +178,8 @@ struct OpKernelRegistrarFunctor { + RegisterKernelClass( + op_type, library_type, customized_type_value, + +- [op_type](const framework::ExecutionContext& ctx) { ++ [](const framework::ExecutionContext& ctx) { + KERNEL_TYPE().Compute(ctx); +- CheckKernelLaunch(op_type); + }); + constexpr auto size = std::tuple_size>::value; + 
OpKernelRegistrarFunctor +@@ -240,13 +239,8 @@ struct OpKernelRegistrarFunctorEx( +- op_type, library_type, customized_type_value, +- +- [op_type](const framework::ExecutionContext& ctx) { +- Functor()(ctx); +- CheckKernelLaunch(op_type); +- }); ++ RegisterKernelClass(op_type, library_type, ++ customized_type_value, Functor()); + + constexpr auto size = + std::tuple_size>::value; +@@ -275,7 +269,7 @@ struct OpKernelRegistrarFunctorEx, \ + paddle::framework::EmptyGradOpMaker) + ++#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, ...) + /** + * Macro to register OperatorKernel. + */ + #define REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(op_type, library_type, \ ++ place_class, customized_name, \ ++ customized_type_value, ...) ++ ++#define REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE__(op_type, library_type, \ + place_class, customized_name, \ + customized_type_value, ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ +@@ -311,18 +312,22 @@ struct OpKernelRegistrarFunctorEx CreateTensorRTPredictor( ++ const AnalysisConfig& config); ++} ++ + namespace paddle_infer { + + using PrecisionType = paddle::AnalysisConfig::Precision; diff --git a/tools/infer_prune_patches/phi_cmake.patch b/tools/infer_prune_patches/phi_cmake.patch new file mode 100644 index 0000000000000..2eba0e0c14c6a --- /dev/null +++ b/tools/infer_prune_patches/phi_cmake.patch @@ -0,0 +1,13 @@ +diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt +index 58ad42ddd1..8ffdafcf0d 100644 +--- a/paddle/phi/CMakeLists.txt ++++ b/paddle/phi/CMakeLists.txt +@@ -18,7 +18,7 @@ add_subdirectory(infermeta) + # phi operator definitions + add_subdirectory(ops) + # phi tools +-add_subdirectory(tools) ++#add_subdirectory(tools) + # phi tests + add_subdirectory(tests) + diff --git a/tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch b/tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch new file mode 100644 index 0000000000000..307f12ee97182 --- /dev/null +++ b/tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch @@ -0,0 +1,68 @@ +diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +index 394ce7799e..8edbef50be 100644 +--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc ++++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +@@ -390,6 +390,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( + graph->Has(framework::ir::kEmbEltwiseLayernormPass) && + graph->Has(framework::ir::kMultiheadMatmulPass)); + ++ std::unordered_set param_set(params.begin(), params.end()); + if (use_static_engine) { + trt_engine_serialized_data = GetTrtEngineSerializedData( + Get("model_opt_cache_dir"), engine_key); +@@ -399,6 +400,19 @@ void TensorRtSubgraphPass::CreateTensorRTOp( + LOG(INFO) << "Load TRT Optimized Info from " + << GetTrtEngineSerializedPath( + Get("model_opt_cache_dir"), engine_key); ++ const auto* root_scope{param_scope()}; ++ for (;root_scope->parent();) { ++ root_scope = root_scope->parent(); ++ } ++ for (const auto& name: param_set) { ++ LOG(INFO) << " ===== Clear param: " << name; ++ root_scope->FindLocalVar(name)->Clear(); ++ } ++ for (int dev_id = 0; dev_id < paddle::platform::GetGPUDeviceCount(); ++ ++dev_id) { ++ memory::Release(platform::CUDAPlace(dev_id)); ++ } ++ memory::Release(platform::CPUPlace()); + return; + } + } +@@ -411,12 +425,25 @@ void TensorRtSubgraphPass::CreateTensorRTOp( + + auto *scope = param_scope(); + framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); +- 
std::unordered_set param_set(params.begin(), params.end()); + inference::Singleton::Global() + .ConvertBlockToTRTEngine( + &block_desc_temp, *scope, + std::vector(input_names.begin(), input_names.end()), + param_set, output_mapping, trt_engine); ++ const auto* root_scope{scope}; ++ for (;root_scope->parent();) { ++ root_scope = root_scope->parent(); ++ } ++ VLOG(4) << "root_scope->LocalVarNames().size: " << root_scope->LocalVarNames().size(); ++ for (const auto& name: param_set) { ++ VLOG(4) << " ===== Clear param: " << name; ++ root_scope->FindLocalVar(name)->Clear(); ++ } ++ for (int dev_id = 0; dev_id < paddle::platform::GetGPUDeviceCount(); ++ ++dev_id) { ++ memory::Release(platform::CUDAPlace(dev_id)); ++ } ++ memory::Release(platform::CPUPlace()); + + if (use_static_engine) { + nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize(); +@@ -431,6 +458,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp( + << GetTrtEngineSerializedPath( + Get("model_opt_cache_dir"), engine_key); + } ++ trt_engine_serialized_data.clear(); ++ trt_engine_serialized_data.shrink_to_fit(); + } + + } // namespace analysis diff --git a/tools/infer_prune_patches/thread_local_allocator.cc.patch b/tools/infer_prune_patches/thread_local_allocator.cc.patch new file mode 100644 index 0000000000000..6a4486aae9457 --- /dev/null +++ b/tools/infer_prune_patches/thread_local_allocator.cc.patch @@ -0,0 +1,95 @@ +diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc +index f125670a59..f858a30301 100644 +--- a/paddle/fluid/memory/allocation/thread_local_allocator.cc ++++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc +@@ -13,18 +13,62 @@ + // limitations under the License. + + #include "paddle/fluid/memory/allocation/thread_local_allocator.h" ++#include "paddle/fluid/platform/cuda_device_guard.h" + + namespace paddle { + namespace memory { + namespace allocation { + ++const int MALLOC_ALIGN = 64; ++ ++#define CUDA_CALL(func) \ ++ { \ ++ auto e = (func); \ ++ CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ ++ << "CUDA: " << cudaGetErrorString(e); \ ++ } ++ ++void* DirectAllocator::Alloc(size_t unaligned_size) { ++ if (platform::is_cpu_place(place_)) { ++ size_t offset = sizeof(void*) + MALLOC_ALIGN - 1; ++ char* p = static_cast(std::malloc(offset + unaligned_size)); ++ // Memory checking ++ CHECK(p) << "Error occurred in malloc period: available space is not enough " ++ "for mallocing " ++ << unaligned_size << " bytes."; ++ // Byte alignment ++ void* r = reinterpret_cast(reinterpret_cast(p + offset) & ++ (~(MALLOC_ALIGN - 1))); ++ static_cast(r)[-1] = p; ++ return r; ++ } else if (platform::is_gpu_place(place_)) { ++ int dev_id = place_.GetDeviceId(); ++ platform::CUDADeviceGuard guard(dev_id); ++ void* ptr{}; ++ CUDA_CALL(cudaMalloc(&ptr, unaligned_size)); ++ return ptr; ++ } ++ return nullptr; ++} ++ ++void DirectAllocator::Free(void* ptr) { ++ if (platform::is_cpu_place(place_)) { ++ if (ptr) { ++ std::free(static_cast(ptr)[-1]); ++ } ++ } else if (platform::is_gpu_place(place_)) { ++ int dev_id = place_.GetDeviceId(); ++ platform::CUDADeviceGuard guard(dev_id); ++ CUDA_CALL(cudaFree(ptr)); ++ } ++} ++ ++ ++ + ThreadLocalAllocatorImpl::ThreadLocalAllocatorImpl(const platform::Place& p) + : place_(p) { + if (platform::is_gpu_place(place_)) { +- buddy_allocator_.reset(new memory::detail::BuddyAllocator( +- std::unique_ptr( +- new memory::detail::GPUAllocator(place_.device)), +- platform::GpuMinChunkSize(), 
platform::GpuMaxChunkSize())); ++ direct_allocator_.reset(new DirectAllocator{place_}); + } else { + PADDLE_THROW(platform::errors::Unavailable( + "Thread local allocator only supports CUDAPlace now.")); +@@ -59,7 +103,7 @@ ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool() + + ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) { + VLOG(10) << "ThreadLocalAllocatorImpl::AllocateImpl " << size; +- void* ptr = buddy_allocator_->Alloc(size); ++ void* ptr = direct_allocator_->Alloc(size); + auto* tl_allocation = new ThreadLocalAllocation(ptr, size, place_); + tl_allocation->SetThreadLocalAllocatorImpl(shared_from_this()); + return tl_allocation; +@@ -67,12 +111,12 @@ ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) { + + void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) { + VLOG(10) << "ThreadLocalAllocatorImpl::FreeImpl " << allocation; +- buddy_allocator_->Free(allocation->ptr()); ++ direct_allocator_->Free(allocation->ptr()); + delete allocation; + } + + uint64_t ThreadLocalAllocatorImpl::ReleaseImpl() { +- return buddy_allocator_->Release(); ++ return direct_allocator_->Release(); + } + + } // namespace allocation diff --git a/tools/infer_prune_patches/thread_local_allocator.h.patch b/tools/infer_prune_patches/thread_local_allocator.h.patch new file mode 100644 index 0000000000000..a3c24178d2093 --- /dev/null +++ b/tools/infer_prune_patches/thread_local_allocator.h.patch @@ -0,0 +1,30 @@ +diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h +index 654fb3fe7b..44c5dbf87f 100644 +--- a/paddle/fluid/memory/allocation/thread_local_allocator.h ++++ b/paddle/fluid/memory/allocation/thread_local_allocator.h +@@ -26,6 +26,16 @@ namespace paddle { + namespace memory { + namespace allocation { + ++class DirectAllocator { ++public: ++ DirectAllocator(const platform::Place& place) : place_{place} {} ++ void* Alloc(size_t unaligned_size); ++ void Free(void* ptr); ++ uint64_t Release() { return 0;} ++private: ++ platform::Place place_; ++}; ++ + class ThreadLocalAllocatorImpl; + + class ThreadLocalAllocation : public Allocation { +@@ -55,7 +65,7 @@ class ThreadLocalAllocatorImpl + uint64_t ReleaseImpl(); + + private: +- std::unique_ptr buddy_allocator_; ++ std::unique_ptr direct_allocator_; + platform::Place place_; + }; + diff --git a/tools/prune_for_jetson.py b/tools/prune_for_jetson.py new file mode 100644 index 0000000000000..d53b21d6c3723 --- /dev/null +++ b/tools/prune_for_jetson.py @@ -0,0 +1,190 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This script simply removes all grad ops and kernels. You should use this script +when cmake ON_INFER=ON, which can greatly reduce the volume of the prediction library. 
+""" + +import os +import sys +import re +import glob +import io + + +def find_type_files(cur_dir, file_type, file_list=[]): + next_level_dirs = os.listdir(cur_dir) + for next_level_name in next_level_dirs: + next_level_dir = os.path.join(cur_dir, next_level_name) + if os.path.isfile(next_level_dir): + if os.path.splitext(next_level_dir)[1] == file_type: + file_list.append(next_level_dir) + elif os.path.isdir(next_level_dir): + find_type_files(next_level_dir, file_type, file_list) + return file_list + + +def find_kernel(content, pattern): + res = re.findall(pattern, content, flags=re.DOTALL) + ret = [] + for p in res: + left, right = 0, 0 + for c in p: + if c == '{': + left += 1 + elif c == '}': + right += 1 + + if left == right: + ret.append(p) + + return ret, len(ret) + + +def prune_phi_kernels(): + tool_dir = os.path.dirname(os.path.abspath(__file__)) + if sys.version_info[0] == 3: + all_op = glob.glob(os.path.join(tool_dir, + '../paddle/phi/kernels/**/*.cc'), + recursive=True) + all_op += glob.glob(os.path.join(tool_dir, + '../paddle/phi/kernels/**/*.cu'), + recursive=True) + elif sys.version_info[0] == 2: + all_op = find_type_files( + os.path.join(tool_dir, '../paddle/phi/kernels/'), '.cc') + all_op = find_type_files( + os.path.join(tool_dir, '../paddle/phi/kernels/'), '.cu', all_op) + + register_op_count = 0 + for op_file in all_op: + need_continue = False + file_blacklist = [ + "kernels/empty_kernel.cc", "/cast_kernel.c", "/batch_norm_kernel.c" + ] + for bname in file_blacklist: + if op_file.find(bname) >= 0: + need_continue = True + break + + if need_continue: + print("continue:", op_file) + continue + + op_name = os.path.split(op_file)[1] + all_matches = [] + with io.open(op_file, 'r', encoding='utf-8') as f: + content = ''.join(f.readlines()) + op_pattern = 'PD_REGISTER_KERNEL\(.*?\).*?\{.*?\}' + op, op_count = find_kernel(content, op_pattern) + register_op_count += op_count + all_matches.extend(op) + + for p in all_matches: + content = content.replace(p, '') + + with io.open(op_file, 'w', encoding='utf-8') as f: + f.write(u'{}'.format(content)) + + print('We erase all grad op and kernel for Paddle-Inference lib.') + print('%50s%10s' % ('type', 'count')) + print('%50s%10s' % ('REGISTER_OPERATOR', register_op_count)) + return True + + +def apply_patches(): + work_path = os.path.dirname(os.path.abspath(__file__)) + "/../" + ret = os.system( + "cd %s && rm -f paddle/fluid/inference/api/tensorrt_predictor.* " + " && rm -f paddle/fluid/inference/api/paddle_tensorrt_predictor.h " + " && git apply tools/infer_prune_patches/*.patch && cd -" % work_path) + return ret == 0 + + +def append_fluid_kernels(): + op_white_list = ["load", "load_combine"] + + #1. 
add to makefile + file_name = os.path.dirname(os.path.abspath(__file__)) \ + + "/../paddle/fluid/inference/tensorrt/CMakeLists.txt" + append_str = "\nfile(APPEND ${pybind_file} \"USE_NO_KERNEL_OP__(tensorrt_engine);\\n\")\n" + for op in op_white_list: + append_str = append_str + "file(APPEND ${pybind_file} \"USE_OP__(%s);\\n\")\n" % op + + with io.open(file_name, 'r', encoding='utf-8') as f: + content = ''.join(f.readlines()) + + location_str = "nv_library(\n tensorrt_op_teller\n SRCS op_teller.cc\n DEPS framework_proto device_context boost)" + new_content = content.replace(location_str, location_str + append_str) + + if new_content == content: + print("ERROR: can not find \"%s\" in file \"%s\"" % + (location_str, file_name)) + return False + + with io.open(file_name, 'w', encoding='utf-8') as f: + f.write(u'{}'.format(new_content)) + + #2. add op and kernel register + op_white_list.append("tensorrt_engine") + tool_dir = os.path.dirname(os.path.abspath(__file__)) + if sys.version_info[0] == 3: + all_op = glob.glob(os.path.join(tool_dir, + '../paddle/fluid/operators/**/*.cc'), + recursive=True) + all_op += glob.glob(os.path.join(tool_dir, + '../paddle/fluid/operators/**/*.cu'), + recursive=True) + elif sys.version_info[0] == 2: + all_op = find_type_files( + os.path.join(tool_dir, '../paddle/fluid/operators/'), '.cc') + all_op = find_type_files( + os.path.join(tool_dir, '../paddle/fluid/operators/'), '.cu', all_op) + + for op_file in all_op: + with io.open(op_file, 'r', encoding='utf-8') as f: + content = ''.join(f.readlines()) + + for op in op_white_list: + patterns = { + "REGISTER_OPERATOR": "REGISTER_OPERATOR\(\s*%s\s*," % op, + "REGISTER_OP_CPU_KERNEL": + "REGISTER_OP_CPU_KERNEL\(\s*%s\s*," % op, + "REGISTER_OP_CUDA_KERNEL": + "REGISTER_OP_CUDA_KERNEL\(\s*%s\s*," % op + } + for k, p in patterns.items(): + matches = re.findall(p, content, flags=re.DOTALL) + if len(matches) > 0: + content = content.replace(matches[0], + matches[0].replace(k, k + "__")) + with io.open(op_file, 'w', encoding='utf-8') as f: + f.write(u'{}'.format(content)) + + return True + + +if __name__ == '__main__': + + print("================ step 1: apply patches =======================") + assert (apply_patches()) + print("==============================================================\n") + + print("================ step 2: append fluid op/kernels==============") + assert (append_fluid_kernels()) + print("==============================================================\n") + + print("================ step 3:prune phi kernels ====================") + assert (prune_phi_kernels()) + print("==============================================================\n") diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index 44dc4eac26118..bedd44c06d506 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -85,6 +85,7 @@ disable_win_inference_api_test="^trt_quant_int8_yolov3_r50_test$|\ ^lite_resnet50_test$|\ ^test_trt_dynamic_shape_transformer_prune$|\ ^lite_mul_model_test$|\ +^trt_split_converter_test$|\ ^paddle_infer_api_copy_tensor_tester$" @@ -191,10 +192,6 @@ if [ -f "$PADDLE_ROOT/added_ut" ];then echo "========================================" exit 8; fi - if nvcc --version | grep 11.2; then - echo "Only test added_ut temporarily when running in CI-Windows-inference of CUDA 11.2." - exit 0; - fi fi set -e
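
Note (editor's sketch, not part of the patches above): the brace-balancing filter that prune_for_jetson.py applies in find_kernel() before erasing PD_REGISTER_KERNEL blocks can be exercised standalone; the toy_add registration string below is invented purely for illustration.

    import re

    def balanced_matches(content, pattern):
        # Mirror find_kernel(): keep only matches whose '{' and '}' counts agree,
        # so a half-captured kernel body is never deleted from the source file.
        return [m for m in re.findall(pattern, content, flags=re.DOTALL)
                if m.count('{') == m.count('}')]

    sample = "PD_REGISTER_KERNEL(toy_add, CPU, ALL_LAYOUT, phi::ToyAddKernel, float) {}"
    kernels = balanced_matches(sample, r'PD_REGISTER_KERNEL\(.*?\).*?\{.*?\}')
    print(len(kernels))  # 1 -> prune_phi_kernels() would strip this block from the .cc/.cu file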
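
Likewise, a small sketch of the macro renaming that append_fluid_kernels() performs for whitelisted ops such as load; the one-line REGISTER_OPERATOR string is made up, and the double-underscore variant is assumed (per op_registry.h.patch) to be the form that still expands to a real registration after pruning.

    import re

    content = "REGISTER_OPERATOR(load, ops::LoadOp, ops::LoadOpProtoMaker);"
    match = re.findall(r'REGISTER_OPERATOR\(\s*load\s*,', content, flags=re.DOTALL)[0]
    # Suffix the macro name with "__" for the whitelisted op, leaving all other ops pruned.
    content = content.replace(match, match.replace("REGISTER_OPERATOR", "REGISTER_OPERATOR__"))
    print(content)  # REGISTER_OPERATOR__(load, ...); paired with the USE_OP__(load) line appended to CMake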