Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/develop' into logcumsumexp
Browse files Browse the repository at this point in the history
  • Loading branch information
tiancaishaonvjituizi committed May 9, 2022
2 parents 8c680e6 + d878f97 commit 442bc00
Show file tree
Hide file tree
Showing 296 changed files with 8,135 additions and 7,200 deletions.
1 change: 1 addition & 0 deletions AUTHORS.md
Expand Up @@ -57,6 +57,7 @@
| reyoung | Yang Yu |
| [Sand3r-](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg)| Michal Gallus |
| [sfraczek](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg)| Sylwester Fraczek |
| Silv3S | Slawomir Siwek |
| sneaxiy | Jin-Le Zeng |
| Superjom | Chun-Wei Yan |
| tensor-tang | Jian Tang |
Expand Down
13 changes: 11 additions & 2 deletions CMakeLists.txt
Expand Up @@ -100,7 +100,11 @@ if(APPLE AND WITH_ARM)
endif()

if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
if(WITH_ARM_BRPC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
endif()
endif()

if(WIN32)
Expand Down Expand Up @@ -386,11 +390,16 @@ if(WITH_DISTRIBUTE)
if(LINUX)
set(WITH_GLOO ON CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE)
endif()
if(WITH_ASCEND_CL)
if(WITH_ASCEND_CL AND NOT WITH_ARM_BRPC)
# disable WITH_PSCORE for NPU before include third_party
MESSAGE(WARNING "Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.")
set(WITH_PSCORE OFF CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE)
endif()
if(WITH_ROCM AND HIP_VERSION LESS_EQUAL 40020496)
# TODO(qili93): third-party rocksdb throw Illegal instruction with HIP version 40020496
MESSAGE(WARNING "Disable WITH_PSCORE when HIP_VERSION is less than or equal 40020496. Force WITH_PSCORE=OFF.")
set(WITH_PSCORE OFF CACHE BOOL "Disable WITH_PSCORE when HIP_VERSION is less than or equal 40020496" FORCE)
endif()
endif()

include(third_party) # download, build, install third_party, Contains about 20+ dependencies
Expand Down
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -49,7 +49,7 @@ Now our developers can acquire Tesla V100 online computing resources for free. I
[Click here to learn more](https://github.com/PaddlePaddle/Fleet)


- **High-Performance Inference Engines for Comprehensive Deployment Enviroments**
- **High-Performance Inference Engines for Comprehensive Deployment Environments**

PaddlePaddle is not only compatible with models trained in 3rd party open-source frameworks , but also offers complete inference products for various production scenarios. Our inference product line includes [Paddle Inference](https://paddle-inference.readthedocs.io/en/latest/product_introduction/summary.html): Native inference library for high-performance server and cloud inference; [Paddle Serving](https://github.com/PaddlePaddle/Serving): A service-oriented framework suitable for distributed and pipeline productions; [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite): Ultra-Lightweight inference engine for mobile and IoT environments; [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs): A frontend inference engine for browser and mini-apps. Furthermore, by great amounts of optimization with leading hardware in each scenario, Paddle inference engines outperform most of the other mainstream frameworks.

Expand Down
2 changes: 1 addition & 1 deletion cmake/coverallsGcovJsons.cmake
Expand Up @@ -238,7 +238,7 @@ foreach (GCOV_FILE ${GCOV_FILES})
message("MD5: ${GCOV_SRC_PATH} = ${GCOV_CONTENTS_MD5}")

# Loads the gcov file as a list of lines.
# (We first open the file and replace all occurences of [] with _
# (We first open the file and replace all occurrences of [] with _
# because CMake will fail to parse a line containing unmatched brackets...
# also the \ to escaped \n in macros screws up things.)
# https://public.kitware.com/Bug/view.php?id=15369
Expand Down
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
Expand Up @@ -9,7 +9,7 @@ SET(XPU_RT_LIB_NAME "libxpurt.so")

if(NOT DEFINED XPU_BASE_URL)
SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220411")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220425")
else()
SET(XPU_BASE_URL "${XPU_BASE_URL}")
endif()
Expand Down
4 changes: 4 additions & 0 deletions cmake/flags.cmake
Expand Up @@ -158,6 +158,10 @@ if(WITH_IPU)
)
endif()

if(WITH_ASCEND_CL AND WITH_ARM_BRPC)
set(COMMON_FLAGS ${COMMON_FLAGS} -faligned-new)
endif()

if(NOT APPLE)
if((${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.0) OR (WITH_ROCM))
set(COMMON_FLAGS
Expand Down
29 changes: 29 additions & 0 deletions cmake/hip.cmake
Expand Up @@ -18,6 +18,33 @@ include_directories(${ROCM_PATH}/include)
message(STATUS "HIP version: ${HIP_VERSION}")
message(STATUS "HIP_CLANG_PATH: ${HIP_CLANG_PATH}")

macro(find_hip_version hip_header_file)
file(READ ${hip_header_file} HIP_VERSION_FILE_CONTENTS)

string(REGEX MATCH "define HIP_VERSION_MAJOR +([0-9]+)" HIP_MAJOR_VERSION
"${HIP_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define HIP_VERSION_MAJOR +([0-9]+)" "\\1"
HIP_MAJOR_VERSION "${HIP_MAJOR_VERSION}")
string(REGEX MATCH "define HIP_VERSION_MINOR +([0-9]+)" HIP_MINOR_VERSION
"${HIP_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define HIP_VERSION_MINOR +([0-9]+)" "\\1"
HIP_MINOR_VERSION "${HIP_MINOR_VERSION}")
string(REGEX MATCH "define HIP_VERSION_PATCH +([0-9]+)" HIP_PATCH_VERSION
"${HIP_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define HIP_VERSION_PATCH +([0-9]+)" "\\1"
HIP_PATCH_VERSION "${HIP_PATCH_VERSION}")

if(NOT HIP_MAJOR_VERSION)
set(HIP_VERSION "???")
message(WARNING "Cannot find HIP version in ${HIP_PATH}/include/hip/hip_version.h")
else()
math(EXPR HIP_VERSION "${HIP_MAJOR_VERSION} * 10000000 + ${HIP_MINOR_VERSION} * 100000 + ${HIP_PATCH_VERSION}")
message(STATUS "Current HIP header is ${HIP_PATH}/include/hip/hip_version.h "
"Current HIP version is v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}.${HIP_PATCH_VERSION}. ")
endif()
endmacro()
find_hip_version(${HIP_PATH}/include/hip/hip_version.h)

macro(find_package_and_include PACKAGE_NAME)
find_package("${PACKAGE_NAME}" REQUIRED)
include_directories("${ROCM_PATH}/${PACKAGE_NAME}/include")
Expand Down Expand Up @@ -71,8 +98,10 @@ set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS})
# host linker to link.
list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc)
list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx906)
list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx908)
list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc)
list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906)
list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx908)


if(HIP_COMPILER STREQUAL clang)
Expand Down
2 changes: 1 addition & 1 deletion cmake/inference_lib.cmake
Expand Up @@ -416,7 +416,7 @@ function(version version_file)
endif()
if(WITH_ROCM)
file(APPEND ${version_file}
"HIP version: ${HIP_VERSION}\n"
"HIP version: v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}\n"
"MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n")
endif()
if(WITH_ASCEND_CL)
Expand Down
2 changes: 1 addition & 1 deletion cmake/third_party.cmake
Expand Up @@ -357,7 +357,7 @@ if (WITH_PSCORE)
include(external/libmct) # download, build, install libmct
list(APPEND third_party_deps extern_libmct)

include(external/rocksdb) # download, build, install libmct
include(external/rocksdb) # download, build, install rocksdb
list(APPEND third_party_deps extern_rocksdb)
endif()

Expand Down
106 changes: 104 additions & 2 deletions paddle/fluid/distributed/collective/ProcessGroupHeter.cc 100755 → 100644
Expand Up @@ -13,6 +13,7 @@
// limitations under the License.

#include "paddle/fluid/distributed/collective/ProcessGroupHeter.h"
#include <chrono>
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/api/include/api.h"
Expand All @@ -24,6 +25,8 @@ namespace paddle {
namespace distributed {

using Place = paddle::platform::Place;
int ProcessGroupHeter::send_count = 0;
int ProcessGroupHeter::recv_count = 0;

std::shared_ptr<ProcessGroupHeter::HeterTask> ProcessGroupHeter::CreateTask(
int rank, CommType comm_type, const std::vector<phi::DenseTensor>& inputs) {
Expand All @@ -47,15 +50,19 @@ bool ProcessGroupHeter::HeterTask::Wait(std::chrono::milliseconds timeout) {
ProcessGroupHeter::ProcessGroupHeter(
const std::shared_ptr<Store>& store, int rank, int size,
const platform::Place& place, int gid, int local_rank, int local_size,
int gloo_rank, int gloo_size, bool with_switch, std::string switch_endpoint)
int gloo_rank, int gloo_size, bool with_switch, std::string switch_endpoint,
int src_rank, int dst_rank)
: ProcessGroup(rank, size, place, gid),
store_(store),
local_rank_(local_rank),
local_size_(local_size),
gloo_rank_(gloo_rank),
gloo_size_(gloo_size),
with_switch_(with_switch),
switch_endpoint_(switch_endpoint) {
switch_endpoint_(switch_endpoint),
src_rank_(src_rank),
dst_rank_(dst_rank) {
return;
#if defined(PADDLE_WITH_NCCL)
inner_pg_ = std::make_shared<ProcessGroupNCCL>(store, local_rank, local_size,
place_, IGNORE_ID);
Expand Down Expand Up @@ -246,5 +253,100 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Broadcast(
return CreateTask(rank_, CommType::BROADCAST, in_tensors);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Send(
std::vector<phi::DenseTensor>& in_tensors, int peer) {
#if defined(PADDLE_WITH_NCCL)
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
#endif

PADDLE_ENFORCE_EQ(
in_tensors.size(), 1,
platform::errors::PreconditionNotMet(
"For each send operation, there can only be one tensor to send."));
// Copy Tensor to cpu
auto start = std::chrono::high_resolution_clock::now();
phi::DenseTensor cpu_tensor;
auto& gpu_tensor = in_tensors[0];
framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor);
PADDLE_ENFORCE_EQ(with_switch_, true,
platform::errors::PreconditionNotMet(
"Gloo does not support the send operation."));
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
VLOG(2) << "Time to copy tensor of dims(" << cpu_tensor.dims()
<< ") from gpu to cpu for send " << std::setw(9)
<< " is: " << diff.count() << " s" << std::endl;

// Send to switch
HeterClient* client_ =
HeterClient::GetInstance({switch_endpoint_}, {}, 0).get();
int64_t tensor_size =
cpu_tensor.numel() * framework::DataTypeSize(cpu_tensor.dtype());
std::vector<int64_t> send_size;
send_size.push_back(tensor_size);
auto id = src_rank_ * 10000 + dst_rank_;
std::string tensor_name = std::to_string(gid_) + "_id_" + std::to_string(id) +
std::string("_") + std::to_string(send_count++);
VLOG(2) << "tensor_name:" << tensor_name;
int ret = client_->Send(gid_, {tensor_name}, send_size, cpu_tensor.data(),
tensor_size);
PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
"Send to the switch module error."));
return CreateTask(rank_, CommType::SEND, in_tensors);
}

std::shared_ptr<ProcessGroup::Task> ProcessGroupHeter::Recv(
std::vector<phi::DenseTensor>& out_tensors, int peer) {
#if defined(PADDLE_WITH_NCCL)
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors), true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
#endif

PADDLE_ENFORCE_EQ(
out_tensors.size(), 1,
platform::errors::PreconditionNotMet(
"For each rece operation, there can only be one tensor to receive."));

// Copy Tensor to cpu
phi::DenseTensor cpu_tensor;
auto& gpu_tensor = out_tensors[0];
cpu_tensor.Resize(gpu_tensor.dims());
cpu_tensor.set_layout(gpu_tensor.layout());
cpu_tensor.mutable_data(platform::CPUPlace(), gpu_tensor.dtype());

PADDLE_ENFORCE_EQ(with_switch_, true,
platform::errors::PreconditionNotMet(
"Gloo does not support the send operation."));
// recv from switch
HeterClient* client_ =
HeterClient::GetInstance({switch_endpoint_}, {}, 0).get();
auto id = src_rank_ * 10000 + dst_rank_;
std::string tensor_name = std::to_string(gid_) + "_id_" + std::to_string(id) +
std::string("_") + std::to_string(recv_count++);
VLOG(2) << "tensor_name: " << tensor_name;
auto start = std::chrono::high_resolution_clock::now();
int ret = client_->Recv(
gid_, {tensor_name}, cpu_tensor.data(),
cpu_tensor.numel() * framework::DataTypeSize(cpu_tensor.dtype()));
PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
"receive to the switch module error."));
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
double goodput = cpu_tensor.numel() *
framework::DataTypeSize(cpu_tensor.dtype()) / diff.count();
VLOG(2) << "Goodput: " << goodput << "B/s" << std::endl;
start = std::chrono::high_resolution_clock::now();
framework::TensorCopySync(cpu_tensor, gpu_tensor.place(), &gpu_tensor);
end = std::chrono::high_resolution_clock::now();
diff = end - start;
VLOG(2) << "Time to copy tensor of dims(" << cpu_tensor.dims()
<< ") from gpu to cpu for recv " << std::setw(9)
<< " is: " << diff.count() << " s" << std::endl;
return CreateTask(rank_, CommType::RECV, out_tensors);
}

} // namespace distributed
} // namespace paddle
13 changes: 12 additions & 1 deletion paddle/fluid/distributed/collective/ProcessGroupHeter.h
Expand Up @@ -83,7 +83,8 @@ class ProcessGroupHeter : public ProcessGroup {
ProcessGroupHeter(const std::shared_ptr<Store>& store, int rank, int size,
const platform::Place& place, int gid, int local_rank,
int local_size, int gloo_rank, int gloo_size,
bool with_switch, std::string switch_endpoints);
bool with_switch, std::string switch_endpoints,
int src_rank, int dst_rank);

const std::string GetBackendName() const override {
return std::string(HETER_BACKEND_NAME);
Expand All @@ -97,6 +98,12 @@ class ProcessGroupHeter : public ProcessGroup {
std::vector<phi::DenseTensor>&, std::vector<phi::DenseTensor>&,
const BroadcastOptions& = BroadcastOptions()) override;

std::shared_ptr<ProcessGroup::Task> Send(
std::vector<phi::DenseTensor>& in_tensors, int peer) override;

std::shared_ptr<ProcessGroup::Task> Recv(
std::vector<phi::DenseTensor>& out_tensors, int peer) override;

protected:
virtual std::shared_ptr<ProcessGroupHeter::HeterTask> CreateTask(
int rank, CommType opType, const std::vector<phi::DenseTensor>& inputs);
Expand All @@ -112,6 +119,10 @@ class ProcessGroupHeter : public ProcessGroup {
int gloo_size_;
bool with_switch_;
std::string switch_endpoint_;
int src_rank_;
int dst_rank_;
static int send_count;
static int recv_count;
};

} // namespace distributed
Expand Down

0 comments on commit 442bc00

Please sign in to comment.