Merge branch 'develop' into move_incubate
DesmonDay committed Aug 26, 2022
2 parents 8e023dc + 1f1a783 commit d64615e
Showing 637 changed files with 18,738 additions and 10,095 deletions.
6 changes: 3 additions & 3 deletions .gitignore
@@ -66,14 +66,14 @@ paddle/infrt/dialect/pd/common/pd_ops_info.h
 paddle/infrt/tests/dialect/Output
 paddle/infrt/tests/lit.cfg.py
 paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc
-paddle/fluid/pybind/eager_final_state_op_function.cc
+paddle/fluid/pybind/eager_op_function.cc

 # these files (directories) are generated before build system generation
 paddle/fluid/operators/generated_op.cc
 paddle/phi/ops/compat/generated_sig.cc
 paddle/phi/api/yaml/parsed_apis/
 python/paddle/utils/code_gen/
-paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h
-paddle/fluid/pybind/eager_final_state_op_function_impl.h
+paddle/fluid/pybind/tmp_eager_op_function_impl.h
+paddle/fluid/pybind/eager_op_function_impl.h
 paddle/fluid/pybind/eager_op_function_impl.h
 paddle/fluid/pybind/op_function_impl.h
2 changes: 1 addition & 1 deletion README.md
@@ -51,7 +51,7 @@ Now our developers can acquire Tesla V100 online computing resources for free. I

 - **High-Performance Inference Engines for Comprehensive Deployment Environments**

-PaddlePaddle is not only compatible with models trained in 3rd party open-source frameworks, but also offers complete inference products for various production scenarios. Our inference product line includes [Paddle Inference](https://paddle-inference.readthedocs.io/en/latest/product_introduction/summary.html): Native inference library for high-performance server and cloud inference; [Paddle Serving](https://github.com/PaddlePaddle/Serving): A service-oriented framework suitable for distributed and pipelined production environments; [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite): Ultra-lightweight inference engine for mobile and IoT environments; [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs): A frontend inference engine for browsers and mini-apps. Furthermore, thanks to extensive optimization for the leading hardware in each scenario, Paddle inference engines outperform most of the other mainstream frameworks.
+PaddlePaddle is not only compatible with models trained in 3rd party open-source frameworks, but also offers complete inference products for various production scenarios. Our inference product line includes [Paddle Inference](https://paddle-inference.readthedocs.io/en/master/guides/introduction/index_intro.html): Native inference library for high-performance server and cloud inference; [Paddle Serving](https://github.com/PaddlePaddle/Serving): A service-oriented framework suitable for distributed and pipelined production environments; [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite): Ultra-lightweight inference engine for mobile and IoT environments; [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs): A frontend inference engine for browsers and mini-apps. Furthermore, thanks to extensive optimization for the leading hardware in each scenario, Paddle inference engines outperform most of the other mainstream frameworks.


 - **Industry-Oriented Models and Libraries with Open Source Repositories**
2 changes: 1 addition & 1 deletion README_cn.md
@@ -49,7 +49,7 @@ PaddlePaddle users can claim **free Tesla V100 online computing resources** to train models

 - **High-performance inference and deployment tools for multiple devices and platforms**

-PaddlePaddle is not only broadly compatible with deploying models trained in third-party open-source frameworks, but also provides complete inference engines for production environments in different scenarios, including [Paddle Inference](https://paddle-inference.readthedocs.io/en/latest/product_introduction/summary.html), a native inference library for high-performance server and cloud inference; [Paddle Serving](https://github.com/PaddlePaddle/Serving), a service-oriented inference framework for distributed and pipelined production environments with advanced features such as automatic cloud deployment and A/B testing; [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite), a lightweight inference engine for mobile and IoT scenarios; and [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs), a frontend inference engine for browsers and mini-apps. Meanwhile, through deep adaptation and optimization for the mainstream hardware of each scenario and support for heterogeneous computing, PaddlePaddle's inference performance also leads the vast majority of mainstream implementations.
+PaddlePaddle is not only broadly compatible with deploying models trained in third-party open-source frameworks, but also provides complete inference engines for production environments in different scenarios, including [Paddle Inference](https://www.paddlepaddle.org.cn/inference/product_introduction/inference_intro.html), a native inference library for high-performance server and cloud inference; [Paddle Serving](https://github.com/PaddlePaddle/Serving), a service-oriented inference framework for distributed and pipelined production environments with advanced features such as automatic cloud deployment and A/B testing; [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite), a lightweight inference engine for mobile and IoT scenarios; and [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs), a frontend inference engine for browsers and mini-apps. Meanwhile, through deep adaptation and optimization for the mainstream hardware of each scenario and support for heterogeneous computing, PaddlePaddle's inference performance also leads the vast majority of mainstream implementations.


 - **Industry-oriented, open-source, industrial-grade model libraries covering multiple domains.**
9 changes: 4 additions & 5 deletions cmake/external/brpc.cmake
@@ -45,9 +45,8 @@ set(prefix_path
 ExternalProject_Add(
   extern_brpc
   ${EXTERNAL_PROJECT_LOG_ARGS}
-  # TODO(gongwb): change to de newst repo when they changed
-  GIT_REPOSITORY "https://github.com/wangjiawei04/brpc"
-  GIT_TAG "e203afb794caf027da0f1e0776443e7d20c0c28e"
+  GIT_REPOSITORY "https://github.com/apache/incubator-brpc"
+  GIT_TAG 1.2.0
   PREFIX ${BRPC_PREFIX_DIR}
   UPDATE_COMMAND ""
   CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@@ -60,8 +59,8 @@ ExternalProject_Add(
              -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
              -DCMAKE_PREFIX_PATH=${prefix_path}
              -DWITH_GLOG=ON
-             -DIOBUF_WITH_HUGE_BLOCK=ON
-             -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA}
+             -DBUILD_BRPC_TOOLS=ON
+             -DBUILD_SHARED_LIBS=ON
              ${EXTERNAL_OPTIONAL_ARGS}
   LIST_SEPARATOR |
   CMAKE_CACHE_ARGS
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
@@ -25,7 +25,7 @@ else()
 endif()

 set(XPU_XCCL_BASE_URL
-    "https://klx-sdk-release-public.su.bcebos.com/xccl/release/1.0.4")
+    "https://klx-sdk-release-public.su.bcebos.com/xccl/release/1.0.0")

 if(WITH_AARCH64)
   set(XPU_XRE_DIR_NAME "xre-kylin_aarch64")
13 changes: 13 additions & 0 deletions paddle/fluid/distributed/collective/CMakeLists.txt
@@ -21,6 +21,12 @@ if(WITH_NCCL OR WITH_RCCL)
     DEPS processgroup place enforce collective_helper device_context
          dense_tensor)
   if(WITH_DISTRIBUTE AND WITH_PSCORE)
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+      set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
+      set_source_files_properties(
+        ProcessGroupHeter.cc PROPERTIES COMPILE_FLAGS
+        ${DISTRIBUTE_COMPILE_FLAGS})
+    endif()
     cc_library(
       processgroup_heter
       SRCS ProcessGroupHeter.cc NCCLTools.cc Common.cc
@@ -40,6 +46,13 @@ if(WITH_ASCEND_CL)
       phi_api
       eager_api)
   if(WITH_DISTRIBUTE AND WITH_PSCORE)
+    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
+      set(DISTRIBUTE_COMPILE_FLAGS "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
+      set_source_files_properties(
+        ProcessGroupHeter.cc PROPERTIES COMPILE_FLAGS
+        ${DISTRIBUTE_COMPILE_FLAGS})
+    endif()
+
     cc_library(
       processgroup_heter
       SRCS ProcessGroupHeter.cc HCCLTools.cc Common.cc
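Both branches now append `-faligned-new` to `DISTRIBUTE_COMPILE_FLAGS` for ProcessGroupHeter.cc when the compiler is newer than GCC 7.0. That flag enables C++17 over-aligned allocation, which any class declaring an alignment stricter than the default `operator new` guarantee needs. A minimal sketch of the kind of code the flag exists for (illustrative only, not taken from this commit):

```cpp
#include <cstdint>
#include <cstdio>

// An over-aligned type: 64-byte alignment exceeds the default
// alignment plain operator new guarantees (typically 16 bytes).
struct alignas(64) CacheLinePadded {
  double payload[4];
};

int main() {
  // With -faligned-new (or -std=c++17), this resolves to
  // operator new(std::size_t, std::align_val_t) and honors alignas(64);
  // in pre-C++17 modes without the flag, GCC warns that the allocation
  // falls back to the alignment-unaware operator new.
  auto* p = new CacheLinePadded;
  std::printf("64-byte aligned: %s\n",
              reinterpret_cast<std::uintptr_t>(p) % 64 == 0 ? "yes" : "no");
  delete p;
  return 0;
}
```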
9 changes: 9 additions & 0 deletions paddle/fluid/distributed/collective/Common.cc
@@ -47,5 +47,14 @@ bool CheckTensorsInCudaPlace(const std::vector<phi::DenseTensor>& tensors) {
   });
 }

+bool CheckTensorsInCustomPlace(const std::vector<phi::DenseTensor>& tensors,
+                               const std::string& dev_type) {
+  return std::all_of(
+      tensors.cbegin(), tensors.cend(), [&](const phi::DenseTensor& t) {
+        return platform::places_are_same_class(
+            t.place(), paddle::platform::CustomPlace(dev_type));
+      });
+}
+
 }  // namespace distributed
 }  // namespace paddle
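The new `CheckTensorsInCustomPlace` mirrors the existing CUDA check: every tensor in the batch must live on a `CustomPlace` of the requested device type, verified with `std::all_of`. A stripped-down standalone version of the same pattern (hypothetical `Tensor`/`Place` stand-ins, not Paddle's real types):

```cpp
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// Hypothetical stand-ins for phi::DenseTensor and platform::CustomPlace.
struct Place { std::string dev_type; };
struct Tensor { Place place; };

// True only if every tensor sits on the given custom device type.
bool CheckTensorsOnDevice(const std::vector<Tensor>& tensors,
                          const std::string& dev_type) {
  return std::all_of(
      tensors.cbegin(), tensors.cend(),
      [&](const Tensor& t) { return t.place.dev_type == dev_type; });
}

int main() {
  std::vector<Tensor> ts = {{{"npu"}}, {{"npu"}}};
  assert(CheckTensorsOnDevice(ts, "npu"));
  ts.push_back({{"cpu"}});
  assert(!CheckTensorsOnDevice(ts, "npu"));
  return 0;
}
```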
3 changes: 3 additions & 0 deletions paddle/fluid/distributed/collective/Common.h
@@ -28,5 +28,8 @@ std::string GetKeyFromPlaces(const std::vector<Place>& places);

 bool CheckTensorsInCudaPlace(const std::vector<phi::DenseTensor>& tensors);

+bool CheckTensorsInCustomPlace(const std::vector<phi::DenseTensor>& tensors,
+                               const std::string& dev_type);
+
 }  // namespace distributed
 }  // namespace paddle
111 changes: 111 additions & 0 deletions paddle/fluid/distributed/collective/ProcessGroupCustom.cc
@@ -207,10 +207,111 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Collective(
   return task;
 }

+std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllGather(
+    std::vector<phi::DenseTensor>& in_tensors,
+    std::vector<phi::DenseTensor>& out_tensors) {
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCustomPlace(in_tensors, device_type_),
+      true,
+      platform::errors::InvalidArgument(
+          "All inputs should be in CustomPlace(%s).", device_type_));
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCustomPlace(out_tensors, device_type_),
+      true,
+      platform::errors::InvalidArgument(
+          "All outputs should be in CustomPlace(%s).", device_type_));
+  return Collective(
+      in_tensors,
+      out_tensors,
+      [&](phi::DenseTensor& input,
+          phi::DenseTensor& output,
+          phi::ccl::CCLComm comm,
+          const phi::stream::Stream& stream) {
+        return phi::DeviceManager::CCLAllGather(
+            device_type_,
+            input.data(),
+            output.data(),
+            input.numel(),
+            phi::ccl::ToCCLDataType(input.dtype()),
+            comm,
+            stream);
+      },
+      CommType::ALLGATHER);
+}
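Every collective added here follows the same shape: enforce that inputs and outputs live on this group's `CustomPlace`, then delegate to `Collective` with a lambda that issues the device-agnostic CCL call on the communicator and stream it receives. A simplified sketch of that dispatch pattern (hypothetical handle types, not Paddle's real signatures):

```cpp
#include <cstddef>
#include <functional>
#include <stdexcept>
#include <vector>

// Hypothetical stand-ins for the CCL communicator and device stream.
struct CCLComm {};
struct Stream {};
struct Tensor { void* data; std::size_t numel; bool on_custom_place; };

using CollectiveFn = std::function<void(Tensor&, Tensor&, CCLComm&, Stream&)>;

// Validate placement, then run the supplied call once per tensor pair.
void Collective(std::vector<Tensor>& in, std::vector<Tensor>& out,
                const CollectiveFn& fn) {
  for (const Tensor& t : in)
    if (!t.on_custom_place) throw std::invalid_argument("input placement");
  CCLComm comm;
  Stream stream;
  for (std::size_t i = 0; i < in.size(); ++i) fn(in[i], out[i], comm, stream);
}
```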

+void* XcclGetPointerByOffset(void* raw_pointer,
+                             size_t offset,
+                             experimental::DataType type) {
+  if (type == experimental::DataType::FLOAT32) {
+    return reinterpret_cast<void*>(reinterpret_cast<float*>(raw_pointer) +
+                                   offset);
+  } else if (type == experimental::DataType::FLOAT64) {
+    return reinterpret_cast<void*>(reinterpret_cast<double*>(raw_pointer) +
+                                   offset);
+  } else if (type == experimental::DataType::INT32) {
+    return reinterpret_cast<void*>(reinterpret_cast<int32_t*>(raw_pointer) +
+                                   offset);
+  } else if (type == experimental::DataType::INT64) {
+    return reinterpret_cast<void*>(reinterpret_cast<int64_t*>(raw_pointer) +
+                                   offset);
+  } else if (type == experimental::DataType::FLOAT16) {
+    return reinterpret_cast<void*>(reinterpret_cast<int16_t*>(raw_pointer) +
+                                   offset);
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "This datatype in xccl is not supported."));
+  }
+  return nullptr;
+}
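`XcclGetPointerByOffset` advances a type-erased buffer by `offset` elements, choosing the pointer-arithmetic stride from the dtype; FLOAT16 reuses `int16_t*` because both types are two bytes wide. The switch is equivalent to a byte-stride computation, sketched below (hypothetical helper, assuming the element widths implied above):

```cpp
#include <cstddef>

// Advance a raw pointer by `offset` elements of `elem_size` bytes each.
// Matches the dtype switch above with elem_size = 4 (float/int32),
// 8 (double/int64), or 2 (float16).
inline void* PointerByOffset(void* raw, std::size_t offset,
                             std::size_t elem_size) {
  return static_cast<void*>(static_cast<char*>(raw) + offset * elem_size);
}
```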

+std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllGather_Partial(
+    std::vector<phi::DenseTensor>& in_tensors,
+    std::vector<phi::DenseTensor>& out_tensors,
+    int offset,
+    int length) {
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCustomPlace(in_tensors, device_type_),
+      true,
+      platform::errors::InvalidArgument(
+          "All inputs should be in CustomPlace(%s).", device_type_));
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCustomPlace(out_tensors, device_type_),
+      true,
+      platform::errors::InvalidArgument(
+          "All outputs should be in CustomPlace(%s).", device_type_));
+  return Collective(
+      in_tensors,
+      out_tensors,
+      [&](phi::DenseTensor& input,
+          phi::DenseTensor& output,
+          phi::ccl::CCLComm comm,
+          const phi::stream::Stream& stream) {
+        return phi::DeviceManager::CCLAllGather(
+            device_type_,
+            XcclGetPointerByOffset(input.data(), offset, input.dtype()),
+            output.data(),
+            length,
+            phi::ccl::ToCCLDataType(input.dtype()),
+            comm,
+            stream);
+      },
+      CommType::ALLGATHER);
+}
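`AllGather_Partial` gathers only a window of each rank's input: `offset` selects the first element and `length` the element count, so a large tensor can be exchanged shard by shard. A caller splitting a tensor evenly across ranks might derive the window like this (hypothetical helper; assumes `numel` divides evenly by the rank count):

```cpp
#include <cstdint>

struct Window { int offset; int length; };

// Even partition: rank r of nranks owns elements [r*len, (r+1)*len).
inline Window PartialWindow(std::int64_t numel, int nranks, int rank) {
  const int len = static_cast<int>(numel / nranks);  // assumed divisible
  return Window{rank * len, len};
}
```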

 std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::AllReduce(
     std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
     std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
     const AllreduceOptions& opts) {
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCustomPlace(in_tensors, device_type_),
+      true,
+      platform::errors::InvalidArgument(
+          "All inputs should be in CustomPlace(%s).", device_type_));
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCustomPlace(out_tensors, device_type_),
+      true,
+      platform::errors::InvalidArgument(
+          "All outputs should be in CustomPlace(%s).", device_type_));
   return Collective(
       in_tensors,
       out_tensors,
@@ -235,6 +336,16 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupCustom::Broadcast(
     std::vector<phi::DenseTensor>& in_tensors,   // NOLINT
     std::vector<phi::DenseTensor>& out_tensors,  // NOLINT
     const BroadcastOptions& opts) {
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCustomPlace(in_tensors, device_type_),
+      true,
+      platform::errors::InvalidArgument(
+          "All inputs should be in CustomPlace(%s).", device_type_));
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInCustomPlace(out_tensors, device_type_),
+      true,
+      platform::errors::InvalidArgument(
+          "All outputs should be in CustomPlace(%s).", device_type_));
   return Collective(
       in_tensors,
       out_tensors,
10 changes: 10 additions & 0 deletions paddle/fluid/distributed/collective/ProcessGroupCustom.h
@@ -73,6 +73,16 @@ class ProcessGroupCustom : public ProcessGroup {
     return "XCCL_" + device_type_;
   }

+  std::shared_ptr<ProcessGroup::Task> AllGather(
+      std::vector<phi::DenseTensor>& in_tensors,
+      std::vector<phi::DenseTensor>& out_tensors) override;
+
+  std::shared_ptr<ProcessGroup::Task> AllGather_Partial(
+      std::vector<phi::DenseTensor>& in_tensors,
+      std::vector<phi::DenseTensor>& out_tensors,
+      int offset,
+      int length) override;
+
   std::shared_ptr<ProcessGroup::Task> AllReduce(
       std::vector<phi::DenseTensor>& in_tensors,
       std::vector<phi::DenseTensor>& out_tensors,
14 changes: 14 additions & 0 deletions paddle/fluid/distributed/fleet_executor/task_node.cc
@@ -41,6 +41,20 @@ TaskNode::TaskNode(paddle::framework::ProgramDesc* program,
   task_id_ = task_node_cnt++;
 }

+TaskNode::TaskNode(paddle::framework::ProgramDesc* program,
+                   int64_t rank,
+                   int64_t task_id,
+                   int64_t max_run_times,
+                   int64_t max_slot_nums)
+    : program_(program),
+      rank_(rank),
+      task_id_(task_id),
+      max_run_times_(max_run_times),
+      max_slot_nums_(max_slot_nums) {
+  // TODO(liyurui): Will be removed when execute program is supported.
+  Init();
+}
+
 TaskNode::TaskNode(paddle::framework::ProgramDesc* program, int64_t rank)
     : program_(program), rank_(rank), task_id_(rank) {
   max_run_times_ = 1;
6 changes: 6 additions & 0 deletions paddle/fluid/distributed/fleet_executor/task_node.h
@@ -55,6 +55,12 @@ class TaskNode final {
            int64_t max_run_times,
            int64_t max_slot_nums);
   TaskNode(paddle::framework::ProgramDesc* program, int64_t rank);
+  // TODO(liyurui): This will be the only constructor for task node
+  TaskNode(paddle::framework::ProgramDesc* program,
+           int64_t task_id,
+           int64_t rank,
+           int64_t max_run_times,
+           int64_t max_slot_nums);
   ~TaskNode() = default;

   void SetProgram(paddle::framework::ProgramDesc* program);
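The added constructor takes an explicit `task_id` rather than drawing one from the shared `task_node_cnt` counter, and calls `Init()` immediately (slated for removal once execute-program is supported, per the TODO). Note that this declaration lists `task_id` before `rank`, while the definition in task_node.cc above lists `rank` before `task_id`; C++ permits parameter names to differ between declaration and definition, so both compile, but callers follow the header, making this an easy source of swapped arguments. A generic illustration of the hazard (illustrative sketch, not from this commit):

```cpp
#include <iostream>

// Declaration (what callers read), names in one order:
void MakeEdge(int src_rank, int dst_rank);

int main() {
  MakeEdge(/*src_rank=*/0, /*dst_rank=*/1);  // caller follows the header
  return 0;
}

// Definition elsewhere with the names swapped -- compiles silently,
// but the body's src_rank is actually the caller's dst_rank:
void MakeEdge(int dst_rank, int src_rank) {
  std::cout << "edge " << src_rank << " -> " << dst_rank << "\n";  // 1 -> 0
}
```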
4 changes: 3 additions & 1 deletion paddle/fluid/distributed/ps/service/heter_client.h
@@ -100,7 +100,9 @@ class HeterClient {
       options.connection_type = "";
       VLOG(4) << "ssl enabled in arm";
 #else
-      options.ssl_options.enable = need_encrypt;
+      if (need_encrypt) {
+        options.mutable_ssl_options();
+      }
 #endif
       client_channels = &peer_switch_channels_;
     } else if (peer_role == PEER_ROLE_IS_WORKER) {
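On non-ARM builds the client no longer writes `options.ssl_options.enable`; newer brpc exposes SSL through a `mutable_ssl_options()` accessor, and a channel stays plaintext until that accessor is first called, so the guard simply skips the call when encryption is not needed. A generic sketch of this enable-on-first-use accessor pattern (simplified stand-in, not brpc's actual definition):

```cpp
#include <memory>

struct ChannelSSLOptions { /* certs, ciphers, verification ... */ };

struct ChannelOptions {
  // SSL stays disabled until someone asks for the options.
  ChannelSSLOptions* mutable_ssl_options() {
    if (!ssl_options_) ssl_options_ = std::make_unique<ChannelSSLOptions>();
    return ssl_options_.get();
  }
  bool has_ssl_options() const { return ssl_options_ != nullptr; }

 private:
  std::unique_ptr<ChannelSSLOptions> ssl_options_;
};
```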
10 changes: 0 additions & 10 deletions paddle/fluid/distributed/ps/service/heter_server.cc
@@ -32,13 +32,8 @@ void HeterServer::StartHeterService(bool neeed_encrypt) {
   server_.AddService(&service_, brpc::SERVER_DOESNT_OWN_SERVICE);
   brpc::ServerOptions options;
   if (neeed_encrypt) {
-#ifdef PADDLE_WITH_ARM_BRPC
     options.mutable_ssl_options()->default_cert.certificate = "/cert.pem";
     options.mutable_ssl_options()->default_cert.private_key = "/key.pem";
-#else
-    options.ssl_options.default_cert.certificate = "/cert.pem";
-    options.ssl_options.default_cert.private_key = "/key.pem";
-#endif
   }
   if (server_.Start(endpoint_.c_str(), &options) != 0) {
     VLOG(0) << "HeterServer start fail. Try again.";
@@ -72,13 +67,8 @@ void HeterServer::StartHeterInterService(bool neeed_encrypt) {
   server_inter_.AddService(&service_, brpc::SERVER_DOESNT_OWN_SERVICE);
   brpc::ServerOptions options;
   if (neeed_encrypt) {
-#ifdef PADDLE_WITH_ARM_BRPC
     options.mutable_ssl_options()->default_cert.certificate = "/cert.pem";
     options.mutable_ssl_options()->default_cert.private_key = "/key.pem";
-#else
-    options.ssl_options.default_cert.certificate = "/cert.pem";
-    options.ssl_options.default_cert.private_key = "/key.pem";
-#endif
   }
   if (server_inter_.Start(endpoint_inter_.c_str(), &options) != 0) {
     VLOG(4) << "switch inter server start fail. Try again.";
12 changes: 7 additions & 5 deletions paddle/fluid/distributed/store/tcp_store.cc
@@ -194,8 +194,8 @@ void MasterDaemon::ProcessCommands(std::vector<struct pollfd>* p_fds) {
                 << " from addr info:" << GetSockName(fds[i].fd);
       }
     } catch (const std::exception& ex) {
-      fds.erase(fds.begin() + i);
       tcputils::close_socket(fds[i].fd);
+      fds.erase(fds.begin() + i);
 #ifdef _WIN32
       _sockets.erase(_sockets.begin() + i - 1);
 #else
@@ -405,12 +405,14 @@ std::vector<uint8_t> TCPStore::get(const std::string& key) {
 void TCPStore::wait(const std::string& key) {
   ReplyType reply;
   VLOG(3) << "TCPStore wait.";
-  do {
-    _client->send_command_for_key(Command::WAIT, _key_prefix + key);
-
-    reply = _client->receive_value<ReplyType>();
-    std::this_thread::sleep_for(std::chrono::milliseconds(500));
-  } while (reply != ReplyType::STOP_WAIT);
+  _client->send_command_for_key(Command::WAIT, _key_prefix + key);
+  reply = _client->receive_value<ReplyType>();
+  while (reply != ReplyType::STOP_WAIT) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(500));
+
+    _client->send_command_for_key(Command::WAIT, _key_prefix + key);
+    reply = _client->receive_value<ReplyType>();
+  }
 }

 TCPStore::~TCPStore() { VLOG(3) << "TCPStore destructure"; }
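Two fixes land in tcp_store.cc. In `ProcessCommands`, the socket is now closed before its pollfd entry is erased; the old order erased first, so `fds[i].fd` named the wrong descriptor. In `TCPStore::wait`, the do-while is unrolled into a send-receive followed by a retry loop, so the 500 ms sleep happens only between retries instead of after every reply, including the final STOP_WAIT. A runnable sketch of the corrected wait shape (hypothetical client stub in place of the real TCP client):

```cpp
#include <chrono>
#include <cstdio>
#include <string>
#include <thread>

enum class ReplyType { WAITING, STOP_WAIT };

// Stub standing in for TCPStore's _client; replies STOP_WAIT on the 3rd poll.
struct Client {
  int polls = 0;
  void send_wait_command(const std::string& /*key*/) {}
  ReplyType receive_reply() {
    return ++polls < 3 ? ReplyType::WAITING : ReplyType::STOP_WAIT;
  }
};

// Send once and read the reply; sleep only before each retry.
void WaitForKey(Client& client, const std::string& key) {
  client.send_wait_command(key);
  ReplyType reply = client.receive_reply();
  while (reply != ReplyType::STOP_WAIT) {
    std::this_thread::sleep_for(std::chrono::milliseconds(500));
    client.send_wait_command(key);
    reply = client.receive_reply();
  }
}

int main() {
  Client c;
  WaitForKey(c, "barrier/0");
  std::printf("done after %d polls\n", c.polls);
  return 0;
}
```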
2 changes: 1 addition & 1 deletion paddle/fluid/eager/CMakeLists.txt
@@ -41,7 +41,7 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
     grad_tensor_holder
     SRCS grad_tensor_holder.cc
     DEPS grad_node_info gradient_accumulator)
-  add_dependencies(grad_tensor_holder eager_final_state_codegen)
+  add_dependencies(grad_tensor_holder eager_codegen)
   cc_library(
     backward
     SRCS backward.cc
@@ -8,5 +8,5 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER))
     final_dygraph_node
     SRCS nodes.cc ${eager_manual_nodes}
     DEPS ${eager_deps})
-  add_dependencies(final_dygraph_node eager_final_state_codegen)
+  add_dependencies(final_dygraph_node eager_codegen)
 endif()
@@ -8,5 +8,5 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER))
     final_dygraph_function
     SRCS dygraph_functions.cc ${eager_manual_functions}
     DEPS ${eager_deps})
-  add_dependencies(final_dygraph_function eager_final_state_codegen)
+  add_dependencies(final_dygraph_function eager_codegen)
 endif()
