
Commit

merge upstream
zkh2016 committed Apr 22, 2022
2 parents 9fc2800 + 8a6456d commit a5cc93f
Showing 106 changed files with 2,824 additions and 681 deletions.
6 changes: 5 additions & 1 deletion cmake/cuda.cmake
@@ -132,7 +132,11 @@ function(select_nvcc_arch_flags out_variable)
   elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
     set(cuda_arch_bin "75")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
-    set(cuda_arch_bin "80")
+    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.1) # CUDA 11.0
+      set(cuda_arch_bin "80")
+    elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.1+
+      set(cuda_arch_bin "80 86")
+    endif()
   elseif(${CUDA_ARCH_NAME} STREQUAL "All")
     set(cuda_arch_bin ${paddle_known_gpu_archs})
   elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
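
Note on the Ampere branch: sm_86 (RTX 30-series) is only targetable from CUDA 11.1 onward, so CUDA 11.0 builds stay at sm_80 while 11.1+ builds emit cubins for both. A minimal host-side sketch (standard CUDA runtime API, not part of this commit) that prints the compute capability the selected arch list must cover:

    // Sketch: report the device's compute capability. An Ampere build needs
    // "80" for sm_80 (e.g. A100) and "86" for RTX 30-series parts, which is
    // why cuda_arch_bin becomes "80 86" on CUDA 11.1+.
    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
      cudaDeviceProp prop;
      if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        std::fprintf(stderr, "no CUDA device found\n");
        return 1;
      }
      std::printf("compute capability: sm_%d%d\n", prop.major, prop.minor);
      return 0;
    }
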
2 changes: 1 addition & 1 deletion cmake/external/cinn.cmake
@@ -26,7 +26,7 @@ add_definitions(-w)
 ######################################
 include(ExternalProject)
 set(CINN_PREFIX_DIR ${THIRD_PARTY_PATH}/CINN)
-set(CINN_GIT_TAG 08d7680dd91dfaa65787969050eb8f1143654f10)
+set(CINN_GIT_TAG release/v0.2)
 set(CINN_OPTIONAL_ARGS -DPY_VERSION=${PY_VERSION}
                        -DWITH_CUDA=${WITH_GPU}
                        -DWITH_CUDNN=${WITH_GPU}
2 changes: 1 addition & 1 deletion cmake/external/lite.cmake
@@ -50,7 +50,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
 
   if(NOT LITE_GIT_TAG)
-    set(LITE_GIT_TAG 4ab64daecc11fbf74fffdc6a4733f388472e7d5d)
+    set(LITE_GIT_TAG 81ef66554099800c143a0feff6e0a491b3b0d12e)
   endif()
 
   if(NOT CUDA_ARCH_NAME)
@@ -100,6 +100,9 @@ def FindParsingFunctionFromAttributeType(atype):
     {}
     tstate = PyEval_SaveThread();
+    // Set Device ID
+    {}
     auto out = {}({});
@@ -118,6 +121,19 @@ def FindParsingFunctionFromAttributeType(atype):
 """
 
+FUNCTION_SET_DEVICE_TEMPLATE = \
+"""
+  {}
+  if (paddle::platform::is_gpu_place(place)) {{
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    phi::backends::gpu::SetDeviceId(place.device);
+    VLOG(1) << "CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId() << " from " << (int)place.device;
+#else
+    PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with GPU if use CUDAPlace."));
+#endif
+  }}
+"""
 
 FUNCTION_NAME_TEMPLATE = \
     "{}{}{}"
@@ -293,14 +309,23 @@ def GeneratePythonCFunction(self):
             "false")
 
         parse_attributes_str = ""
+        expected_place_str = "auto place = egr::Controller::Instance().GetExpectedPlace();\n"
 
         # Generate Python-C Attributes Parsing Logic
         for name, atype, _, pos in orig_forward_attrs_list:
             parsing_function_name = FindParsingFunctionFromAttributeType(atype)
+            # Use the input argument `place` if specified from the Python frontend.
+            if len(expected_place_str
+                   ) != 0 and parsing_function_name == "CastPyArg2Place":
+                expected_place_str = ""
+                assert name == "place", "Only support 'place' as template argument name in FUNCTION_SET_DEVICE_TEMPLATE."
+
             parse_attributes_str += PARSE_PYTHON_C_ARGS_TEMPLATE.format(
                 name, pos, atype, name, parsing_function_name, name,
                 forward_api_name, pos)
 
+        set_device_str = FUNCTION_SET_DEVICE_TEMPLATE.format(expected_place_str)
+
         # Generate Dygraph Function Call Logic
         num_args = len(forward_inputs_position_map.keys()) + len(
             orig_forward_attrs_list)
@@ -326,8 +351,8 @@ def GeneratePythonCFunction(self):
             "pythonc_record_event", forward_api_name, "pybind_imperative_func")
         self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format(
             forward_api_name, pythonc_record_event_str, forward_api_name,
-            get_eager_tensor_str, parse_attributes_str, fwd_function_name,
-            dygraph_function_call_str, return_str)
+            get_eager_tensor_str, parse_attributes_str, set_device_str,
+            fwd_function_name, dygraph_function_call_str, return_str)
 
         # Set prefix of forward_api_name to avoid conflicts
         prefix = self.namespace.strip("::")
@@ -361,8 +386,9 @@ def GeneratePythonCFunction(self):
             self.python_c_function_str += PYTHON_C_FUNCTION_TEMPLATE.format(
                 inplaced_forward_api_name, pythonc_record_event_str,
                 inplaced_forward_api_name, get_eager_tensor_str,
-                parse_attributes_str, inplaced_fwd_function_name,
-                dygraph_function_call_str, return_str)
+                parse_attributes_str, set_device_str,
+                inplaced_fwd_function_name, dygraph_function_call_str,
+                return_str)
 
         # Generate Python-C Function Registration
         self.python_c_function_reg_str += "\n," + PYTHON_C_FUNCTION_REG_TEMPLATE.format(
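
For reference, substituting a non-empty expected_place_str into FUNCTION_SET_DEVICE_TEMPLATE yields generated C++ of roughly the following shape inside each Python-C wrapper. This is a sketch assembled from the two templates in this diff, not verbatim generator output; the remaining "{}" are still template slots:

    // Sketch of the per-API generated code around the new set-device block.
    tstate = PyEval_SaveThread();
    // Set Device ID
    auto place = egr::Controller::Instance().GetExpectedPlace();
    if (paddle::platform::is_gpu_place(place)) {
    #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      phi::backends::gpu::SetDeviceId(place.device);
      VLOG(1) << "CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId()
              << " from " << (int)place.device;
    #else
      PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
          "PaddlePaddle should compile with GPU if use CUDAPlace."));
    #endif
    }
    auto out = {}({});  // slots filled with fwd_function_name / dygraph_function_call_str

When an API exposes a `place` attribute parsed via CastPyArg2Place, the loop above clears expected_place_str, so the device comes from the user-supplied argument instead of the expected place.
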
2 changes: 1 addition & 1 deletion paddle/fluid/eager/pylayer/CMakeLists.txt
@@ -1 +1 @@
-cc_library(py_layer_node SRCS py_layer_node.cc DEPS phi phi_api grad_node_info)
+cc_library(py_layer_node SRCS py_layer_node.cc DEPS pybind phi phi_api grad_node_info)
14 changes: 14 additions & 0 deletions paddle/fluid/eager/tensor_wrapper.h
@@ -55,6 +55,20 @@ class TensorWrapper {
     if (full_reserved_) {
       VLOG(6) << "Fully reserved tensor: " << tensor.name();
       intermidiate_tensor_ = tensor;
+      if (no_need_buffer_) {
+        if (phi::DenseTensor::classof(tensor.impl().get())) {
+          // Only Copy Meta
+          phi::DenseTensor* dense_tensor =
+              static_cast<phi::DenseTensor*>(tensor.impl().get());
+          auto tw_dense_tensor =
+              std::make_shared<phi::DenseTensor>(*dense_tensor);
+          tw_dense_tensor->clear();
+          intermidiate_tensor_.set_impl(tw_dense_tensor);
+        } else {
+          PADDLE_THROW(paddle::platform::errors::Fatal(
+              "Unrecognized tensor type for no_need_buffer feature"));
+        }
+      }
       return;
     }
 
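
The no_need_buffer_ branch keeps a DenseTensor whose metadata (dims, dtype, layout) survives but whose allocation is dropped, so backward nodes that only need shape information do not pin the forward buffer in memory. A self-contained sketch of the same copy-meta-then-clear idea with a stand-in tensor type (illustrative only; phi::DenseTensor's real API differs):

    #include <cstdint>
    #include <memory>
    #include <vector>

    // Stand-in tensor: metadata plus an optional payload buffer.
    struct ToyTensor {
      std::vector<int64_t> dims;                // metadata: survives clear()
      std::shared_ptr<std::vector<float>> buf;  // payload: released by clear()
      void clear() { buf.reset(); }
    };

    // Wrap a tensor for backward while dropping its (unneeded) buffer.
    std::shared_ptr<ToyTensor> WrapNoNeedBuffer(const ToyTensor& t) {
      auto copy = std::make_shared<ToyTensor>(t);  // copy meta + buffer ref
      copy->clear();                               // free buffer, keep meta
      return copy;                                 // shape intact, no memory pinned
    }
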
37 changes: 33 additions & 4 deletions paddle/fluid/framework/fleet/heter_ps/hashtable.h
@@ -41,6 +41,10 @@ limitations under the License. */
 #include "xpu/kernel/simd.h"
 #endif
 
+#if defined(PADDLE_WITH_XPU_KP)
+#include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h"
+#endif
+
 namespace paddle {
 namespace framework {
 
@@ -56,11 +60,10 @@ class TableContainer
       capacity, ValType()) {}
 };
 #elif defined(PADDLE_WITH_XPU_KP)
-
 template <typename KeyType, typename ValType>
 class XPUCacheArray {
  public:
-  explicit XPUCacheArray(size_t capacity) : capacity_(capacity), size_(0) {
+  explicit XPUCacheArray(long long capacity) : capacity_(capacity), size_(0) {
     xpu_malloc(reinterpret_cast<void**>(&keys), capacity_ * sizeof(KeyType));
     xpu_malloc(reinterpret_cast<void**>(&vals), capacity_ * sizeof(ValType));
   }
@@ -71,8 +74,27 @@ class XPUCacheArray {
   }
 
   void print() {}
-  // ValType* find(const KeyType& key) { return NULL; }
-  // bool insert(const KeyType& key, const ValType& val) { return true; }
+
+#if defined(__xpu__)
+  __device__ ValType* find(const KeyType& key) {
+    for (int i = 0; i < size_; i++) {
+      if (keys[i] == key) return &vals[i];
+    }
+    return NULL;
+  }
+  __device__ bool insert(const KeyType& key, const ValType& val) {
+    // NOTE(zhangminxu): we set the capacity larger than the feasign number
+    // of one batch
+    if (size_ == capacity_) {
+      return false;
+    } else {
+      keys[size_] = key;
+      vals[size_] = val;
+      size_++;
+      return true;
+    }
+  }
+#endif
 
   int prefetch(const int dev_id, XPUStream stream = NULL) { return 0; }
   size_t size() { return size_; }
@@ -110,6 +132,11 @@ class HashTable {
 
   void show();
 
+#if defined(PADDLE_WITH_XPU_KP)
+  void set_sparse_sgd(const OptimizerConfig& optimizer_config);
+  void set_embedx_sgd(const OptimizerConfig& optimizer_config);
+#endif
+
   template <typename StreamType>
   void dump_to_cpu(int devid, StreamType stream);
 
@@ -151,6 +178,8 @@ class HashTable {
   TableContainer<KeyType, ValType>* container_;
 #elif defined(PADDLE_WITH_XPU_KP)
   XPUCacheArray<KeyType, ValType>* container_;
+  OptimizerConfig* xpu_optimizer_config_;
+  OptimizerConfig cpu_optimizer_config_;
 #endif
   int BLOCK_SIZE_{256};
   float LOAD_FACTOR{0.75f};
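
XPUCacheArray is a flat array rather than a true hash table: find() is a linear scan and insert() appends until capacity is reached, which is why the NOTE above sizes the capacity beyond the per-batch feasign count. A host-side C++ sketch of the same structure, without the XPU specifics (xpu_malloc, __device__), for illustration:

    #include <cstddef>
    #include <vector>

    // Host-side sketch of XPUCacheArray's logic (illustrative; the real class
    // keeps its arrays in device memory and runs these methods on the XPU).
    template <typename KeyType, typename ValType>
    class FlatKvArray {
     public:
      explicit FlatKvArray(long long capacity) : capacity_(capacity) {
        keys_.reserve(capacity);
        vals_.reserve(capacity);
      }

      // Linear scan, O(size): acceptable only for small per-batch tables.
      ValType* find(const KeyType& key) {
        for (size_t i = 0; i < keys_.size(); i++) {
          if (keys_[i] == key) return &vals_[i];
        }
        return nullptr;
      }

      // Append-only insert; fails once full, so callers must size capacity
      // above the number of distinct keys (feasigns) in a batch.
      bool insert(const KeyType& key, const ValType& val) {
        if (static_cast<long long>(keys_.size()) == capacity_) return false;
        keys_.push_back(key);
        vals_.push_back(val);
        return true;
      }

      size_t size() const { return keys_.size(); }

     private:
      long long capacity_;
      std::vector<KeyType> keys_;
      std::vector<ValType> vals_;
    };
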
