
Commit

merge upstream
zkh2016 committed Apr 22, 2022
2 parents 9fc2800 + 8a6456d commit a5cc93f
Showing 106 changed files with 2,824 additions and 681 deletions.
6 changes: 5 additions & 1 deletion cmake/cuda.cmake
@@ -132,7 +132,11 @@ function(select_nvcc_arch_flags out_variable)
   elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
     set(cuda_arch_bin "75")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
-    set(cuda_arch_bin "80")
+    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.1) # CUDA 11.0
+      set(cuda_arch_bin "80")
+    elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.1+
+      set(cuda_arch_bin "80 86")
+    endif()
   elseif(${CUDA_ARCH_NAME} STREQUAL "All")
     set(cuda_arch_bin ${paddle_known_gpu_archs})
   elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
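
Note on the Ampere branch: sm_86 (RTX 30-series) is only targetable from CUDA 11.1 onward, so CUDA 11.0 builds stay at sm_80 while 11.1+ builds emit cubins for both. A minimal host-side sketch (standard CUDA runtime API, not part of this commit) that prints the compute capability the selected arch list must cover:

    // Sketch: report the device's compute capability. An Ampere build needs
    // "80" for sm_80 (e.g. A100) and "86" for RTX 30-series parts, which is
    // why cuda_arch_bin becomes "80 86" on CUDA 11.1+.
    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
      cudaDeviceProp prop;
      if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        std::fprintf(stderr, "no CUDA device found\n");
        return 1;
      }
      std::printf("compute capability: sm_%d%d\n", prop.major, prop.minor);
      return 0;
    }
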
2 changes: 1 addition & 1 deletion cmake/external/cinn.cmake
@@ -26,7 +26,7 @@ add_definitions(-w)
 ######################################
 include(ExternalProject)
 set(CINN_PREFIX_DIR ${THIRD_PARTY_PATH}/CINN)
-set(CINN_GIT_TAG 08d7680dd91dfaa65787969050eb8f1143654f10)
+set(CINN_GIT_TAG release/v0.2)
 set(CINN_OPTIONAL_ARGS -DPY_VERSION=${PY_VERSION}
                        -DWITH_CUDA=${WITH_GPU}
                        -DWITH_CUDNN=${WITH_GPU}
2 changes: 1 addition & 1 deletion cmake/external/lite.cmake
@@ -50,7 +50,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
 
   if(NOT LITE_GIT_TAG)
-    set(LITE_GIT_TAG 4ab64daecc11fbf74fffdc6a4733f388472e7d5d)
+    set(LITE_GIT_TAG 81ef66554099800c143a0feff6e0a491b3b0d12e)
   endif()
 
   if(NOT CUDA_ARCH_NAME)
@@ -100,6 +100,9 @@ def FindParsingFunctionFromAttributeType(atype):
     {}
     tstate = PyEval_SaveThread();
+    // Set Device ID
+    {}
     auto out = {}({});
@@ -118,6 +121,19 @@ def FindParsingFunctionFromAttributeType(atype):
 """
 
+FUNCTION_SET_DEVICE_TEMPLATE = \
+"""
+  {}
+  if (paddle::platform::is_gpu_place(place)) {{
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    phi::backends::gpu::SetDeviceId(place.device);
+    VLOG(1) << "CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId() << " from " << (int)place.device;
+#else
+    PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with GPU if use CUDAPlace."));
+#endif
+  }}
+"""
 
 FUNCTION_NAME_TEMPLATE = \
     "{}{}{}"
@@ -293,14 +309,23 @@ def GeneratePythonCFunction(self):
             "false")
 
         parse_attributes_str = ""
+        expected_place_str = "auto place = egr::Controller::Instance().GetExpectedPlace();\n"
 
         # Generate Python-C Attributes Parsing Logic
         for name, atype, _, pos in orig_forward_attrs_list:
             parsing_function_name = FindParsingFunctionFromAttributeType(atype)
+            # Use the input argument `place` if specified from the Python frontend.
+            if len(expected_place_str
+                   ) != 0 and parsing_function_name == "CastPyArg2Place":
+                expected_place_str = ""
+                assert name == "place", "Only support 'place' as template argument name in FUNCTION_SET_DEVICE_TEMPLATE."
+
             parse_attributes_str += PARSE_PYTHON_C_ARGS_TEMPLATE.format(
                 name, pos, atype, name, parsing_function_name, name,
                 forward_api_name, pos)
 
+        set_device_str = FUNCTION_SET_DEVICE_TEMPLATE.format(expected_place_str)
+
         # Generate Dygraph Function Call Logic
         num_args = len(forward_inputs_position_map.keys()) + len(
             orig_forward_attrs_list)
@@ -326,8 +351,8 @@ def GeneratePythonCFunction(self):
             "pythonc_record_event", forward_api_name, "pybind_imperative_func")
         self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format(
             forward_api_name, pythonc_record_event_str, forward_api_name,
-            get_eager_tensor_str, parse_attributes_str, fwd_function_name,
-            dygraph_function_call_str, return_str)
+            get_eager_tensor_str, parse_attributes_str, set_device_str,
+            fwd_function_name, dygraph_function_call_str, return_str)
 
         # Set prefix of forward_api_name to avoid conflicts
         prefix = self.namespace.strip("::")
@@ -361,8 +386,9 @@ def GeneratePythonCFunction(self):
             self.python_c_function_str += PYTHON_C_FUNCTION_TEMPLATE.format(
                 inplaced_forward_api_name, pythonc_record_event_str,
                 inplaced_forward_api_name, get_eager_tensor_str,
-                parse_attributes_str, inplaced_fwd_function_name,
-                dygraph_function_call_str, return_str)
+                parse_attributes_str, set_device_str,
+                inplaced_fwd_function_name, dygraph_function_call_str,
+                return_str)
 
         # Generate Python-C Function Registration
         self.python_c_function_reg_str += "\n," + PYTHON_C_FUNCTION_REG_TEMPLATE.format(
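
For reference, substituting a non-empty expected_place_str into FUNCTION_SET_DEVICE_TEMPLATE yields generated C++ of roughly the following shape inside each Python-C wrapper. This is a sketch assembled from the two templates in this diff, not verbatim generator output; the remaining "{}" are still template slots:

    // Sketch of the per-API generated code around the new set-device block.
    tstate = PyEval_SaveThread();
    // Set Device ID
    auto place = egr::Controller::Instance().GetExpectedPlace();
    if (paddle::platform::is_gpu_place(place)) {
    #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      phi::backends::gpu::SetDeviceId(place.device);
      VLOG(1) << "CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId()
              << " from " << (int)place.device;
    #else
      PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
          "PaddlePaddle should compile with GPU if use CUDAPlace."));
    #endif
    }
    auto out = {}({});  // slots filled with fwd_function_name / dygraph_function_call_str

When an API exposes a `place` attribute parsed via CastPyArg2Place, the loop above clears expected_place_str, so the device comes from the user-supplied argument instead of the expected place.
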
2 changes: 1 addition & 1 deletion paddle/fluid/eager/pylayer/CMakeLists.txt
@@ -1 +1 @@
-cc_library(py_layer_node SRCS py_layer_node.cc DEPS phi phi_api grad_node_info)
+cc_library(py_layer_node SRCS py_layer_node.cc DEPS pybind phi phi_api grad_node_info)
14 changes: 14 additions & 0 deletions paddle/fluid/eager/tensor_wrapper.h
@@ -55,6 +55,20 @@ class TensorWrapper {
     if (full_reserved_) {
       VLOG(6) << "Fully reserved tensor: " << tensor.name();
       intermidiate_tensor_ = tensor;
+      if (no_need_buffer_) {
+        if (phi::DenseTensor::classof(tensor.impl().get())) {
+          // Only Copy Meta
+          phi::DenseTensor* dense_tensor =
+              static_cast<phi::DenseTensor*>(tensor.impl().get());
+          auto tw_dense_tensor =
+              std::make_shared<phi::DenseTensor>(*dense_tensor);
+          tw_dense_tensor->clear();
+          intermidiate_tensor_.set_impl(tw_dense_tensor);
+        } else {
+          PADDLE_THROW(paddle::platform::errors::Fatal(
+              "Unrecognized tensor type for no_need_buffer feature"));
+        }
+      }
       return;
     }
 
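
The no_need_buffer_ branch keeps a DenseTensor whose metadata (dims, dtype, layout) survives but whose allocation is dropped, so backward nodes that only need shape information do not pin the forward buffer in memory. A self-contained sketch of the same copy-meta-then-clear idea with a stand-in tensor type (illustrative only; phi::DenseTensor's real API differs):

    #include <cstdint>
    #include <memory>
    #include <vector>

    // Stand-in tensor: metadata plus an optional payload buffer.
    struct ToyTensor {
      std::vector<int64_t> dims;                // metadata: survives clear()
      std::shared_ptr<std::vector<float>> buf;  // payload: released by clear()
      void clear() { buf.reset(); }
    };

    // Wrap a tensor for backward while dropping its (unneeded) buffer.
    std::shared_ptr<ToyTensor> WrapNoNeedBuffer(const ToyTensor& t) {
      auto copy = std::make_shared<ToyTensor>(t);  // copy meta + buffer ref
      copy->clear();                               // free buffer, keep meta
      return copy;                                 // shape intact, no memory pinned
    }
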
37 changes: 33 additions & 4 deletions paddle/fluid/framework/fleet/heter_ps/hashtable.h
@@ -41,6 +41,10 @@ limitations under the License. */
 #include "xpu/kernel/simd.h"
 #endif
 
+#if defined(PADDLE_WITH_XPU_KP)
+#include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h"
+#endif
+
 namespace paddle {
 namespace framework {
 
@@ -56,11 +60,10 @@ class TableContainer
       capacity, ValType()) {}
 };
 #elif defined(PADDLE_WITH_XPU_KP)
-
 template <typename KeyType, typename ValType>
 class XPUCacheArray {
  public:
-  explicit XPUCacheArray(size_t capacity) : capacity_(capacity), size_(0) {
+  explicit XPUCacheArray(long long capacity) : capacity_(capacity), size_(0) {
     xpu_malloc(reinterpret_cast<void**>(&keys), capacity_ * sizeof(KeyType));
     xpu_malloc(reinterpret_cast<void**>(&vals), capacity_ * sizeof(ValType));
   }
@@ -71,8 +74,27 @@ class XPUCacheArray {
   }
 
   void print() {}
-  // ValType* find(const KeyType& key) { return NULL; }
-  // bool insert(const KeyType& key, const ValType& val) { return true; }
+
+#if defined(__xpu__)
+  __device__ ValType* find(const KeyType& key) {
+    for (int i = 0; i < size_; i++) {
+      if (keys[i] == key) return &vals[i];
+    }
+    return NULL;
+  }
+  __device__ bool insert(const KeyType& key, const ValType& val) {
+    // NOTE(zhangminxu): we set the capacity larger than the feasign number
+    // of one batch
+    if (size_ == capacity_) {
+      return false;
+    } else {
+      keys[size_] = key;
+      vals[size_] = val;
+      size_++;
+      return true;
+    }
+  }
+#endif
 
   int prefetch(const int dev_id, XPUStream stream = NULL) { return 0; }
   size_t size() { return size_; }
@@ -110,6 +132,11 @@ class HashTable {
 
   void show();
 
+#if defined(PADDLE_WITH_XPU_KP)
+  void set_sparse_sgd(const OptimizerConfig& optimizer_config);
+  void set_embedx_sgd(const OptimizerConfig& optimizer_config);
+#endif
+
   template <typename StreamType>
   void dump_to_cpu(int devid, StreamType stream);
 
@@ -151,6 +178,8 @@ class HashTable {
   TableContainer<KeyType, ValType>* container_;
 #elif defined(PADDLE_WITH_XPU_KP)
   XPUCacheArray<KeyType, ValType>* container_;
+  OptimizerConfig* xpu_optimizer_config_;
+  OptimizerConfig cpu_optimizer_config_;
 #endif
   int BLOCK_SIZE_{256};
   float LOAD_FACTOR{0.75f};
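
XPUCacheArray is a flat array rather than a true hash table: find() is a linear scan and insert() appends until capacity is reached, which is why the NOTE above sizes the capacity beyond the per-batch feasign count. A host-side C++ sketch of the same structure, without the XPU specifics (xpu_malloc, __device__), for illustration:

    #include <cstddef>
    #include <vector>

    // Host-side sketch of XPUCacheArray's logic (illustrative; the real class
    // keeps its arrays in device memory and runs these methods on the XPU).
    template <typename KeyType, typename ValType>
    class FlatKvArray {
     public:
      explicit FlatKvArray(long long capacity) : capacity_(capacity) {
        keys_.reserve(capacity);
        vals_.reserve(capacity);
      }

      // Linear scan, O(size): acceptable only for small per-batch tables.
      ValType* find(const KeyType& key) {
        for (size_t i = 0; i < keys_.size(); i++) {
          if (keys_[i] == key) return &vals_[i];
        }
        return nullptr;
      }

      // Append-only insert; fails once full, so callers must size capacity
      // above the number of distinct keys (feasigns) in a batch.
      bool insert(const KeyType& key, const ValType& val) {
        if (static_cast<long long>(keys_.size()) == capacity_) return false;
        keys_.push_back(key);
        vals_.push_back(val);
        return true;
      }

      size_t size() const { return keys_.size(); }

     private:
      long long capacity_;
      std::vector<KeyType> keys_;
      std::vector<ValType> vals_;
    };
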
