Merge branch 'develop' into triplet_margin_loss
yangguohao committed Jun 8, 2022
2 parents 234f469 + cab0f2f commit e6aac2b
Showing 133 changed files with 5,042 additions and 1,123 deletions.
5 changes: 1 addition & 4 deletions cmake/external/lite.cmake
@@ -115,7 +115,7 @@ if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
set(LITE_BUILD_COMMAND $(MAKE) publish_inference -j)
set(LITE_OPTIONAL_ARGS
-DWITH_MKL=ON
-DLITE_WITH_CUDA=${WITH_GPU}
-DLITE_WITH_CUDA=OFF
-DWITH_MKLDNN=OFF
-DLITE_WITH_X86=ON
-DLITE_WITH_PROFILE=OFF
@@ -124,9 +124,6 @@ if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
-DWITH_PYTHON=OFF
-DWITH_TESTING=OFF
-DLITE_BUILD_EXTRA=ON
-DCUDNN_ROOT=${CUDNN_ROOT}
-DLITE_WITH_STATIC_CUDA=OFF
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME}
-DLITE_WITH_XPU=${LITE_WITH_XPU}
-DXPU_SDK_URL=${XPU_BASE_URL}
-DXPU_SDK_ENV=${XPU_SDK_ENV}
7 changes: 7 additions & 0 deletions paddle/fluid/distributed/collective/reducer.cc
@@ -775,6 +775,13 @@ void EagerReducer::ProcessUnusedDenseVars() {
continue;
}

// NOTE(haohongxiang): Calling SetFakeEmpty here is to make sure that
// gradient accumulation can continue normally after clear_gradients()
// especially in cases involving complex control flow.
std::static_pointer_cast<egr::GradNodeAccumulation>(
GetGradNodeFromTensor(&tensors_[var_index]))
->SetFakeEmpty(false);

Tensor grad_value(std::make_shared<phi::DenseTensor>(src_tensor));

auto dest_var_base = tensors_[var_index];
40 changes: 28 additions & 12 deletions paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
@@ -838,29 +838,42 @@ void PSGPUWrapper::EndPass() {
std::max(keysize_max, current_task_->device_dim_keys_[i][j].size());
}
}

auto dump_pool_to_cpu_func = [this](int i, int j) {
int thread_num = 8;
auto dump_pool_to_cpu_func = [this, thread_num](int i, int j, int z) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i)));
auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j];
auto& device_keys = this->current_task_->device_dim_keys_[i][j];
size_t len = device_keys.size();
// ====== multi-thread feasign processing: split keys across threads ======
int len_per_thread = len / thread_num;
int remain = len % thread_num;
int left = -1, right = -1;
int real_len = len_per_thread;
if (z < remain) real_len++;
if (z < remain) {
left = z * (len_per_thread + 1);
right = left + real_len;
} else {
left = remain * (len_per_thread + 1) + (z - remain) * len_per_thread;
right = left + real_len;
}
// ====== end of per-thread feasign range computation ======
int mf_dim = this->index_dim_vec_[j];
VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim;
size_t feature_value_size =
TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float)));

char* test_build_values = (char*)malloc(feature_value_size * len);
cudaMemcpy(test_build_values, hbm_pool->mem(), feature_value_size * len,
cudaMemcpyDeviceToHost);

char* test_build_values = (char*)malloc(feature_value_size * real_len);
uint64_t offset = left * feature_value_size;
cudaMemcpy(test_build_values, hbm_pool->mem() + offset,
feature_value_size * real_len, cudaMemcpyDeviceToHost);
CHECK(len == hbm_pool->capacity());
uint64_t unuse_key = std::numeric_limits<uint64_t>::max();
for (size_t i = 0; i < len; ++i) {
for (int i = left; i < right; ++i) {
if (device_keys[i] == unuse_key) {
continue;
}
size_t offset = i * feature_value_size;
FeatureValue* gpu_val = (FeatureValue*)(test_build_values + offset);
size_t local_offset = (i - left) * feature_value_size;
FeatureValue* gpu_val = (FeatureValue*)(test_build_values + local_offset);
#ifdef PADDLE_WITH_PSLIB
auto* downpour_value =
(paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr);
@@ -912,10 +925,13 @@ void PSGPUWrapper::EndPass() {
if (multi_mf_dim_) {
VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_;
size_t device_num = heter_devices_.size();
std::vector<std::thread> threads(device_num * multi_mf_dim_);
std::vector<std::thread> threads(device_num * multi_mf_dim_ * thread_num);
for (size_t i = 0; i < device_num; i++) {
for (int j = 0; j < multi_mf_dim_; j++) {
threads[i + j * device_num] = std::thread(dump_pool_to_cpu_func, i, j);
for (int k = 0; k < thread_num; k++) {
threads[(i + j * device_num) * thread_num + k] =
std::thread(dump_pool_to_cpu_func, i, j, k);
}
}
}
for (std::thread& t : threads) {
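
The EndPass() change above parallelizes dumping each (device, mf_dim) HBM pool back to the CPU: instead of one thread copying all `len` keys, `thread_num` workers each copy only their own [left, right) slice. Below is a minimal, self-contained sketch of that slice arithmetic; it is illustrative only (names such as SliceForWorker are not Paddle code), but it implements the same split in which the first `remain` workers take one extra key.

// Illustrative sketch (not the Paddle source) of the per-worker range split used
// in dump_pool_to_cpu_func above. Workers with z < remain take one extra element,
// so all `len` keys are covered exactly once.
#include <cassert>
#include <cstdio>
#include <utility>

std::pair<int, int> SliceForWorker(int len, int thread_num, int z) {
  int len_per_thread = len / thread_num;
  int remain = len % thread_num;
  int real_len = len_per_thread + (z < remain ? 1 : 0);
  int left = (z < remain)
                 ? z * (len_per_thread + 1)
                 : remain * (len_per_thread + 1) + (z - remain) * len_per_thread;
  return {left, left + real_len};  // half-open range [left, right)
}

int main() {
  const int len = 10, thread_num = 8;
  int covered = 0;
  for (int z = 0; z < thread_num; ++z) {
    auto [left, right] = SliceForWorker(len, thread_num, z);
    covered += right - left;
    std::printf("worker %d: [%d, %d)\n", z, left, right);
  }
  assert(covered == len);  // every key is copied by exactly one worker
  return 0;
}
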
@@ -349,9 +349,9 @@ std::unordered_set<std::string> ComputePropagateScalesMkldnnPass::UpdateScales(
waiting_for_scale.insert(input_name);
waiting_for_scale.insert(output_name);
} else if (in_iter != var_quant_scales->end()) {
out_iter->second = in_iter->second;
(*var_quant_scales)[output_name] = in_iter->second;
} else if (out_iter != var_quant_scales->end()) {
in_iter->second = out_iter->second;
(*var_quant_scales)[input_name] = out_iter->second;
}
} else if (op_name == "scale") {
const std::string output_name = op_node->Op()->Output("Out")[0];
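
The UpdateScales change above replaces assignments through `out_iter` / `in_iter` with direct map indexing. In each branch only one of the two iterators is guaranteed to be valid, so writing through the other (possibly equal to end()) iterator is undefined behavior; indexing the map inserts the missing key instead. A minimal sketch of the pattern follows; the value type is simplified (the real map stores quantization scale data), and the helper name is illustrative.

// Illustrative sketch of the pattern behind the UpdateScales fix above (not the
// Paddle source). operator[] inserts output_name when it is not present yet,
// which is exactly the case the old iterator-based assignment mishandled.
#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>

using ScaleMap = std::unordered_map<std::string, std::vector<float>>;

void PropagateScale(ScaleMap* scales, const std::string& input_name,
                    const std::string& output_name) {
  auto in_iter = scales->find(input_name);
  auto out_iter = scales->find(output_name);
  if (in_iter != scales->end()) {
    // Safe even when output_name does not exist yet.
    (*scales)[output_name] = in_iter->second;
  } else if (out_iter != scales->end()) {
    (*scales)[input_name] = out_iter->second;
  }
}

int main() {
  ScaleMap scales{{"relu_in", {0.5f}}};
  PropagateScale(&scales, "relu_in", "relu_out");  // relu_out did not exist before
  assert(scales.at("relu_out") == std::vector<float>{0.5f});
  return 0;
}
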
@@ -38,7 +38,7 @@ void QuantDequantMkldnnPass::MarkSkipQuantizedOps(
for (auto* node_input : op_node->inputs) {
for (auto* node_input_input : node_input->inputs) {
if (!node_input_input->IsOp()) continue;
if (node_input_input->Name().find("quantize_dequantize") ==
if (node_input_input->Name().find("quantize") ==
std::string::npos) {
is_quantized_op = false;
break;
64 changes: 42 additions & 22 deletions paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc
@@ -235,16 +235,18 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) {
}

PDNode* TrtMultiHeadMatmulPattern::operator()() {
std::unordered_set<std::string> mul_ops{"mul", "matmul_v2"};
std::unordered_set<std::string> matmul_ops{"matmul", "matmul_v2"};
auto* input0 = pattern->NewNode(input0_repr());
input0->assert_is_op_input("mul");
input0->assert_is_ops_input(mul_ops);

// First path with scale
auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_op("mul");
auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_ops(mul_ops);
auto* mul0_w_var = pattern->NewNode(mul0_w_repr())
->AsInput()
->assert_is_op_input("mul", "Y");
->assert_is_ops_input(mul_ops, "Y");
auto* mul0_out_var =
pattern->NewNode(mul0_out_repr())->assert_is_op_output("mul");
pattern->NewNode(mul0_out_repr())->assert_is_ops_output(mul_ops);

decltype(mul0) eltadd0;
decltype(mul0) eltadd0_b_var;
@@ -277,11 +279,12 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() {
auto* scale = pattern->NewNode(scale_repr())->assert_is_op("scale");
auto* scale_out_var =
pattern->NewNode(scale_out_repr())->assert_is_op_output("scale");
scale_out_var->AsIntermediate()->assert_is_op_input("matmul");
scale_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops);

auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul");
auto* matmul_qk =
pattern->NewNode(matmul_qk_repr())->assert_is_ops(matmul_ops);
auto* matmul_qk_out_var =
pattern->NewNode(matmul_qk_out_repr())->assert_is_op_output("matmul");
pattern->NewNode(matmul_qk_out_repr())->assert_is_ops_output(matmul_ops);
matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");

auto* eltadd_qk =
@@ -297,12 +300,12 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() {
pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax");
auto* softmax_qk_out_var =
pattern->NewNode(softmax_qk_out_repr())->assert_is_op_output("softmax");
softmax_qk_out_var->AsIntermediate()->assert_is_op_input("matmul");
softmax_qk_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops);

auto* matmul_qkv =
pattern->NewNode(matmul_qkv_repr())->assert_is_op("matmul");
pattern->NewNode(matmul_qkv_repr())->assert_is_ops(matmul_ops);
auto* matmul_qkv_out_var =
pattern->NewNode(matmul_qkv_out_repr())->assert_is_op_output("matmul");
pattern->NewNode(matmul_qkv_out_repr())->assert_is_ops_output(matmul_ops);
matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2");

auto* transpose2_qkv =
@@ -315,15 +318,15 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() {
pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2");
auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr())
->assert_is_op_output("reshape2");
reshape2_qkv_out_var->assert_is_op_input("mul");
reshape2_qkv_out_var->assert_is_ops_input(mul_ops);

// Second path to matmul
auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_op("mul");
auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_ops(mul_ops);
auto* mul1_w_var = pattern->NewNode(mul1_w_repr())
->AsInput()
->assert_is_op_input("mul", "Y");
->assert_is_ops_input(mul_ops, "Y");
auto* mul1_out_var =
pattern->NewNode(mul1_out_repr())->assert_is_op_output("mul");
pattern->NewNode(mul1_out_repr())->assert_is_ops_output(mul_ops);

decltype(mul1) eltadd1;
decltype(mul1) eltadd1_b_var;
@@ -350,16 +353,16 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() {
pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2");
auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr())
->assert_is_op_output("transpose2");
transpose2_1_out_var->AsIntermediate()->assert_is_op_input(
"matmul"); // link to matmul qk
transpose2_1_out_var->AsIntermediate()->assert_is_ops_input(
matmul_ops); // link to matmul qk

// Third path to matmul
auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_op("mul");
auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_ops(mul_ops);
auto* mul2_w_var = pattern->NewNode(mul2_w_repr())
->AsInput()
->assert_is_op_input("mul", "Y");
->assert_is_ops_input(mul_ops, "Y");
auto* mul2_out_var =
pattern->NewNode(mul2_out_repr())->assert_is_op_output("mul");
pattern->NewNode(mul2_out_repr())->assert_is_ops_output(mul_ops);

decltype(mul2) eltadd2;
decltype(mul2) eltadd2_b_var;
@@ -386,8 +389,8 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() {
pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2");
auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr())
->assert_is_op_output("transpose2");
transpose2_2_out_var->AsIntermediate()->assert_is_op_input(
"matmul"); // link to matmul qkv
transpose2_2_out_var->AsIntermediate()->assert_is_ops_input(
matmul_ops); // link to matmul qkv

// Q path
mul0->LinksFrom({input0, mul0_w_var}).LinksTo({mul0_out_var});
@@ -734,6 +737,23 @@ TrtMultiHeadMatmulV2FusePass::TrtMultiHeadMatmulV2FusePass() {
.IsType<bool>()
.End();

AddOpCompat(OpCompat("matmul_v2"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("trans_x")
.IsType<bool>()
.End()
.AddAttr("trans_y")
.IsType<bool>()
.End();

AddOpCompat(OpCompat("softmax"))
.AddInput("X")
.IsTensor()
@@ -866,7 +886,7 @@ int TrtMultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph,
auto* mul0_op_desc = mul0->Op();

// all mul op has same input.
if (multihead_op_desc.HasAttr("Input_scale")) {
if (mul0_op_desc->HasAttr("Input_scale")) {
multihead_op_desc.SetAttr("Input_scale",
mul0_op_desc->GetAttr("Input_scale"));
}
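
The fuse-pass changes above generalize the multihead pattern from single op types ("mul", "matmul") to op sets that also accept "matmul_v2", via the assert_is_ops* helpers, register a matching matmul_v2 OpCompat entry (trans_x/trans_y), and fix the Input_scale check to read from mul0's own op desc. The essence of the op-set match is plain set membership; the sketch below is illustrative only and does not use the Paddle pattern-detector API.

// Illustrative sketch (not Paddle's GraphPatternDetector): a pattern node that
// accepts any op type in a set, so graphs built with either "mul" or "matmul_v2"
// projections still match the multihead pattern.
#include <cassert>
#include <string>
#include <unordered_set>

bool NodeTypeMatches(const std::string& op_type,
                     const std::unordered_set<std::string>& accepted_ops) {
  return accepted_ops.count(op_type) > 0;
}

int main() {
  const std::unordered_set<std::string> mul_ops{"mul", "matmul_v2"};
  assert(NodeTypeMatches("matmul_v2", mul_ops));   // now eligible for fusion
  assert(!NodeTypeMatches("softmax", mul_ops));    // still rejected
  return 0;
}
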
24 changes: 24 additions & 0 deletions paddle/fluid/inference/api/analysis_config.cc
@@ -100,6 +100,24 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
Update();
}

void AnalysisConfig::SetExecStream(void *stream) {
PADDLE_ENFORCE_NOT_NULL(stream, platform::errors::InvalidArgument(
"`stream` should not be nullptr"));
exec_stream_ = stream;
use_external_stream_ = true;
Update();
}

void *AnalysisConfig::GetExecStream() const {
PADDLE_ENFORCE_NOT_NULL(exec_stream_, platform::errors::InvalidArgument(
"`stream` should not be nullptr"));
return exec_stream_;
}

bool AnalysisConfig::external_stream_enabled() const {
return use_external_stream_;
}

void AnalysisConfig::DisableGpu() {
use_gpu_ = false;

@@ -239,6 +257,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(use_fc_padding_);
// GPU related.
CP_MEMBER(use_gpu_);
CP_MEMBER(use_external_stream_);
CP_MEMBER(exec_stream_);
CP_MEMBER(use_cudnn_);
CP_MEMBER(gpu_device_id_);
CP_MEMBER(memory_pool_init_size_mb_);
@@ -787,6 +807,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
ss << params_file_;

ss << use_gpu_;
ss << use_external_stream_;
ss << exec_stream_;
ss << use_gpu_fp16_;
for (auto &item : gpu_fp16_disabled_op_types_) ss << item;
ss << use_fc_padding_;
@@ -985,6 +1007,8 @@ std::string AnalysisConfig::Summary() {
os.InsertRow({"gpu_device_id", std::to_string(gpu_device_id_)});
os.InsertRow({"memory_pool_init_size",
std::to_string(memory_pool_init_size_mb_) + "MB"});
os.InsertRow(
{"use_external_stream", use_external_stream_ ? "true" : "false"});
os.InsertRow(
{"thread_local_stream", thread_local_stream_ ? "true" : "false"});

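
The analysis_config.cc additions above expose an external execution stream on AnalysisConfig: SetExecStream(void*) stores a caller-owned stream, GetExecStream() returns it, and external_stream_enabled() reports whether one was set; the flag and pointer are also copied in the copy constructor, folded into the serialized info cache, and printed by Summary(). A hedged usage sketch follows; the header path, the SetModel/EnableUseGpu calls, and the overall predictor workflow are assumptions based on the usual Paddle Inference C++ API, while the three stream methods come from this diff.

// Hedged usage sketch for the external-stream API added above. Only SetExecStream,
// GetExecStream, and external_stream_enabled are taken from this diff; the rest of
// the setup is an assumed, typical Paddle Inference configuration.
#include <cuda_runtime.h>
#include "paddle_inference_api.h"  // assumed header for paddle::AnalysisConfig

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);  // the caller owns and manages this stream

  paddle::AnalysisConfig config;
  config.SetModel("model_dir");                     // path is illustrative
  config.EnableUseGpu(/*memory_pool_mb=*/100, /*device_id=*/0);
  config.SetExecStream(stream);                     // run inference on the external stream

  if (config.external_stream_enabled()) {
    void* s = config.GetExecStream();               // returns the pointer passed above
    (void)s;
  }

  // ... create a predictor from `config` and run inference as usual ...
  cudaStreamDestroy(stream);
  return 0;
}
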
