diff --git a/CMakeLists.txt b/CMakeLists.txt index ff49ba164dd7f..51c0ef35f1efa 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -395,6 +395,11 @@ if(WITH_DISTRIBUTE) MESSAGE(WARNING "Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.") set(WITH_PSCORE OFF CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE) endif() + if(WITH_ROCM AND HIP_VERSION LESS_EQUAL 40020496) + # TODO(qili93): third-party rocksdb throws Illegal instruction with HIP version 40020496 + MESSAGE(WARNING "Disable WITH_PSCORE when HIP_VERSION is less than or equal to 40020496. Force WITH_PSCORE=OFF.") + set(WITH_PSCORE OFF CACHE BOOL "Disable WITH_PSCORE when HIP_VERSION is less than or equal to 40020496" FORCE) + endif() endif() include(third_party) # download, build, install third_party, Contains about 20+ dependencies diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 514f5ea9deaa3..14cb9e6f6be5a 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -18,6 +18,33 @@ include_directories(${ROCM_PATH}/include) message(STATUS "HIP version: ${HIP_VERSION}") message(STATUS "HIP_CLANG_PATH: ${HIP_CLANG_PATH}") +macro(find_hip_version hip_header_file) + file(READ ${hip_header_file} HIP_VERSION_FILE_CONTENTS) + + string(REGEX MATCH "define HIP_VERSION_MAJOR +([0-9]+)" HIP_MAJOR_VERSION + "${HIP_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define HIP_VERSION_MAJOR +([0-9]+)" "\\1" + HIP_MAJOR_VERSION "${HIP_MAJOR_VERSION}") + string(REGEX MATCH "define HIP_VERSION_MINOR +([0-9]+)" HIP_MINOR_VERSION + "${HIP_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define HIP_VERSION_MINOR +([0-9]+)" "\\1" + HIP_MINOR_VERSION "${HIP_MINOR_VERSION}") + string(REGEX MATCH "define HIP_VERSION_PATCH +([0-9]+)" HIP_PATCH_VERSION + "${HIP_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define HIP_VERSION_PATCH +([0-9]+)" "\\1" + HIP_PATCH_VERSION "${HIP_PATCH_VERSION}") + + if(NOT HIP_MAJOR_VERSION) + set(HIP_VERSION "???") + message(WARNING "Cannot find HIP version in ${HIP_PATH}/include/hip/hip_version.h") + else() + math(EXPR HIP_VERSION "${HIP_MAJOR_VERSION} * 10000000 + ${HIP_MINOR_VERSION} * 100000 + ${HIP_PATCH_VERSION}") + message(STATUS "Current HIP header is ${HIP_PATH}/include/hip/hip_version.h " + "Current HIP version is v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}.${HIP_PATCH_VERSION}. ") + endif() +endmacro() +find_hip_version(${HIP_PATH}/include/hip/hip_version.h) + macro(find_package_and_include PACKAGE_NAME) find_package("${PACKAGE_NAME}" REQUIRED) include_directories("${ROCM_PATH}/${PACKAGE_NAME}/include") @@ -71,8 +98,10 @@ set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS}) # host linker to link.
list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc) list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx906) +list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx908) list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc) list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906) +list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx908) if(HIP_COMPILER STREQUAL clang) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index af2be77d0a63d..a52047e16167d 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -416,7 +416,7 @@ function(version version_file) endif() if(WITH_ROCM) file(APPEND ${version_file} - "HIP version: ${HIP_VERSION}\n" + "HIP version: v${HIP_MAJOR_VERSION}.${HIP_MINOR_VERSION}\n" "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n") endif() if(WITH_ASCEND_CL) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index c8ef4ad16ea9d..eb6fa4ee13c81 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -357,7 +357,7 @@ if (WITH_PSCORE) include(external/libmct) # download, build, install libmct list(APPEND third_party_deps extern_libmct) - include(external/rocksdb) # download, build, install libmct + include(external/rocksdb) # download, build, install rocksdb list(APPEND third_party_deps extern_rocksdb) endif() diff --git a/paddle/fluid/distributed/ps/service/heter_client.h b/paddle/fluid/distributed/ps/service/heter_client.h index 36bafc943701f..efaa48470a8bd 100644 --- a/paddle/fluid/distributed/ps/service/heter_client.h +++ b/paddle/fluid/distributed/ps/service/heter_client.h @@ -171,19 +171,16 @@ class HeterClient { // switch client singleton static std::shared_ptr<HeterClient> GetSwitchInstance( const std::vector<std::string>& peer_endpoints, int32_t peer_role) { + std::unique_lock<std::mutex> lock(mtx_); + if (peer_endpoints.empty()) { + VLOG(4) << "init switch client failed, null peer_endpoints"; + } + VLOG(4) << "peer role is: " << peer_role + << ", addr is: " << peer_endpoints[0]; if (switch_s_instance_ == nullptr) { - std::unique_lock<std::mutex> lock(mtx_); - if (peer_endpoints.empty()) { - VLOG(4) << "init switch client failed, null peer_endpoints"; - } - VLOG(4) << "peer role is: " << peer_role - << ", addr is: " << peer_endpoints[0]; - if (switch_s_instance_ == nullptr) { - switch_s_instance_.reset(new HeterClient()); - switch_s_instance_->SetPeerSwitchList(peer_endpoints); - switch_s_instance_->InitClientChannels(false, peer_endpoints, - peer_role); - } + switch_s_instance_.reset(new HeterClient()); + switch_s_instance_->SetPeerSwitchList(peer_endpoints); + switch_s_instance_->InitClientChannels(false, peer_endpoints, peer_role); } return switch_s_instance_; } diff --git a/paddle/fluid/distributed/ps/service/heter_server.cc b/paddle/fluid/distributed/ps/service/heter_server.cc index 0753a6799c1be..fd38a030ff366 100755 --- a/paddle/fluid/distributed/ps/service/heter_server.cc +++ b/paddle/fluid/distributed/ps/service/heter_server.cc @@ -125,6 +125,9 @@ int SendAndRecvVariableHandler::SaveInSwitchWithShard( brpc::Controller* cntl) { VLOG(4) << "entering SaveInSwitchWithShard"; int32_t group_id = request->group_id(); + if (group_id >= FLAGS_heter_world_size) { + LOG(ERROR) << "group id exceeds maximum"; + } auto& local_shard = _local_shards[group_id]; auto& request_io_buffer = cntl->request_attachment(); butil::IOBufBytesIterator io_buffer_itr(request_io_buffer); @@ -132,11 +135,11 @@ int SendAndRecvVariableHandler::SaveInSwitchWithShard( const auto& var_name = request->send_var_names(idx); const auto& var_size = request->vars_len(idx); WaitForVarsConsumed(group_id, var_name); +
std::unique_lock<std::mutex> lk(scope_mutex_); auto& value = local_shard[var_name]; value.resize(var_size); io_buffer_itr.copy_and_forward(reinterpret_cast(value.data()), var_size); - std::unique_lock<std::mutex> lk(scope_mutex_); vars_ready_flag[group_id][var_name] = 1; VLOG(4) << "saved var_name: " << var_name << "is saved ready!"; } @@ -162,11 +165,11 @@ int SendAndRecvVariableHandler::QueryInSwitchWithShard( VLOG(4) << "req var name: " << req_var_name; response->add_send_var_names(req_var_name); WaitForVarsProduced(group_id, req_var_name); + std::unique_lock<std::mutex> lk(scope_mutex_); auto itr = local_shard.find(req_var_name); auto& value = itr.value(); response_io_buffer.append(value.data(), value.size()); value.resize(0); // clear the memory - std::unique_lock<std::mutex> lk(scope_mutex_); vars_ready_flag[group_id][req_var_name] = 0; VLOG(4) << "query var_name: " << req_var_name << "is consumed ready!"; } diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 08e8f2baef6a0..857f1be1f7ae0 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -34,7 +34,42 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, *tensor = t; } else { // Accumulation - paddle::imperative::TensorAdd<paddle::experimental::Tensor>(t, tensor); + PADDLE_ENFORCE_EQ(t.initialized(), true, + paddle::platform::errors::Fatal( + "We can only accumulate initialized tensor, but we " + "got tensor: %s is empty please check your network " + "and make sure it creates grads.", + t.name())); + PADDLE_ENFORCE_NOT_NULL( + tensor, paddle::platform::errors::Fatal( + "We can only accumulate initialized tensor to non-nullptr " + "tensor but we got nullptr please check your network " + "and make sure it creates grads.")); + + if (t.is_dense_tensor()) { + if (tensor->is_dense_tensor()) { + paddle::imperative::TensorAdd<paddle::experimental::Tensor>(t, tensor); + + } else { + // TODO(jiabin): Support Other TensorBase later + // TODO(zhanlve): Replace SelectedRowsAddTensor with + // add_dygraph_function once it's supported + paddle::experimental::Tensor new_buffer( + std::make_shared<phi::DenseTensor>(), "tmp_accumulator"); + paddle::imperative::SelectedRowsAddTensor(*tensor, t, &new_buffer); + tensor->set_impl(new_buffer.impl()); + } + } else { + // TODO(jiabin): Support Other TensorBase later + // TODO(zhanlve): Replace SelectedRowsAddTensor with add_dygraph_function + // once it's supported + if (tensor->is_dense_tensor()) { + paddle::imperative::SelectedRowsAddToTensor(t, tensor); + } else { + *tensor = std::move(*paddle::imperative::SelectedRowsMerge< + paddle::experimental::Tensor>(t, *tensor)); + } + } } } diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 448fa546255bb..9ad628ef515b1 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -416,10 +416,6 @@ def DetermineForwardPositionMap(self, forward_inputs_list, self.forward_outputs_position_map[ return_name] = [return_type, return_pos] - print("Generated Forward Input Position Map: ", - self.forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - self.forward_outputs_position_map) class YamlGeneratorBase: diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index
9f48cce878904..9d95b9488d298 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -551,12 +551,6 @@ def CollectBackwardInfo(self): self.backward_inputs_list, self.backward_attrs_list, self.backward_returns_list = ParseYamlBackward( backward_args_str, backward_returns_str) - logging.info( - f"Parsed Backward Inputs List: {self.backward_inputs_list}") - logging.info(f"Prased Backward Attrs List: {self.backward_attrs_list}") - logging.info( - f"Parsed Backward Returns List: {self.backward_returns_list}") - def CollectForwardInfoFromBackwardContents(self): backward_forward_str = self.backward_forward_str @@ -628,15 +622,6 @@ def SlotNameMatching(self): backward_output_type, matched_forward_input_pos, backward_output_pos ] - logging.info( - f"Generated Backward Fwd Input Map: {self.backward_forward_inputs_map}" - ) - logging.info( - f"Generated Backward Grad Input Map: {self.backward_grad_inputs_map}" - ) - logging.info( - f"Generated Backward Grad Output Map: {self.backward_grad_outputs_map}" - ) def GenerateNodeCreationCodes(self): forward_api_name = self.forward_api_name @@ -865,7 +850,10 @@ def GenerateForwardDefinition(self, is_inplaced): f"if ({name}.get_ptr() != nullptr) amp_tensors_vector.push_back({{ *({name}.get_ptr()) }});\n" ) amp_autocast_optional_list.append( - f"auto NEW_{name} = ({name}.get_ptr() != nullptr) ? paddle::make_optional(egr::EagerAmpAutoCast(\"{name}\", *({name}.get_ptr()), amp_dst_dtype, op_name)) : {name};\n" + f"auto NEW_{name}_temp_tensor = ({name}.get_ptr() != nullptr) ? egr::EagerAmpAutoCast(\"{name}\", *({name}.get_ptr()), amp_dst_dtype, op_name) : paddle::experimental::Tensor();\n" + ) + amp_autocast_optional_list.append( + f"auto NEW_{name} = ({name}.get_ptr() != nullptr) ? 
paddle::make_optional(NEW_{name}_temp_tensor) : {name};\n" ) else: if is_inplaced and inplace_map and name in inplace_map.keys( @@ -1041,11 +1029,6 @@ def GenerateForwardDefinition(self, is_inplaced): returns_str) self.forward_declaration_str += f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});\n" - logging.info( - f"Generated Forward Definition: {self.forward_definition_str}") - logging.info( - f"Generated Forward Declaration: {self.forward_declaration_str}") - def GenerateInplacedForwardDygraphFunctions(self): # Inplaced Version Dygraph Function Generation forward_api_name = self.forward_api_name @@ -1231,8 +1214,6 @@ def GenerateNodeDeclaration(self): set_attribute_methods_str, tensor_wrapper_members_str, attribute_members_str) - logging.info(f"Generated Node Declaration: {self.node_declaration_str}") - def GenerateNodeDefinition(self, grad_node_creation_str): namespace = self.namespace forward_api_name = self.forward_api_name @@ -1436,8 +1417,6 @@ def GenerateNodeDefinition(self, grad_node_creation_str): outputs_autograd_meta_str, compute_require_grad_str, grad_node_creation_str, returns_str) - logging.info(f"Generated Node Definition: {self.node_definition_str}") - def run(self): super().run() diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 3608fe7e40893..b86685c205a5c 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -399,35 +399,15 @@ def run(self): # Initialized orig_forward_inputs_list, orig_forward_returns_list, orig_forward_attrs_list self.CollectOriginalForwardInfo() - logging.info( - f"Parsed Original Forward Inputs List: \n{self.orig_forward_inputs_list}" - ) - logging.info( - f"Prased Original Forward Attrs List: \n{self.orig_forward_attrs_list}" - ) - logging.info( - f"Parsed Original Forward Returns List: \n{self.orig_forward_returns_list}" - ) if SkipAPIGeneration(self.forward_api_name): return False # Initialized forward_inputs_position_map, forward_outputs_position_map self.DetermineForwardPositionMap(self.orig_forward_inputs_list, self.orig_forward_returns_list) - logging.info( - f"Generated Forward Input Position Map: {self.forward_inputs_position_map}" - ) - logging.info( - f"Generated Forward Output Position Map: {self.forward_outputs_position_map}" - ) # Code Generation self.GeneratePythonCFunction() - logging.info( - f"Generated Python-C Function: {self.python_c_function_str}") - logging.info( - f"Generated Python-C Function Declaration: {self.python_c_function_reg_str}" - ) return True @@ -536,8 +516,6 @@ def GeneratePythonCFile(filepath, python_c_str): python_c_str = GeneratePythonCWrappers(generated_python_c_functions, generated_python_c_registration) - logging.info(f"Generated Python-C Codes: \n{python_c_str}") - output_path = args.output_path for path in [output_path]: if os.path.exists(path): diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 7a4e7f81611d1..c5a121067be72 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -66,68 +66,69 @@ class GeneralGrad { "stop_gradient=True.", msg, i)); if (is_no_grad_vars) { - (no_grad_var_nodes_inputmeta_map)[target_node] = auto_grad_meta; + (no_grad_var_nodes_inputmeta_map_)[target_node] = auto_grad_meta; } else { // normal input - (input_target_nodes_inputmeta_map)[target_node] = 
auto_grad_meta; + (input_target_nodes_inputmeta_map_)[target_node] = auto_grad_meta; } } } } - // Purify potential_startup_nodes, remove nodes those are the same as + // Purify potential_startup_nodes_, remove nodes those are the same as // input_target_nodes void PurifyPotentialStartUpNodes() { VLOG(6) << "Running in PurifyPotentialStartUpNodes"; - if (input_target_nodes_inputmeta_map.empty()) return; + if (input_target_nodes_inputmeta_map_.empty()) return; std::unordered_set potential_startup_nodes_to_be_erased; - for (auto startup_op : potential_startup_nodes) { - auto iter = input_target_nodes_inputmeta_map.find(startup_op); - if (iter != input_target_nodes_inputmeta_map.end()) { + for (auto startup_op : potential_startup_nodes_) { + auto iter = input_target_nodes_inputmeta_map_.find(startup_op); + if (iter != input_target_nodes_inputmeta_map_.end()) { potential_startup_nodes_to_be_erased.emplace(iter->first); } } if (!potential_startup_nodes_to_be_erased.empty()) { for (auto nodes : potential_startup_nodes_to_be_erased) { - potential_startup_nodes.erase(nodes); + potential_startup_nodes_.erase(nodes); } } } // Remove some nodes those doesn't need to be - // stored in potential_stop_nodes、potential_startup_nodes + // stored in potential_stop_nodes_、potential_startup_nodes_ void UpdateGraphInfo() { - // Updated potential_sotp_nodes by depending_nodes, + // Updated potential_sotp_nodes by depending_nodes_, // make sure the path from root to target_node is ok - std::unordered_set _startup_ops; + std::unordered_set startup_ops; VLOG(6) << "Running in UpdateGraphInfo"; std::queue queue; - for (auto& target_nodes_inputmeta_pair : input_target_nodes_inputmeta_map) { + for (auto& target_nodes_inputmeta_pair : + input_target_nodes_inputmeta_map_) { queue.emplace(target_nodes_inputmeta_pair.first); } while (!queue.empty()) { auto* target_node = queue.front(); queue.pop(); - if (!(depending_nodes)[target_node].empty()) { - auto precedding_nodes = (depending_nodes)[target_node]; + if (!(depending_nodes_)[target_node].empty()) { + auto precedding_nodes = (depending_nodes_)[target_node]; for (auto pre_nodes : precedding_nodes) { queue.emplace(pre_nodes); - if (potential_stop_nodes.find(pre_nodes) != - potential_stop_nodes.end()) { - potential_stop_nodes.erase(pre_nodes); + if (potential_stop_nodes_.find(pre_nodes) != + potential_stop_nodes_.end()) { + potential_stop_nodes_.erase(pre_nodes); } } } else { // startup_ops have no precedding nodes - VLOG(6) << "Emplace _startup_ops"; - _startup_ops.emplace(target_node); + VLOG(6) << "Emplace startup_ops"; + startup_ops.emplace(target_node); } } - // Purify potential_startup_nodes again, remove some + // Purify potential_startup_nodes_ again, remove some // potential startup_nodes that unreach to input target nodes - if (!_startup_ops.empty()) { + if (!startup_ops.empty()) { std::unordered_set potential_startup_nodes_to_be_erased; - for (auto node : potential_startup_nodes) { - if (_startup_ops.count(node) == 0) { + for (auto node : potential_startup_nodes_) { + if (startup_ops.count(node) == 0) { VLOG(6) << "Set up potential_startup_nodes_to_be_erased"; potential_startup_nodes_to_be_erased.emplace(node); } @@ -135,14 +136,14 @@ class GeneralGrad { if (!potential_startup_nodes_to_be_erased.empty()) { for (auto node : potential_startup_nodes_to_be_erased) { VLOG(6) << "Erase nodes in potential_startup_nodes_to_be_erased"; - potential_startup_nodes.erase(node); + potential_startup_nodes_.erase(node); } } } } // Get Graph Info Betweent input target GradNode 
and outputs, - // record depending_nodes、potential_stop_nodes、potential_startup_nodes + // record depending_nodes_、potential_stop_nodes_、potential_startup_nodes_ void GetGraphInfoBetweenTargets(const std::queue& init_queue) { VLOG(6) << "Runing In GetGraphInfoBetweenTargets"; @@ -164,9 +165,9 @@ class GeneralGrad { visited.insert(node); // Check node is target_nodes or not, if node is not target_node, - // all the next_node will be marked in potential_stop_nodes + // all the next_node will be marked in potential_stop_nodes_ bool is_potential_stop_nodes = - input_target_nodes_inputmeta_map.count(node); + input_target_nodes_inputmeta_map_.count(node); // Find and append next nodes const paddle::small_vector, @@ -186,40 +187,41 @@ class GeneralGrad { // all the next_nodes of current node will be inserted to // potential_stop_node if (is_potential_stop_nodes) { - potential_stop_nodes.emplace(next_node); + potential_stop_nodes_.emplace(next_node); } // Update in_degree - if (!node_in_degree_map.count(next_node)) + if (!node_in_degree_map.count(next_node)) { node_in_degree_map[next_node] = 0; + } node_in_degree_map[next_node]++; // Record depending relationship - (depending_nodes)[next_node].emplace(node); + (depending_nodes_)[next_node].emplace(node); queue.push(next_node); } } } // Update Graph Info, remove some nodes in - // potential_stop_nodes、potential_startup_nodes、 + // potential_stop_nodes_、potential_startup_nodes_、 UpdateGraphInfo(); } void ModifyReadyQueue(std::queue* queue) { std::queue tmp_queue; - for (auto nodes : potential_startup_nodes) { + for (auto nodes : potential_startup_nodes_) { tmp_queue.emplace(nodes); } tmp_queue.swap(*queue); } - // Set result for input target grad_var when potential_startup_nodes is empty + // Set result for input target grad_var when potential_startup_nodes_ is empty void SetResultForInputTargetVar( const std::unordered_map>& node_input_buffers_dict) { - if (potential_startup_nodes.size() == 0) { - for (auto input_target_node : *GetInPutTargetNodesInputMetaMap()) { + if (potential_startup_nodes_.size() == 0) { + for (auto input_target_node : *GetInputTargetNodesInputMetaMap()) { // out rank_info of forward op auto rank_info = input_target_node.second->OutRankInfo(); auto iter = node_input_buffers_dict.find(input_target_node.first); @@ -227,7 +229,7 @@ class GeneralGrad { auto& target_result = (iter->second)->Buffers()[rank_info.first][rank_info.second]; // save the target result - results_map[input_target_node.first] = target_result; + results_map_[input_target_node.first] = target_result; } } } @@ -236,8 +238,8 @@ class GeneralGrad { // Set input target grad_var from node_input_buffer by inputmeta void SetResultForInputTargetVar(GradTensorHolder input_buffers, GradNodeBase* node) { - auto iter = GetInPutTargetNodesInputMetaMap()->find(node); - if (iter != GetInPutTargetNodesInputMetaMap()->end()) { + auto iter = GetInputTargetNodesInputMetaMap()->find(node); + if (iter != GetInputTargetNodesInputMetaMap()->end()) { VLOG(6) << "Get target result by by inputmeta"; // out rank_info of forward op auto rank_info = (iter->second)->OutRankInfo(); @@ -245,7 +247,7 @@ class GeneralGrad { auto& target_result = input_buffers.Buffers()[rank_info.first][rank_info.second]; // save the target result - results_map[node] = target_result; + results_map_[node] = target_result; } } @@ -271,8 +273,8 @@ class GeneralGrad { "input"; } - auto iter = results_map.find(target_node); - if (iter != results_map.end()) { + auto iter = results_map_.find(target_node); + if (iter 
!= results_map_.end()) { // set StopGradient = !create_graph AutogradMeta* tensor_auto_grad_meta = EagerUtils::autograd_meta(&(iter->second)); @@ -303,12 +305,12 @@ class GeneralGrad { GetTargetNodesInfo(no_grad_vars, true /* is_no_grad_vars */); // Get inputs's GradNodes and InputMeta Info GetTargetNodesInfo(inputs, false /* is_no_grad_vars */); - // Purify potential_startup_ops, remove those nodes that are the same as + // Purify potentialstartup_ops, remove those nodes that are the same as // input_target_nodes PurifyPotentialStartUpNodes(); // Get Graph Info Betweent input target gradnode and outputs - // Record the depending_nodes and - // potential_stop_nodes、potential_startup_nodes + // Record the depending_nodes_ and + // potential_stop_nodes_、potential_startup_nodes_ GetGraphInfoBetweenTargets(*queue); // Reset queue. Queue is empty only when // 1.input equals to output. 2.input can not reach to output. @@ -318,34 +320,34 @@ class GeneralGrad { } bool IsPotentialStopNodes(GradNodeBase* node) { - return potential_stop_nodes.count(node); + return potential_stop_nodes_.count(node); } std::unordered_map* GetNoGradVarNodesInputMetaMap() { - return &no_grad_var_nodes_inputmeta_map; + return &no_grad_var_nodes_inputmeta_map_; } std::unordered_map* - GetInPutTargetNodesInputMetaMap() { - return &input_target_nodes_inputmeta_map; + GetInputTargetNodesInputMetaMap() { + return &input_target_nodes_inputmeta_map_; } std::unordered_set* GetPotentialStopNodes() { - return &potential_stop_nodes; + return &potential_stop_nodes_; } std::unordered_set* GetPotentialStartupNodes() { - return &potential_startup_nodes; + return &potential_startup_nodes_; } void Clear() { - no_grad_var_nodes_inputmeta_map.clear(); - input_target_nodes_inputmeta_map.clear(); - potential_startup_nodes.clear(); - potential_stop_nodes.clear(); - depending_nodes.clear(); - results_map.clear(); + no_grad_var_nodes_inputmeta_map_.clear(); + input_target_nodes_inputmeta_map_.clear(); + potential_startup_nodes_.clear(); + potential_stop_nodes_.clear(); + depending_nodes_.clear(); + results_map_.clear(); copied_grad_nodes_.clear(); orig_to_copied_node_mapping_.clear(); } @@ -426,18 +428,18 @@ class GeneralGrad { static GeneralGrad* general_grad_; // no_grad_vars's GradNode and GradNode's InputMeta. std::unordered_map - no_grad_var_nodes_inputmeta_map; + no_grad_var_nodes_inputmeta_map_; // inputs's GradNode and GradNode's InputMeta. std::unordered_map - input_target_nodes_inputmeta_map; + input_target_nodes_inputmeta_map_; // Record all the potential startup_nodes, will be changed. - std::unordered_set potential_startup_nodes; + std::unordered_set potential_startup_nodes_; // Record all the potential stop nodes, will be changed. 
- std::unordered_set potential_stop_nodes; + std::unordered_set potential_stop_nodes_; std::unordered_map /* pre nodes */> - depending_nodes; - std::unordered_map results_map; + depending_nodes_; + std::unordered_map results_map_; std::vector> copied_grad_nodes_; std::unordered_map> @@ -619,7 +621,7 @@ std::vector RunBackward( // GradTensorHolder will initialize another tensor with same tensortype, // datatype and dims but filled with 1.0 node_input_buffers_dict[grad_node]->CopyValueFromTensor( - input_info.first, input_info.second, tensor, true /*fill_one=true*/); + input_info.first, input_info.second, tensor, /*fill_one=*/true); } // Prepare queue, potential startup_nodes @@ -657,7 +659,7 @@ std::vector RunBackward( VLOG(6) << "Running GradNode:" << node->name(); paddle::platform::RecordEvent node_record_event( - std::string((*node).name()) + " grad_node", + std::string((*node).name()), paddle::platform::TracerEventType::Operator, 1); if (queue.size() > 1 && node_in_degree_map[node] != 0) { @@ -667,14 +669,15 @@ std::vector RunBackward( queue.pop(); // Run node: This is where Hook happens - PADDLE_ENFORCE( - node_input_buffers_dict.count(node), + auto node_input_buffer_iter = node_input_buffers_dict.find(node); + PADDLE_ENFORCE_NE( + node_input_buffer_iter, node_input_buffers_dict.end(), paddle::platform::errors::Fatal( "Unable to find next node in the GradTensorHolder \n" "Trying to run Node without configuring its GradTensorHolder.")); std::unique_ptr node_input_buffer = - std::move(node_input_buffers_dict[node]); + std::move(node_input_buffer_iter->second); // Set input target grad_var from node_input_buffer by inputmeta if (!inputs.empty() && is_general_grad) { @@ -715,8 +718,7 @@ std::vector RunBackward( } // TODO(jiabin): Should we erase it or find a more efficient way. 
- - node_input_buffers_dict.erase(node); + node_input_buffers_dict.erase(node_input_buffer_iter); // Prepare GradTensorHolder for next node const paddle::small_vector, kSlotSmallVectorSize>& @@ -736,8 +738,7 @@ std::vector RunBackward( } auto edge_rank = edge.GetEdgeRankInfo(); // Since we make edge has as same rank as bwd outputs, we indexing them - // with - // the same rank(i, j) + // with the same rank(i, j) auto next_node_shared = edge.GetMutableGradNode(); // Next node could be nullptr if it is leaf tensor with no diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 610b177829e2f..2d4db8cb52974 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -36,6 +36,31 @@ **/ namespace egr { +static void CheckTensor(const paddle::experimental::Tensor& pre, + const paddle::experimental::Tensor& post) { + if (!pre.initialized() && post.initialized()) { + PADDLE_THROW(paddle::platform::errors::PermissionDenied( + "The tensor in before and after hook are not consistent")); + } + if (pre.initialized() && post.initialized()) { + VLOG(4) << paddle::framework::DataType2String(pre.dtype()) << " " + << paddle::framework::DataType2String(post.dtype()); + PADDLE_ENFORCE_EQ( + pre.dtype(), post.dtype(), + paddle::platform::errors::PermissionDenied( + "The dtype of tensor before(%s) and after(%s) hook are not " + "consistent", + paddle::framework::DataType2String(pre.dtype()), + paddle::framework::DataType2String(post.dtype()))); + PADDLE_ENFORCE_EQ( + pre.place(), post.place(), + paddle::platform::errors::PermissionDenied( + "The place of tensor before(%s) and after(%s) " + "hook are not consistent", + pre.place().DebugString(), post.place().DebugString())); + } +} + GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) { VLOG(6) << "Construct GradNodeBase"; bwd_in_meta_.resize(bwd_in_slot_num); @@ -271,7 +296,7 @@ void GradNodeBase::SetGradOutMeta( // Only Copy Meta phi::DenseTensor* dense_tensor = static_cast(fwd_in_tensor.impl().get()); - PADDLE_ENFORCE_NE(dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + PADDLE_ENFORCE_NE(dense_tensor->dtype(), phi::DataType::UNDEFINED, paddle::platform::errors::Fatal( "Attempting to copy DenseTensorMeta " "with phi::DataType::UNDEFINED," diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 6fdee203c196c..747e98b846616 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -30,32 +30,23 @@ namespace egr { * The GradNodeBase will be held in autograd_meta, and it is also a member of * Edge, which indicates the edge of backward graph. * - * TODO:(yangzhanlue) GradNodeBase will also in charge of get the correct input + * TODO(yangzhanlue): GradNodeBase will also in charge of get the correct input * from GradOpDescMaker to GradNodeBase. * - * NOTE:GradNodeBase has a method named run, this method should be overrided by - * the - * specific derived class, it will prepare backward inputs and double backward's - * depends. Then, it will call C++ API of backward kernel functions to finish - * backward computation. + * NOTE: GradNodeBase has a method named run, this method should be overrided by + * the specific derived class, it will prepare backward inputs and double + * backward's depends. Then, it will call C++ API of backward kernel functions + * to finish backward computation. 
* - * NOTE:GradNodeBase holds its own inputs and Outputs + * NOTE: GradNodeBase holds its own inputs and Outputs * * Edge is defined to descripe depend of backward, an Edge is what linked - * between two - * node, it should contain a Node and rank of this Node (this is used to - * indicate which - * input of grad this edge belong). - * */ + * between two node, it should contain a Node and rank of this Node (this is + * used to indicate which input of grad this edge belong). + **/ class AutogradMeta; class GradNodeBase; -/** - * GradSlotMeta is used to Record Forward Tensor info to backward, since paddle - * has lots of operators - * whose backward logic is depends on if it has some specific inputs or outputs. - * So, we need a meta info - * to record it's needs. - * **/ + class Edge { public: // Default constructor for Edges in order to construct it for AutogradMeta @@ -64,8 +55,7 @@ class Edge { // In real use cases we should create Edge from grad node and input rank which // indicate which edge it is. // Since we have slot design in operators we will have to locate an edge with - // slot - // and rank. + // slot and rank. Edge(const std::shared_ptr& grad_node, size_t in_slot_id, size_t in_rank) : in_slot_id_(in_slot_id), in_rank_(in_rank), grad_node_(grad_node) {} @@ -120,6 +110,12 @@ class Edge { size_t in_rank_; std::shared_ptr grad_node_{nullptr}; }; + +/** + * GradSlotMeta is used to Record Forward Tensor info to backward, since paddle + * has lots of operators whose backward logic is depends on if it has some + * specific inputs or outputs. So, we need a meta info to record it's needs. + **/ class GradSlotMeta { public: GradSlotMeta() = default; @@ -171,16 +167,13 @@ class GradNodeBase { /** * operator() designed to contian the real backward execution logic, it should - * be - * overrided by derived class defined for each operator. It accepts a vector - * of - * Tensor which contains grads input of current operator + * be overrided by derived class defined for each operator. It accepts a + * vector of Tensor which contains grads input of current operator * * Note: why we need backward inputs and outputs construct as vector of vector * of paddle::experimental::Tensor? * Since all of paddle op composite in form of {"Slot name ", vector}, - * so, vector of vector - * is better choice to fit this format. + * so, vector of vector is better choice to fit this format. 
* **/ virtual paddle::small_vector, kSlotSmallVectorSize> @@ -294,36 +287,12 @@ class GradNodeBase { /* slot id */ size_t, /* rank */ size_t, /* hook */ std::shared_ptr>> gradient_hooks_; + int64_t next_hook_id_{0}; // We handle complex to real conversion only if any complex GradIn is involved bool need_complex_to_real_ = false; - int64_t next_hook_id_{0}; + bool is_tensor_wrappers_cleared_ = false; }; -inline void CheckTensor(const paddle::experimental::Tensor& pre, - const paddle::experimental::Tensor& post) { - if (!pre.initialized() && post.initialized()) { - PADDLE_THROW(paddle::platform::errors::PermissionDenied( - "The tensor in before and after hook are not consistent")); - } - if (pre.initialized() && post.initialized()) { - VLOG(4) << paddle::framework::DataType2String(pre.dtype()) << " " - << paddle::framework::DataType2String(post.dtype()); - PADDLE_ENFORCE_EQ( - pre.dtype(), post.dtype(), - paddle::platform::errors::PermissionDenied( - "The dtype of tensor before(%s) and after(%s) hook are not " - "consistent", - paddle::framework::DataType2String(pre.dtype()), - paddle::framework::DataType2String(post.dtype()))); - PADDLE_ENFORCE_EQ( - pre.place(), post.place(), - paddle::platform::errors::PermissionDenied( - "The place of tensor before(%s) and after(%s) " - "hook are not consistent", - pre.place().DebugString(), post.place().DebugString())); - } -} - } // namespace egr diff --git a/paddle/fluid/eager/pylayer/py_layer_node.cc b/paddle/fluid/eager/pylayer/py_layer_node.cc index fad4fd50a5e3e..a00b292fe0915 100644 --- a/paddle/fluid/eager/pylayer/py_layer_node.cc +++ b/paddle/fluid/eager/pylayer/py_layer_node.cc @@ -106,8 +106,6 @@ GradNodePyLayer::operator()( pybind11::detail::error_string().c_str())); } - outputs_ = outputs; - VLOG(6) << "PyLayer backward function finish..."; PyObject* outputs_tuple = nullptr; @@ -165,6 +163,9 @@ GradNodePyLayer::operator()( if (!PyTuple_Check(outputs)) { Py_XDECREF(outputs_tuple); } + Py_XDECREF(outputs); + Py_XDECREF(ctx_); + ctx_ = nullptr; return grad_out; } diff --git a/paddle/fluid/eager/pylayer/py_layer_node.h b/paddle/fluid/eager/pylayer/py_layer_node.h index b477d7a9ad996..c1a8c6e626b4f 100644 --- a/paddle/fluid/eager/pylayer/py_layer_node.h +++ b/paddle/fluid/eager/pylayer/py_layer_node.h @@ -32,10 +32,7 @@ class GradNodePyLayer : public GradNodeBase { ctx_ = ctx; } - ~GradNodePyLayer() override { - Py_DECREF(ctx_); - Py_XDECREF(outputs_); - }; + ~GradNodePyLayer() override { Py_XDECREF(ctx_); }; virtual paddle::small_vector, kSlotSmallVectorSize> @@ -50,9 +47,6 @@ class GradNodePyLayer : public GradNodeBase { return "GradNodePyLayer_" + std::string(Py_TYPE(ctx_)->tp_name); } - // for paddle.grad get result - PyObject* GetMutableOutputs() { return outputs_; } - void SaveForwardOutputsMeta( const std::vector>& outputs_tensor) { @@ -81,7 +75,6 @@ class GradNodePyLayer : public GradNodeBase { private: PyObject* ctx_{nullptr}; - PyObject* outputs_{nullptr}; std::vector> forward_outputs_meta_; std::vector> forward_outputs_place_; }; diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index f13fcfa990057..28b116b41ea91 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -88,6 +88,7 @@ class TensorWrapper { } else { intermidiate_tensor_.set_impl(tensor.impl()); } + // TODO(jiabin): This may has server performance issue intermidiate_tensor_.set_name(tensor.name() + "@Saved"); @@ -118,24 +119,25 @@ class TensorWrapper { paddle::experimental::Tensor recovered_tensor 
= intermidiate_tensor_; std::shared_ptr new_grad_node = weak_grad_node_.lock(); - if (new_grad_node) { - VLOG(3) << "Recovered TensorWrapper with GradNode " - << new_grad_node->name() << " addr: " << new_grad_node.get(); - } else { - VLOG(3) << "Recovered TensorWrapper with Empth GradNode"; - } auto* intermediate_autograd_meta = EagerUtils::unsafe_autograd_meta(intermidiate_tensor_); auto p_ab_autograd_meta = std::make_shared(*intermediate_autograd_meta); if (new_grad_node) { + VLOG(3) << "Recovered TensorWrapper with GradNode " + << new_grad_node->name() << " addr: " << new_grad_node.get(); p_ab_autograd_meta->SetGradNode(new_grad_node); + } else { + VLOG(3) << "Recovered TensorWrapper with Empth GradNode"; } recovered_tensor.set_autograd_meta(p_ab_autograd_meta); return recovered_tensor; } } + void clear() { intermidiate_tensor_.reset(); } + + private: void check_inplace_version() { if (no_need_buffer_) { VLOG(6) << "There's no need to check inplace_version because " @@ -170,8 +172,6 @@ class TensorWrapper { } } - void clear() { intermidiate_tensor_.reset(); } - private: bool full_reserved_ = false; bool no_need_buffer_ = false; diff --git a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc index f9f00749dc87b..c159084d683e8 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc @@ -21,14 +21,219 @@ #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_tensor_holder.h" -#include "paddle/fluid/eager/utils.h" - #include "paddle/fluid/eager/hooks.h" +#include "paddle/fluid/eager/utils.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/selected_rows.h" // TODO(jiabin): remove nolint here!!! 
using namespace egr; // NOLINT +TEST(AccumulationNode, SelectedRowsAddToTensor) { + // Construct Eager Tensor + phi::DenseTensorMeta meta = + phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); + std::vector rows = {0}; + std::shared_ptr sr0 = + std::make_shared(rows, 1); + sr0->mutable_value()->Resize(phi::make_ddim({1, 1})); + sr0->mutable_value()->mutable_data(paddle::platform::CPUPlace())[0] = + static_cast(10.0f); + paddle::experimental::Tensor et0 = paddle::experimental::Tensor(sr0); + std::shared_ptr dt1 = std::make_shared( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + meta); + dt1->mutable_data(paddle::platform::CPUPlace())[0] = + static_cast(20.0f); + paddle::experimental::Tensor et1 = paddle::experimental::Tensor(dt1); + std::shared_ptr input_dt = + std::make_shared( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + meta); + paddle::experimental::Tensor input_et = + paddle::experimental::Tensor(input_dt); + auto grad_meta = EagerUtils::autograd_meta(&input_et); + // Initialize Grad Tensor + std::shared_ptr grad_dt = + std::make_shared(rows, 1); + grad_dt->mutable_value()->Resize(phi::make_ddim({1, 1})); + grad_dt->mutable_value()->mutable_data( + paddle::platform::CPUPlace())[0] = static_cast(0.0f); + grad_meta->MutableGrad()->set_impl(grad_dt); + // AccumulationNode + auto node = std::make_shared(grad_meta); + grad_meta->SetGradNode(node); + grad_meta->SetStopGradient(false); + // operator() + paddle::small_vector, + kSlotSmallVectorSize> + et0_vec = {{et0}}; + paddle::experimental::Tensor ret_et0 = node->operator()(et0_vec)[0][0]; + auto* ret_et0_ptr = + std::dynamic_pointer_cast(ret_et0.impl()) + ->value() + .data(); + CHECK_EQ(ret_et0_ptr[0], static_cast(10.0f)); + paddle::small_vector, + kSlotSmallVectorSize> + et1_vec = {{et1}}; + paddle::experimental::Tensor ret_et1 = node->operator()(et1_vec)[0][0]; + auto* ret_et1_ptr = + std::dynamic_pointer_cast(ret_et1.impl()) + ->data(); + CHECK_EQ(ret_et1_ptr[0], static_cast(20.0f)); + // Check Retain Grad + CHECK_EQ(std::dynamic_pointer_cast(et0.impl()) + ->value() + .data()[0], + static_cast(10.0f)); + paddle::experimental::Tensor* grad = EagerUtils::mutable_grad(input_et); + auto* grad_ptr = + std::dynamic_pointer_cast(grad->impl())->data(); + CHECK_EQ(grad_ptr[0], static_cast(30.0f)); +} + +TEST(AccumulationNode, SelectedRowsMerge) { + // Construct Eager Tensor + phi::DenseTensorMeta meta = + phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); + std::vector rows = {0}; + std::shared_ptr sr0 = + std::make_shared(rows, 1); + sr0->mutable_value()->Resize(phi::make_ddim({1, 1})); + sr0->mutable_value()->mutable_data(paddle::platform::CPUPlace())[0] = + static_cast(10.0f); + paddle::experimental::Tensor et0 = paddle::experimental::Tensor(sr0); + std::shared_ptr sr1 = + std::make_shared(rows, 1); + sr1->mutable_value()->Resize(phi::make_ddim({1, 1})); + sr1->mutable_value()->mutable_data(paddle::platform::CPUPlace())[0] = + static_cast(20.0f); + paddle::experimental::Tensor et1 = paddle::experimental::Tensor(sr1); + std::shared_ptr input_dt = + std::make_shared( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + meta); + paddle::experimental::Tensor input_et = + paddle::experimental::Tensor(input_dt); + auto grad_meta = EagerUtils::autograd_meta(&input_et); + // Initialize Grad Tensor + std::shared_ptr grad_dt = + std::make_shared(rows, 1); + grad_dt->mutable_value()->Resize(phi::make_ddim({1, 1})); + grad_dt->mutable_value()->mutable_data( + 
paddle::platform::CPUPlace())[0] = static_cast(0.0f); + grad_meta->MutableGrad()->set_impl(grad_dt); + // AccumulationNode + auto node = std::make_shared(grad_meta); + grad_meta->SetGradNode(node); + grad_meta->SetStopGradient(false); + // operator() + paddle::small_vector, + kSlotSmallVectorSize> + et0_vec = {{et0}}; + paddle::experimental::Tensor ret_et0 = node->operator()(et0_vec)[0][0]; + auto* ret_et0_ptr = + std::dynamic_pointer_cast(ret_et0.impl()) + ->value() + .data(); + CHECK_EQ(ret_et0_ptr[0], static_cast(10.0f)); + paddle::small_vector, + kSlotSmallVectorSize> + et1_vec = {{et1}}; + paddle::experimental::Tensor ret_et1 = node->operator()(et1_vec)[0][0]; + auto* ret_et1_ptr = + std::dynamic_pointer_cast(ret_et1.impl()) + ->value() + .data(); + CHECK_EQ(ret_et1_ptr[0], static_cast(20.0f)); + // Check Retain Grad + CHECK_EQ(std::dynamic_pointer_cast(et0.impl()) + ->value() + .data()[0], + static_cast(10.0f)); + paddle::experimental::Tensor* grad = EagerUtils::mutable_grad(input_et); + auto* grad_ptr = std::dynamic_pointer_cast(grad->impl()) + ->value() + .data(); + CHECK_EQ(grad_ptr[0], static_cast(30.0f)); +} + +TEST(AccumulationNode, SelectedRowsAddTensor) { + // Construct Eager Tensor + phi::DenseTensorMeta meta = + phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); + std::vector rows = {0}; + std::shared_ptr sr0 = + std::make_shared(rows, 1); + sr0->mutable_value()->Resize(phi::make_ddim({1, 1})); + sr0->mutable_value()->mutable_data(paddle::platform::CPUPlace())[0] = + static_cast(10.0f); + paddle::experimental::Tensor et0 = paddle::experimental::Tensor(sr0); + std::shared_ptr sr1 = + std::make_shared(rows, 1); + sr1->mutable_value()->Resize(phi::make_ddim({1, 1})); + sr1->mutable_value()->mutable_data(paddle::platform::CPUPlace())[0] = + static_cast(20.0f); + paddle::experimental::Tensor et1 = paddle::experimental::Tensor(sr1); + std::shared_ptr input_dt = + std::make_shared( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + meta); + paddle::experimental::Tensor input_et = + paddle::experimental::Tensor(input_dt); + auto grad_meta = EagerUtils::autograd_meta(&input_et); + // Initialize Grad Tensor + std::shared_ptr grad_dt = + std::make_shared( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + meta); + grad_dt->mutable_data(paddle::platform::CPUPlace())[0] = + static_cast(0.0f); + grad_meta->MutableGrad()->set_impl(grad_dt); + // AccumulationNode + auto node = std::make_shared(grad_meta); + grad_meta->SetGradNode(node); + grad_meta->SetStopGradient(false); + // operator() + paddle::small_vector, + kSlotSmallVectorSize> + et0_vec = {{et0}}; + paddle::experimental::Tensor ret_et0 = node->operator()(et0_vec)[0][0]; + auto* ret_et0_ptr = + std::dynamic_pointer_cast(ret_et0.impl()) + ->value() + .data(); + CHECK_EQ(ret_et0_ptr[0], static_cast(10.0f)); + paddle::small_vector, + kSlotSmallVectorSize> + et1_vec = {{et1}}; + paddle::experimental::Tensor ret_et1 = node->operator()(et1_vec)[0][0]; + auto* ret_et1_ptr = + std::dynamic_pointer_cast(ret_et1.impl()) + ->value() + .data(); + CHECK_EQ(ret_et1_ptr[0], static_cast(20.0f)); + // Check Retain Grad + CHECK_EQ(std::dynamic_pointer_cast(et0.impl()) + ->value() + .data()[0], + static_cast(10.0f)); + paddle::experimental::Tensor* grad = EagerUtils::mutable_grad(input_et); + auto* grad_ptr = + std::dynamic_pointer_cast(grad->impl())->data(); + CHECK_EQ(grad_ptr[0], static_cast(30.0f)); +} TEST(AccumulationNode, Tensor) { // Construct Eager Tensor diff --git 
a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h index 605019cb607fc..d28ae0ab5d93f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h @@ -924,9 +924,11 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( { platform::CUDAPlace place = platform::CUDAPlace(resource_->dev_id(gpu_id)); platform::CUDADeviceGuard guard(resource_->dev_id(gpu_id)); - thrust::device_ptr t_actual_sample_size(actual_sample_size); - int total_sample_size = - thrust::reduce(t_actual_sample_size, t_actual_sample_size + len); + thrust::device_vector t_actual_sample_size(len); + thrust::copy(actual_sample_size, actual_sample_size + len, + t_actual_sample_size.begin()); + int total_sample_size = thrust::reduce(t_actual_sample_size.begin(), + t_actual_sample_size.end()); result.actual_val_mem = memory::AllocShared(place, total_sample_size * sizeof(int64_t)); result.actual_val = (int64_t*)(result.actual_val_mem)->ptr(); @@ -934,7 +936,8 @@ NeighborSampleResult GpuPsGraphTable::graph_neighbor_sample_v2( result.set_total_sample_size(total_sample_size); thrust::device_vector cumsum_actual_sample_size(len); - thrust::exclusive_scan(t_actual_sample_size, t_actual_sample_size + len, + thrust::exclusive_scan(t_actual_sample_size.begin(), + t_actual_sample_size.end(), cumsum_actual_sample_size.begin(), 0); fill_actual_vals<<>>( val, result.actual_val, actual_sample_size, diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index b860ea5d39cb5..e2f362d407458 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -41,9 +41,7 @@ limitations under the License. 
*/ #include "xpu/kernel/simd.h" #endif -#if defined(PADDLE_WITH_XPU_KP) #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" -#endif namespace paddle { namespace framework { @@ -132,10 +130,8 @@ class HashTable { void show(); -#if defined(PADDLE_WITH_XPU_KP) void set_sparse_sgd(const OptimizerConfig& optimizer_config); void set_embedx_sgd(const OptimizerConfig& optimizer_config); -#endif template void dump_to_cpu(int devid, StreamType stream); @@ -178,9 +174,10 @@ class HashTable { TableContainer* container_; #elif defined(PADDLE_WITH_XPU_KP) XPUCacheArray* container_; - OptimizerConfig* xpu_optimizer_config_; - OptimizerConfig cpu_optimizer_config_; #endif + OptimizerConfig* device_optimizer_config_; + OptimizerConfig host_optimizer_config_; + int BLOCK_SIZE_{256}; float LOAD_FACTOR{0.75f}; size_t capacity_; diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 87b62c6d380a4..df93f05691771 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -95,6 +95,7 @@ __global__ void dy_mf_search_kernel(Table* table, template __global__ void update_kernel(Table* table, + const OptimizerConfig& optimizer_config, const typename Table::key_type* const keys, const GradType* const grads, size_t len, Sgd sgd) { @@ -102,13 +103,14 @@ __global__ void update_kernel(Table* table, if (i < len) { auto it = table->find(keys[i]); if (it != table->end()) { - sgd.update_value((it.getter())->second, grads[i]); + sgd.update_value(optimizer_config, (it.getter())->second, grads[i]); } } } template __global__ void dy_mf_update_kernel(Table* table, + const OptimizerConfig& optimizer_config, const typename Table::key_type* const keys, const char* const grads, size_t len, Sgd sgd, size_t grad_value_size) { @@ -117,7 +119,7 @@ __global__ void dy_mf_update_kernel(Table* table, auto it = table->find(keys[i]); if (it != table->end()) { FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size); - sgd.dy_mf_update_value((it.getter())->second, *cur); + sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, *cur); } else { printf("yxf::push miss key: %d", keys[i]); } @@ -127,6 +129,9 @@ __global__ void dy_mf_update_kernel(Table* table, template HashTable::HashTable(size_t capacity) { container_ = new TableContainer(capacity); + cudaMalloc((void**)&device_optimizer_config_, sizeof(OptimizerConfig)); + cudaMemcpy((void*)device_optimizer_config_, &host_optimizer_config_, + sizeof(OptimizerConfig), cudaMemcpyHostToDevice); rwlock_.reset(new phi::RWLock); } @@ -135,6 +140,22 @@ HashTable::~HashTable() { delete container_; } +template +void HashTable::set_sparse_sgd( + const OptimizerConfig& optimizer_config) { + host_optimizer_config_.set_sparse_sgd(optimizer_config); + cudaMemcpy((void*)device_optimizer_config_, &host_optimizer_config_, + sizeof(OptimizerConfig), cudaMemcpyHostToDevice); +} + +template +void HashTable::set_embedx_sgd( + const OptimizerConfig& optimizer_config) { + host_optimizer_config_.set_embedx_sgd(optimizer_config); + cudaMemcpy((void*)device_optimizer_config_, &host_optimizer_config_, + sizeof(OptimizerConfig), cudaMemcpyHostToDevice); +} + template void HashTable::show() { container_->print(); @@ -279,8 +300,8 @@ void HashTable::update(const KeyType* d_keys, return; } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; - update_kernel<<>>(container_, d_keys, - d_grads, len, sgd); + update_kernel<<>>( + 
container_, *device_optimizer_config_, d_keys, d_grads, len, sgd); } template @@ -293,7 +314,8 @@ void HashTable::update(const KeyType* d_keys, } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; dy_mf_update_kernel<<>>( - container_, d_keys, d_grads, len, sgd, push_grad_value_size_); + container_, *device_optimizer_config_, d_keys, d_grads, len, sgd, + push_grad_value_size_); } template class HashTable; diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps index cd43a73b44ec3..79c5f3d757781 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps @@ -163,7 +163,7 @@ __global__ void search_kernel(Table& table, const KeyType* const keys, } template -__global__ void update_kernel(OptimizerConfig& optimizer_config, Table& table, +__global__ void update_kernel(Table& table, OptimizerConfig& optimizer_config, const KeyType* const keys, const GradType* const grads, long long len) { int cid = core_id(); @@ -202,12 +202,9 @@ HashTable::HashTable(size_t capacity) { sizeof(XPUCacheArray)); xpu_memcpy((void*)container_, &tmp_container, sizeof(XPUCacheArray), XPU_HOST_TO_DEVICE); - - OptimizerConfig tmp_opt_config; - xpu_malloc(reinterpret_cast(&xpu_optimizer_config_), + xpu_malloc(reinterpret_cast(&device_optimizer_config_), sizeof(OptimizerConfig)); - - xpu_memcpy((void*)xpu_optimizer_config_, &tmp_opt_config, + xpu_memcpy((void*)device_optimizer_config_, &host_optimizer_config_, sizeof(OptimizerConfig), XPU_HOST_TO_DEVICE); rwlock_.reset(new phi::RWLock); @@ -216,7 +213,7 @@ HashTable::HashTable(size_t capacity) { template HashTable::~HashTable() { xpu_free((void*)container_); - xpu_free((void*)xpu_optimizer_config_); + xpu_free((void*)device_optimizer_config_); } template @@ -227,28 +224,16 @@ void HashTable::show() { template void HashTable::set_sparse_sgd( const OptimizerConfig& optimizer_config) { - cpu_optimizer_config_.nonclk_coeff = optimizer_config.nonclk_coeff; - cpu_optimizer_config_.clk_coeff = optimizer_config.clk_coeff; - cpu_optimizer_config_.min_bound = optimizer_config.min_bound; - cpu_optimizer_config_.max_bound = optimizer_config.max_bound; - cpu_optimizer_config_.learning_rate = optimizer_config.learning_rate; - cpu_optimizer_config_.initial_g2sum = optimizer_config.initial_g2sum; - cpu_optimizer_config_.initial_range = optimizer_config.initial_range; - xpu_memcpy((void*)xpu_optimizer_config_, &cpu_optimizer_config_, + host_optimizer_config_.set_sparse_sgd(optimizer_config); + xpu_memcpy((void*)device_optimizer_config_, &host_optimizer_config_, sizeof(OptimizerConfig), XPU_HOST_TO_DEVICE); } template void HashTable::set_embedx_sgd( const OptimizerConfig& optimizer_config) { - cpu_optimizer_config_.mf_create_thresholds = - optimizer_config.mf_create_thresholds; - cpu_optimizer_config_.mf_learning_rate = optimizer_config.mf_learning_rate; - cpu_optimizer_config_.mf_initial_g2sum = optimizer_config.mf_initial_g2sum; - cpu_optimizer_config_.mf_initial_range = optimizer_config.mf_initial_range; - cpu_optimizer_config_.mf_min_bound = optimizer_config.mf_min_bound; - cpu_optimizer_config_.mf_max_bound = optimizer_config.mf_max_bound; - xpu_memcpy((void*)xpu_optimizer_config_, &cpu_optimizer_config_, + host_optimizer_config_.set_embedx_sgd(optimizer_config); + xpu_memcpy((void*)device_optimizer_config_, &host_optimizer_config_, sizeof(OptimizerConfig), XPU_HOST_TO_DEVICE); } @@ -306,7 +291,7 @@ void HashTable::update(const 
KeyType* d_keys, long long c_len = (long long)len; update_kernel, GradType><<<4, 64, stream>>>( - *xpu_optimizer_config_, *container_, d_keys, d_grads, c_len); + *container_, *device_optimizer_config_, d_keys, d_grads, c_len); } template diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 6379f7ee91264..e53a962c5abde 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -65,10 +65,8 @@ class HeterComm { void push_sparse(int num, KeyType* d_keys, GradType* d_grads, size_t len); #endif -#if defined(PADDLE_WITH_XPU_KP) void set_sparse_sgd(const OptimizerConfig& optimizer_config); void set_embedx_sgd(const OptimizerConfig& optimizer_config); -#endif int log2i(int x); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 098adc2bdeb88..2a4f535ef701f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -342,7 +342,6 @@ int HeterComm::get_index_by_devid(int devid) { return resource_->get_index_by_devid(devid); } -#if defined(PADDLE_WITH_XPU_KP) template void HeterComm::set_sparse_sgd( const OptimizerConfig& optimizer_config) { @@ -358,7 +357,6 @@ void HeterComm::set_embedx_sgd( table->set_embedx_sgd(optimizer_config); } } -#endif template void HeterComm::build_ps( diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index 581b0d511c23e..66e06b13b046f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -48,6 +48,14 @@ int HeterPs::get_index_by_devid(int devid) { return comm_->get_index_by_devid(devid); } +void HeterPs::set_sparse_sgd(const OptimizerConfig& optimizer_config) { + comm_->set_sparse_sgd(optimizer_config); +} + +void HeterPs::set_embedx_sgd(const OptimizerConfig& optimizer_config) { + comm_->set_embedx_sgd(optimizer_config); +} + void HeterPs::end_pass() { comm_->end_pass(); } void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index 7060817be91eb..70b88350f2720 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -44,10 +44,8 @@ class HeterPs : public HeterPsBase { int comm_size) override; #endif -#if defined(PADDLE_WITH_XPU_KP) void set_sparse_sgd(const OptimizerConfig& optimizer_config) override; void set_embedx_sgd(const OptimizerConfig& optimizer_config) override; -#endif void end_pass() override; int get_index_by_devid(int devid) override; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index 79061ab66af1c..0727e2c2dbce1 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -16,9 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" -#if defined(PADDLE_WITH_XPU_KP) #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" -#endif #ifdef PADDLE_WITH_HETERPS @@ -48,10 +46,8 @@ class HeterPsBase { virtual void push_sparse(int num, FeatureKey* d_keys, FeaturePushValue* d_grads, size_t len) = 0; -#if defined(PADDLE_WITH_XPU_KP) - virtual void set_sparse_sgd(const OptimizerConfig& optimizer_config) {} - virtual void set_embedx_sgd(const OptimizerConfig& optimizer_config) {} -#endif + virtual void set_sparse_sgd(const OptimizerConfig& optimizer_config) = 0; + virtual void set_embedx_sgd(const OptimizerConfig& optimizer_config) = 0; static HeterPsBase* get_instance(size_t capacity, std::shared_ptr resource); diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index ebf7dd277c7d6..065d5e6d527fc 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -35,58 +35,64 @@ class Optimizer { void initialize() {} - __device__ void update_lr(float& w, float& g2sum, float g, // NOLINT + __device__ void update_lr(const OptimizerConfig& optimizer_config, + float& w, // NOLINT + float& g2sum, float g, // NOLINT float scale) { double add_g2sum = 0; - double ratio = optimizer_config::learning_rate * - sqrt(optimizer_config::initial_g2sum / - (optimizer_config::initial_g2sum + g2sum)); + double ratio = optimizer_config.learning_rate * + sqrt(optimizer_config.initial_g2sum / + (optimizer_config.initial_g2sum + g2sum)); double scaled_grad = g / scale; w += scaled_grad * ratio; - if (w < optimizer_config::min_bound) w = optimizer_config::min_bound; - if (w > optimizer_config::max_bound) w = optimizer_config::max_bound; + if (w < optimizer_config.min_bound) w = optimizer_config.min_bound; + if (w > optimizer_config.max_bound) w = optimizer_config.max_bound; add_g2sum += scaled_grad * scaled_grad; g2sum += add_g2sum; } - __device__ void update_mf(int n, float* w, float& g2sum, // NOLINT + __device__ void update_mf(const OptimizerConfig& optimizer_config, int n, + float* w, + float& g2sum, // NOLINT const float* g, float scale) { double add_g2sum = 0; - double ratio = optimizer_config::mf_learning_rate * - sqrt(optimizer_config::mf_initial_g2sum / - (optimizer_config::mf_initial_g2sum + g2sum)); + double ratio = optimizer_config.mf_learning_rate * + sqrt(optimizer_config.mf_initial_g2sum / + (optimizer_config.mf_initial_g2sum + g2sum)); for (int i = 0; i < n; ++i) { double scaled_grad = g[i] / scale; w[i] += scaled_grad * ratio; - if (w[i] < optimizer_config::mf_min_bound) - w[i] = optimizer_config::mf_min_bound; - if (w[i] > optimizer_config::mf_max_bound) - w[i] = optimizer_config::mf_max_bound; + if (w[i] < optimizer_config.mf_min_bound) + w[i] = optimizer_config.mf_min_bound; + if (w[i] > optimizer_config.mf_max_bound) + w[i] = optimizer_config.mf_max_bound; add_g2sum += scaled_grad * scaled_grad; } g2sum += add_g2sum / n; } - __device__ void update_value(ValType& val, const GradType& grad) { // NOLINT + __device__ void update_value(const OptimizerConfig& optimizer_config, + ValType& val, // NOLINT + const GradType& grad) { val.slot = grad.slot; val.show += grad.show; val.clk += grad.clk; - val.delta_score += optimizer_config::nonclk_coeff * (grad.show - grad.clk) + - optimizer_config::clk_coeff * grad.clk; + val.delta_score += 
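[Editor's note] The hunks above (hashtable_kernel.kps) and the OptimizerConfig rewrite below replace per-translation-unit `__constant__` settings with a config object kept on the host and mirrored into device memory whenever a setter runs, so kernels read it through an ordinary pointer argument. A minimal sketch of that mirroring pattern, reusing only names that appear in this patch (OptimizerConfig, xpu_malloc, xpu_memcpy, XPU_HOST_TO_DEVICE); the ConfigMirror wrapper itself is hypothetical and not part of the actual HashTable code:

struct ConfigMirror {
  OptimizerConfig host_config_;     // authoritative copy, updated on the host
  OptimizerConfig* device_config_;  // device-side copy passed to kernels

  void Init() {
    xpu_malloc(reinterpret_cast<void**>(&device_config_), sizeof(OptimizerConfig));
    Sync();
  }
  void SetSparseSGD(const OptimizerConfig& c) {
    host_config_.set_sparse_sgd(c);
    Sync();  // every host-side change is pushed to the device copy
  }
  void Sync() {
    xpu_memcpy(device_config_, &host_config_, sizeof(OptimizerConfig),
               XPU_HOST_TO_DEVICE);
  }
};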
optimizer_config.nonclk_coeff * (grad.show - grad.clk) + + optimizer_config.clk_coeff * grad.clk; - update_lr(val.lr, val.lr_g2sum, grad.lr_g, grad.show); + update_lr(optimizer_config, val.lr, val.lr_g2sum, grad.lr_g, grad.show); if (val.mf_size == 0) { - if (optimizer_config::mf_create_thresholds <= - optimizer_config::nonclk_coeff * (val.show - val.clk) + - optimizer_config::clk_coeff * val.clk) { + if (optimizer_config.mf_create_thresholds <= + optimizer_config.nonclk_coeff * (val.show - val.clk) + + optimizer_config.clk_coeff * val.clk) { val.mf_size = MF_DIM + 1; val.mf[0] = 0; int tid_x = blockIdx.x * blockDim.x + threadIdx.x; @@ -94,30 +100,31 @@ class Optimizer { curand_init(clock64(), tid_x, 0, &state); for (int i = 0; i < MF_DIM; ++i) { val.mf[i + 1] = - (curand_uniform(&state)) * optimizer_config::mf_initial_range; + (curand_uniform(&state)) * optimizer_config.mf_initial_range; } } } else { - update_mf(MF_DIM, &val.mf[1], val.mf[0], grad.mf_g, grad.show); + update_mf(optimizer_config, MF_DIM, &val.mf[1], val.mf[0], grad.mf_g, + grad.show); } } - __device__ void dy_mf_update_value(ValType* ptr, const GradType& grad) { + __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, + ValType* ptr, const GradType& grad) { ptr->slot = grad.slot; ptr->show += grad.show; ptr->clk += grad.clk; - ptr->delta_score += - optimizer_config::nonclk_coeff * (grad.show - grad.clk) + - optimizer_config::clk_coeff * grad.clk; + ptr->delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) + + optimizer_config.clk_coeff * grad.clk; - update_lr(ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show); + update_lr(optimizer_config, ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show); // use MF_DIM temporarily // ptr->mf_dim = grad.mf_dim; if (ptr->mf_size == 0) { - if (optimizer_config::mf_create_thresholds <= - optimizer_config::nonclk_coeff * (ptr->show - ptr->clk) + - optimizer_config::clk_coeff * ptr->clk) { + if (optimizer_config.mf_create_thresholds <= + optimizer_config.nonclk_coeff * (ptr->show - ptr->clk) + + optimizer_config.clk_coeff * ptr->clk) { // ptr->mf_size = ptr->mf_dim + 1; ptr->mf_size = MF_DIM + 1; @@ -127,11 +134,11 @@ class Optimizer { curand_init(clock64(), tid_x, 0, &state); for (int i = 0; i < MF_DIM; ++i) { ptr->mf[i + 1] = - (curand_uniform(&state)) * optimizer_config::mf_initial_range; + (curand_uniform(&state)) * optimizer_config.mf_initial_range; } } } else { - update_mf(MF_DIM, &(ptr->mf[1]), ptr->mf[0], grad.mf_g, + update_mf(optimizer_config, MF_DIM, &(ptr->mf[1]), ptr->mf[0], grad.mf_g, grad.show); // for local test } } diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h b/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h index 2a80aa4b52d91..03caeb984f7c9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h @@ -14,50 +14,69 @@ limitations under the License. 
*/ #pragma once -#if defined(PADDLE_WITH_CUDA) +namespace paddle { +namespace framework { -namespace optimizer_config { +class OptimizerConfig { + public: + float nonclk_coeff = 0.1; + float clk_coeff = 1; -__constant__ float nonclk_coeff = 0.1; -__constant__ float clk_coeff = 1; + float min_bound = -10; + float max_bound = 10; + float learning_rate = 0.05; + float initial_g2sum = 3.0; + float initial_range = 0; -__constant__ float min_bound = -10; -__constant__ float max_bound = 10; -__constant__ float learning_rate = 0.05; -__constant__ float initial_g2sum = 3.0; -__constant__ float initial_range = 0; + float mf_create_thresholds = 10; + float mf_learning_rate = 0.05; + float mf_initial_g2sum = 3.0; + float mf_initial_range = 1e-4; + float mf_min_bound = -10; + float mf_max_bound = 10; -__constant__ float mf_create_thresholds = 10; -__constant__ float mf_learning_rate = 0.05; -__constant__ float mf_initial_g2sum = 3.0; -__constant__ float mf_initial_range = 1e-4; -__constant__ float mf_min_bound = -10; -__constant__ float mf_max_bound = 10; -} // namespace optimizer_config + void set_sparse_sgd(float nonclk_coeff, float clk_coeff, float min_bound, + float max_bound, float learning_rate, float initial_g2sum, + float initial_range) { + this->nonclk_coeff = nonclk_coeff; + this->clk_coeff = clk_coeff; + this->min_bound = min_bound; + this->max_bound = max_bound; + this->learning_rate = learning_rate; + this->initial_g2sum = initial_g2sum; + this->initial_range = initial_range; + } -#elif defined(PADDLE_WITH_XPU_KP) -namespace paddle { -namespace framework { + void set_sparse_sgd(const OptimizerConfig& optimizer_config) { + this->nonclk_coeff = optimizer_config.nonclk_coeff; + this->clk_coeff = optimizer_config.clk_coeff; + this->min_bound = optimizer_config.min_bound; + this->max_bound = optimizer_config.max_bound; + this->learning_rate = optimizer_config.learning_rate; + this->initial_g2sum = optimizer_config.initial_g2sum; + this->initial_range = optimizer_config.initial_range; + } -class OptimizerConfig { - public: - float nonclk_coeff; - float clk_coeff; - - float min_bound; - float max_bound; - float learning_rate; - float initial_g2sum; - float initial_range; - - float mf_create_thresholds; - float mf_learning_rate; - float mf_initial_g2sum; - float mf_initial_range; - float mf_min_bound; - float mf_max_bound; + void set_embedx_sgd(float mf_create_thresholds, float mf_learning_rate, + float mf_initial_g2sum, float mf_initial_range, + float mf_min_bound, float mf_max_bound) { + this->mf_create_thresholds = mf_create_thresholds; + this->mf_learning_rate = mf_learning_rate; + this->mf_initial_g2sum = mf_initial_g2sum; + this->mf_initial_range = mf_initial_range; + this->mf_min_bound = mf_min_bound; + this->mf_max_bound = mf_max_bound; + } + + void set_embedx_sgd(const OptimizerConfig& optimizer_config) { + this->mf_create_thresholds = optimizer_config.mf_create_thresholds; + this->mf_learning_rate = optimizer_config.mf_learning_rate; + this->mf_initial_g2sum = optimizer_config.mf_initial_g2sum; + this->mf_initial_range = optimizer_config.mf_initial_range; + this->mf_min_bound = optimizer_config.mf_min_bound; + this->mf_max_bound = optimizer_config.mf_max_bound; + } }; + } // namespace framework } // namespace paddle - -#endif diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index cf7d98db27e84..3df5a4b473861 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ 
-181,35 +181,21 @@ void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, float min_bound, float max_bound, float learning_rate, float initial_g2sum, float initial_range) { - cudaMemcpyToSymbol(optimizer_config::nonclk_coeff, &nonclk_coeff, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::clk_coeff, &clk_coeff, sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::min_bound, &min_bound, sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::max_bound, &max_bound, sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::learning_rate, &learning_rate, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::initial_g2sum, &initial_g2sum, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::initial_range, &initial_range, - sizeof(float)); + OptimizerConfig optimizer_config; + optimizer_config.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, max_bound, + learning_rate, initial_g2sum, initial_range); + HeterPs_->set_sparse_sgd(optimizer_config); } void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, float mf_learning_rate, float mf_initial_g2sum, float mf_initial_range, float mf_min_bound, float mf_max_bound) { - cudaMemcpyToSymbol(optimizer_config::mf_create_thresholds, - &mf_create_thresholds, sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::mf_learning_rate, &mf_learning_rate, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::mf_initial_g2sum, &mf_initial_g2sum, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::mf_initial_range, &mf_initial_range, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::mf_min_bound, &mf_min_bound, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::mf_max_bound, &mf_max_bound, - sizeof(float)); + OptimizerConfig optimizer_config; + optimizer_config.set_embedx_sgd(mf_create_thresholds, mf_learning_rate, + mf_initial_g2sum, mf_initial_range, + mf_min_bound, mf_max_bound); + HeterPs_->set_embedx_sgd(optimizer_config); } } // end namespace framework diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps index 571a090b9b4a6..28dd873a117dc 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps @@ -256,13 +256,8 @@ void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, float learning_rate, float initial_g2sum, float initial_range) { OptimizerConfig optimizer_config; - optimizer_config.nonclk_coeff = nonclk_coeff; - optimizer_config.clk_coeff = clk_coeff; - optimizer_config.min_bound = min_bound; - optimizer_config.max_bound = max_bound; - optimizer_config.learning_rate = learning_rate; - optimizer_config.initial_g2sum = initial_g2sum; - optimizer_config.initial_range = initial_range; + optimizer_config.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, max_bound, + learning_rate, initial_g2sum, initial_range); HeterPs_->set_sparse_sgd(optimizer_config); } @@ -271,12 +266,9 @@ void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, float mf_initial_range, float mf_min_bound, float mf_max_bound) { OptimizerConfig optimizer_config; - optimizer_config.mf_create_thresholds = mf_create_thresholds; - optimizer_config.mf_learning_rate = mf_learning_rate; - optimizer_config.mf_initial_g2sum = mf_initial_g2sum; - optimizer_config.mf_initial_range = mf_initial_range; - optimizer_config.mf_min_bound = mf_min_bound; - optimizer_config.mf_max_bound = mf_max_bound; + optimizer_config.set_embedx_sgd(mf_create_thresholds, mf_learning_rate, + mf_initial_g2sum, mf_initial_range, + 
mf_min_bound, mf_max_bound); HeterPs_->set_embedx_sgd(optimizer_config); } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index a3b49476d820f..283e79b81e7c6 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -118,6 +118,7 @@ if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base DEPS placement_pass_base DIR mkldnn) pass_library(mkldnn_inplace_pass inference DEPS mkldnn_placement_pass op_registry elementwise_add_op gelu_op activation_op softmax_op softmax DIR mkldnn) pass_library(depthwise_conv_mkldnn_pass base DIR mkldnn) + pass_library(conv_affine_channel_mkldnn_fuse_pass inference DIR mkldnn) pass_library(conv_bias_mkldnn_fuse_pass inference DIR mkldnn) pass_library(conv_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(conv_concat_relu_mkldnn_fuse_pass inference DIR mkldnn) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index d7e265fe28bf9..96a1e5c0719dc 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -81,6 +81,7 @@ struct PDNode { bool IsVar() const { return type_ == Type::kVar; } const std::string& name() const { return name_; } + const PDPattern* pdpattern() const { return pattern_; } PDNode& operator=(const PDNode&) = delete; PDNode(const PDNode&) = delete; @@ -277,7 +278,44 @@ class PDPattern { */ class GraphPatternDetector { public: - using subgraph_t = std::map; + struct NodeIdCompare { + bool operator()(Node* node1, Node* node2) const { + return node1->id() < node2->id(); + } + }; + + struct PDNodeCompare { + bool operator()(const PDNode* node1, const PDNode* node2) const { + auto& nodes1 = node1->pdpattern()->nodes(); + auto& nodes2 = node2->pdpattern()->nodes(); + if (nodes1.size() != nodes2.size()) { + return nodes1.size() < nodes2.size(); + } else { + std::string pdnode_hash_key1 = ""; + std::string pdnode_hash_key2 = ""; + for (auto& node : nodes1) { + pdnode_hash_key1 += node.get()->name(); + pdnode_hash_key1 += "#"; + } + pdnode_hash_key1 += node1->name(); + for (auto& node : nodes2) { + pdnode_hash_key2 += node.get()->name(); + pdnode_hash_key2 += "#"; + } + pdnode_hash_key2 += node2->name(); + + auto pdnode_key1 = + std::to_string(std::hash()(pdnode_hash_key1)); + auto pdnode_key2 = + std::to_string(std::hash()(pdnode_hash_key2)); + + return pdnode_key1 < pdnode_key2; + } + return false; + } + }; + + using subgraph_t = std::map; // Operate on the detected pattern. using handle_t = @@ -321,7 +359,8 @@ class GraphPatternDetector { using hit_rcd_t = std::pair; PDPattern pattern_; - std::map> pdnodes2nodes_; + std::map, PDNodeCompare> + pdnodes2nodes_; }; // some helper methods. diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc index 7c517a50e9af4..84a14200cb7a5 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { @@ -68,7 +69,7 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { std::vector weight_decay_values{}; // use map store ? 
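[Editor's note] The NodeIdCompare/PDNodeCompare comparators added above and the switch from `graph->Nodes()` to `TopologySortOperations(*graph)` just below serve the same goal: visiting graph nodes in a deterministic order instead of whatever an unordered container yields. A toy, self-contained sketch of the comparator-keyed map idea (ToyNode and related names are made up for illustration):

#include <map>
#include <string>

struct ToyNode { int id; std::string name; };

// Orders map keys by node id, so iteration order is stable across runs.
struct ToyNodeIdCompare {
  bool operator()(const ToyNode* a, const ToyNode* b) const {
    return a->id < b->id;
  }
};

using ToySubgraph = std::map<const ToyNode*, const ToyNode*, ToyNodeIdCompare>;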
- for (auto* node : graph->Nodes()) { + for (auto* node : TopologySortOperations(*graph)) { if (!node->IsOp()) { continue; } diff --git a/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc new file mode 100644 index 0000000000000..50e751e02dfa0 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.cc @@ -0,0 +1,272 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.h" + +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace phi { +class DenseTensor; +} // namespace phi + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { + +class Node; + +#define GET_CONV_BN_NODES(pattern_name) \ + /* OPERATORS */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \ + /* CONV inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \ + /* CONV outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \ + /* Affine Channel inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \ + /* Affine channel outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */ + +void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, + const ir::Node& ac_scale, + const LoDTensor& ac_bias_tensor, + LoDTensor* eltwise_y_in_tensor) { + using EigenVectorArrayMap = + Eigen::Map>; + using ConstEigenVectorArrayMap = + Eigen::Map>; + using EigenMatrixArrayMap = Eigen::Map< + Eigen::Array>; + + // Re-compute bias of conv2d from AffineChannel + PADDLE_ENFORCE_EQ( + eltwise_y_in_tensor->dims(), ac_bias_tensor.dims(), + platform::errors::InvalidArgument( + "Tensor elementwise y(%d) and activation bias(%d) must have same " + "dimension.", + eltwise_y_in_tensor->dims().size(), ac_bias_tensor.dims().size())); + + auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable(); + + ConstEigenVectorArrayMap scale_array(scale_tensor->data(), + scale_tensor->numel(), 1); + ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data(), + ac_bias_tensor.numel(), 1); + + EigenVectorArrayMap eltwise_y_in_array( + eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 1); + + eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array; + + // Re-compute weight of conv2d from AffineChannel + auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); + auto weights_shape = weights->dims(); + auto weights_shape_2d = phi::flatten_to_2d(weights_shape, 1); + auto* weights_data = 
weights->mutable_data(platform::CPUPlace()); + + EigenMatrixArrayMap weights_array_2d(weights_data, weights_shape_2d[0], + weights_shape_2d[1]); + + weights_array_2d.colwise() *= scale_array; + + // Check for subnormal values that slows down convolution execution + for (int i = 0; i < weights->numel(); ++i) { + if (std::fpclassify(weights_data[i]) == FP_SUBNORMAL) weights_data[i] = 0; + } +} + +ConvAffineChannelFusePass::ConvAffineChannelFusePass() { + AddOpCompat(OpCompat("conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddInput("ResidualData") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + .IsOptional() + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "AnyLayout"}) + .End(); + + AddOpCompat(OpCompat("affine_channel")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("data_layout") + .IsStringIn({"NCHW", "AnyLayout"}) + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumEQ(1) + .End(); +} + +void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + FusePassBase::Init(name_scope_, graph); + + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), + name_scope_); + conv_ac_pattern(conv_input, false /*with_eltwise_add*/); + + int found_conv_ac_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "ConvAffineChannelFusePass in op compat failed."; + return; + } + + VLOG(4) << "handle ConvAffineChannel fuse"; + + GET_CONV_BN_NODES(conv_ac_pattern); + + auto data_format = conv->Op()->GetAttrIfExists("data_format"); + if (data_format == "AnyLayout") { + LOG_FIRST_N(WARNING, 1) << "conv_affine_channel_fuse_pass is enabled, " + "it's wrong if data_format of conv is not " + "NCHW."; + } + + // Get affine_channel bias for resizing eltwise_y! 
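[Editor's note] What recompute_bias_and_weights() above computes, written out: an affine_channel y[oc] = scale[oc] * conv_out[oc] + bias[oc] folds into the convolution by scaling each output channel's weights by scale[oc] and feeding bias into the new elementwise_add. A hedged, plain-array restatement of that arithmetic (the Eigen-based function above is the real implementation):

#include <vector>

void fold_affine_channel(std::vector<float>* weights,   // flattened to [oc][in*kh*kw]
                         std::vector<float>* new_bias,  // starts as zeros, length oc
                         const std::vector<float>& scale,
                         const std::vector<float>& ac_bias,
                         int oc, int inner) {
  for (int c = 0; c < oc; ++c) {
    // eltwise_y = eltwise_y * scale + ac_bias; with a zero start this is ac_bias.
    (*new_bias)[c] = (*new_bias)[c] * scale[c] + ac_bias[c];
    // every weight of output channel c is scaled by scale[c]
    for (int k = 0; k < inner; ++k) {
      (*weights)[c * inner + k] *= scale[c];
    }
  }
}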
+ auto* ac_bias_tensor = + scope->FindVar(ac_bias->Name())->GetMutable(); + + // Create eltwise_y (conv bias) variable + VarDesc eltwise_y_in_desc( + patterns::PDNodeName(name_scope_, "eltwise_y_in")); + // Set shape && datatype manually + eltwise_y_in_desc.SetShape(phi::vectorize(ac_bias_tensor->dims())); + eltwise_y_in_desc.SetDataType( + framework::TransToProtoVarType(ac_bias_tensor->dtype())); + eltwise_y_in_desc.SetLoDLevel(ac_bias->Var()->GetLoDLevel()); + eltwise_y_in_desc.SetPersistable(true); + + // Initialize eltwise_y + auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc); + auto* eltwise_y_in_tensor = + scope->Var(eltwise_y_in_node->Name())->GetMutable(); + eltwise_y_in_tensor->Resize(ac_bias_tensor->dims()); + std::fill_n(eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 0.0f); + + // update weights and biases + recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, + eltwise_y_in_tensor); + + // create an elementwise add node. + OpDesc desc; + desc.SetInput("X", std::vector({conv_out->Name()})); + desc.SetInput("Y", std::vector({eltwise_y_in_node->Name()})); + desc.SetOutput("Out", std::vector({ac_out->Name()})); + desc.SetType("elementwise_add"); + desc.SetAttr("axis", 1); + desc.SetAttr("use_mkldnn", conv->Op()->GetAttrIfExists("use_mkldnn")); + + auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. + + GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel}); + + IR_NODE_LINK_TO(conv_out, eltwise_op); + IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); + IR_NODE_LINK_TO(eltwise_op, ac_out); + found_conv_ac_count++; + }; + + gpd(graph, handler); + + AddStatis(found_conv_ac_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_affine_channel_mkldnn_fuse_pass, + paddle::framework::ir::ConvAffineChannelFusePass); + +REGISTER_PASS_CAPABILITY(conv_affine_channel_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("conv2d", 1) + .EQ("affine_channel", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.h new file mode 100644 index 0000000000000..075b6d7220316 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_affine_channel_mkldnn_fuse_pass.h @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the Conv and ConvAffineChannel. 
+ */ +class Graph; + +class ConvAffineChannelFusePass : public FusePassBase { + public: + ConvAffineChannelFusePass(); + virtual ~ConvAffineChannelFusePass() {} + + protected: + void ApplyImpl(ir::Graph*) const override; + const std::string name_scope_{"conv_affine_channel_mkldnn_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index 9b12870a2bb9b..aec40a5a7ebdd 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -95,8 +95,46 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, return; } +void add_sparse_optimizer( + std::unordered_map& config, // NOLINT + const ::paddle::SparseCommonSGDRuleParameter& sgd_param, + const std::string& prefix = "") { + auto optimizer_name = sgd_param.name(); + if (optimizer_name == "naive") { + config[prefix + "learning_rate"] = sgd_param.naive().learning_rate(); + config[prefix + "initial_range"] = sgd_param.naive().initial_range(); + if (sgd_param.naive().weight_bounds_size() == 2) { + config[prefix + "min_bound"] = sgd_param.naive().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.naive().weight_bounds()[1]; + } + } else if (optimizer_name == "adagrad") { + config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adagrad().initial_range(); + config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum(); + if (sgd_param.adagrad().weight_bounds_size() == 2) { + config[prefix + "min_bound"] = sgd_param.adagrad().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1]; + } + } else if (optimizer_name == "std_adagrad") { + config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adagrad().initial_range(); + config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum(); + if (sgd_param.adagrad().weight_bounds_size() == 2) { + config[prefix + "min_bound"] = sgd_param.adagrad().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1]; + } + } else if (optimizer_name == "adam") { + config[prefix + "learning_rate"] = sgd_param.adam().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adam().initial_range(); + if (sgd_param.adam().weight_bounds_size() == 2) { + config[prefix + "min_bound"] = sgd_param.adam().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adam().weight_bounds()[1]; + } + } +} + void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { - // add for hbmps optimizer config + // optimizer config for hbmps auto fleet_desc_str = trainer_desc.fleet_desc(); google::protobuf::TextFormat::ParseFromString(fleet_desc_str, &_ps_param); auto sparse_table = @@ -105,7 +143,7 @@ void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { auto sparse_table_accessor_parameter = sparse_table_accessor.downpour_accessor_param(); auto accessor_class = sparse_table_accessor.accessor_class(); - // gpups' sparse table optimizer config + // NOTE(zhangminxu): gpups' sparse table optimizer config, // now only support single sparse table // auto sparse_table = param_.sparse_table(0); std::unordered_map config; @@ -126,7 +164,14 @@ void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { config["max_bound"] = sparse_table_accessor.sparse_sgd_param().weight_bounds()[1]; } + // NOTE(zhangminxu): 
for DownpourCtrAccessor & DownpourCtrDoubleAccessor, + // optimizer config for embed_w & embedx_w is the same config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); + config["mf_learning_rate"] = config["learning_rate"]; + config["mf_initial_g2sum"] = config["initial_g2sum"]; + config["mf_initial_range"] = config["initial_range"]; + config["mf_min_bound"] = config["min_bound"]; + config["mf_max_bound"] = config["max_bound"]; } else if (accessor_class == "DownpourSparseValueAccessor") { auto optimizer_name = sparse_table_accessor.sparse_commonsgd_param().name(); if (optimizer_name == "naive") { @@ -186,71 +231,12 @@ void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { accessor_class == "DownpourDoubleUnitAccessor") { config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff(); config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff(); - auto optimizer_name = sparse_table_accessor.embedx_sgd_param().name(); - if (optimizer_name == "naive") { - config["mf_learning_rate"] = - sparse_table_accessor.embedx_sgd_param().naive().learning_rate(); - config["mf_initial_range"] = - sparse_table_accessor.embedx_sgd_param().naive().initial_range(); - if (sparse_table_accessor.embedx_sgd_param() - .naive() - .weight_bounds_size() == 2) { - config["mf_min_bound"] = - sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[0]; - config["mf_max_bound"] = - sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[1]; - } - } else if (optimizer_name == "adagrad") { - config["mf_learning_rate"] = - sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate(); - config["mf_initial_range"] = - sparse_table_accessor.embedx_sgd_param().adagrad().initial_range(); - config["mf_initial_g2sum"] = - sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum(); - if (sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds_size() == 2) { - config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds()[0]; - config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds()[1]; - } - } else if (optimizer_name == "std_adagrad") { - config["mf_learning_rate"] = - sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate(); - config["mf_initial_range"] = - sparse_table_accessor.embedx_sgd_param().adagrad().initial_range(); - config["mf_initial_g2sum"] = - sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum(); - if (sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds_size() == 2) { - config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds()[0]; - config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds()[1]; - } - } else if (optimizer_name == "adam") { - config["mf_learning_rate"] = - sparse_table_accessor.embedx_sgd_param().adam().learning_rate(); - config["mf_initial_range"] = - sparse_table_accessor.embedx_sgd_param().adam().initial_range(); - if (sparse_table_accessor.embedx_sgd_param() - .adam() - .weight_bounds_size() == 2) { - config["mf_min_bound"] = - sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[0]; - config["mf_max_bound"] = - sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[1]; - } - } config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); + // optimizer config for embed_w and embedx + add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param()); + 
add_sparse_optimizer(config, sparse_table_accessor.embedx_sgd_param(), + "mf_"); } - auto ps_gpu_wrapper = paddle::framework::PSGPUWrapper::GetInstance(); ps_gpu_wrapper->InitializeGPUServer(config); } diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 5186f8fcc1c51..8ce18d89c9b43 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -180,6 +180,11 @@ void TensorFromArray(const T* src, const size_t& array_size, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_MLU + else if (platform::is_mlu_place(dst_place)) { // NOLINT + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); + } +#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE else if (platform::is_custom_place(dst_place)) { // NOLINT memory::Copy( @@ -247,9 +252,7 @@ void TensorFromVector(const std::vector& src, #endif #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(dst_place)) { // NOLINT - memory::Copy( - dst_place, dst_ptr, src_place, src_ptr, size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr); } #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -448,9 +451,7 @@ inline void TensorToVector(const Tensor& src, #endif #ifdef PADDLE_WITH_MLU else if (platform::is_mlu_place(src.place())) { // NOLINT - memory::Copy( - dst_place, dst_ptr, src.place(), src_ptr, size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr); } #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 2496d4d040e2e..b86b4fec8a571 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -37,7 +37,7 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/port.h" #ifdef PADDLE_WITH_PSLIB -#include +#include "proto/ps.pb.h" #endif namespace paddle { diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 7fae481f58289..633f481df808b 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -109,7 +109,11 @@ endif() set_target_properties(paddle_inference_shared PROPERTIES OUTPUT_NAME paddle_inference) if(NOT APPLE AND NOT WIN32) # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac. 
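[Editor's note] Referring back to the add_sparse_optimizer() helper introduced in ps_gpu_trainer.cc above: the same routine is called once without a prefix for embed_w and once with "mf_" for embedx_w, so the config map ends up with parallel key sets. A hedged sketch of the resulting keys (the 0.05 / 3.0 values match the OptimizerConfig defaults in this patch; proto accessors are omitted):

#include <string>
#include <unordered_map>

void fill_example(std::unordered_map<std::string, float>* config) {
  // embed_w keys, no prefix (e.g. an adagrad rule)
  (*config)["learning_rate"] = 0.05f;
  (*config)["initial_g2sum"] = 3.0f;
  // embedx_w keys, same fields with the "mf_" prefix
  (*config)["mf_learning_rate"] = 0.05f;
  (*config)["mf_initial_g2sum"] = 3.0f;
}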
- set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference.map") + if (WITH_CUSTOM_DEVICE) + set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference_custom_device.map") + else() + set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_inference.map") + endif() set_target_properties(paddle_inference_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") # check symbol hidden FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 4827fe6c1ac97..735e1b7be4c1f 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -158,6 +158,19 @@ void AnalysisConfig::EnableNpu(int device_id) { Update(); } +void AnalysisConfig::EnableCustomDevice(const std::string &device_type, + int device_id) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + use_custom_device_ = true; + custom_device_id_ = device_id; + custom_device_type_ = device_type; +#else + LOG(ERROR) << "Please compile with CustomDevice to EnableCustomDevice()"; + use_custom_device_ = false; +#endif + Update(); +} + void AnalysisConfig::EnableIpu(int ipu_device_num, int ipu_micro_batch_size, bool ipu_enable_pipelining, int ipu_batches_per_step) { @@ -324,6 +337,11 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { // fleet exe related CP_MEMBER(dist_config_); + // custom device related. + CP_MEMBER(use_custom_device_); + CP_MEMBER(custom_device_type_); + CP_MEMBER(custom_device_id_); + if (use_gpu_) { PADDLE_ENFORCE_EQ(use_xpu_, false, platform::errors::InvalidArgument( @@ -539,7 +557,8 @@ void AnalysisConfig::Update() { if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu())) || ((use_xpu() ^ pass_builder_->use_xpu())) || ((use_npu() ^ pass_builder_->use_npu())) || - ((use_ipu() ^ pass_builder_->use_ipu()))) { + ((use_ipu() ^ pass_builder_->use_ipu())) || + ((use_custom_device() ^ pass_builder_->use_custom_device()))) { if (use_gpu()) { pass_builder_.reset(new GpuPassStrategy); @@ -562,6 +581,12 @@ void AnalysisConfig::Update() { platform::errors::InvalidArgument( "Only one choice can be made between GPU and NPU.")); pass_builder_.reset(new NpuPassStrategy); + } else if (use_custom_device()) { + PADDLE_ENFORCE_EQ( + use_gpu(), false, + platform::errors::InvalidArgument( + "Only one choice can be made between GPU and CustomDevice.")); + pass_builder_.reset(new CustomDevicePassStrategy); } else { pass_builder_.reset(new CpuPassStrategy); } @@ -588,6 +613,13 @@ void AnalysisConfig::Update() { "Only one choice can be made between GPU and NPU.")); pass_builder_.reset(new NpuPassStrategy( *static_cast(pass_builder_.get()))); + } else if (use_custom_device()) { + PADDLE_ENFORCE_EQ( + use_gpu(), false, + platform::errors::InvalidArgument( + "Only one choice can be made between GPU and CustomDevice.")); + pass_builder_.reset(new CustomDevicePassStrategy( + *static_cast(pass_builder_.get()))); } else { pass_builder_.reset(new CpuPassStrategy( *static_cast(pass_builder_.get()))); @@ -733,7 +765,13 @@ void AnalysisConfig::Update() { "but did not have the option -DWITH_IPU compiled.")); #endif } - + if (use_custom_device_) { +#ifndef PADDLE_WITH_CUSTOM_DEVICE + PADDLE_THROW(platform::errors::Unavailable( + "You tried to enable the custom device " + "but did not have the option -DWITH_CUSTOM_DEVICE compiled.")); +#endif + } if (ir_debug_) { pass_builder()->TurnOnDebug(); } diff --git 
a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 4f0d4a908380f..0d3a687c461d1 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -332,6 +332,15 @@ bool AnalysisPredictor::CreateExecutor() { PADDLE_THROW(platform::errors::Unavailable( "You tried to use IPU forward propagation, but Paddle was not compiled " "with WITH_IPU.")); +#endif + } else if (config_.use_custom_device()) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + place_ = paddle::platform::CustomPlace(config_.custom_device_type()); +#else + PADDLE_THROW(platform::errors::Unavailable( + "You tried to use CustomDevice forward propagation, but Paddle was not " + "compiled " + "with WITH_CUSTOM_DEVICE.")); #endif } else { place_ = paddle::platform::CPUPlace(); @@ -1241,6 +1250,12 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( } else if (platform::is_npu_place(place_)) { auto npu_place = place_; res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId()); + } else if (platform::is_custom_place(place_)) { + auto custom_place = place_; + auto paddleplace = static_cast( + static_cast(PaddlePlace::kCUSTOM) + + phi::GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType())); + res->SetPlace(paddleplace, custom_place.GetDeviceId()); } else { auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); @@ -1290,6 +1305,12 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( } else if (platform::is_npu_place(place_)) { auto npu_place = place_; res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId()); + } else if (platform::is_custom_place(place_)) { + auto custom_place = place_; + auto paddleplace = static_cast( + static_cast(PaddlePlace::kCUSTOM) + + phi::GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType())); + res->SetPlace(paddleplace, custom_place.GetDeviceId()); } else { auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 0c68acfe98047..bb966dc5c6c1b 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -224,8 +224,23 @@ void Tensor::CopyFromCpu(const T *data) { "with NPU.")); #endif } else { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + auto device_type_id = + static_cast(place_) - static_cast(PlaceType::kCUSTOM); + paddle::platform::DeviceContextPool &pool = + paddle::platform::DeviceContextPool::Instance(); + paddle::platform::CustomPlace custom_place( + phi::GetGlobalDeviceType(device_type_id), device_); + auto *t_data = tensor->mutable_data(custom_place); + auto *dev_ctx = static_cast( + pool.Get(custom_place)); + paddle::memory::Copy(custom_place, static_cast(t_data), + paddle::platform::CPUPlace(), data, ele_size, + dev_ctx->stream()); +#else PADDLE_THROW(paddle::platform::errors::InvalidArgument( "The analysis predictor supports CPU, GPU, NPU and XPU now.")); +#endif } } @@ -398,8 +413,20 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, "with NPU.")); #endif } else { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + paddle::platform::DeviceContextPool &pool = + paddle::platform::DeviceContextPool::Instance(); + auto custom_place = t_place; + auto *dev_ctx = static_cast( + pool.Get(custom_place)); + paddle::memory::Copy(paddle::platform::CPUPlace(), + static_cast(data), custom_place, t_data, + ele_num * sizeof(T), dev_ctx->stream()); +// 
TODO(wangran16): sync_stream +#else PADDLE_THROW(paddle::platform::errors::InvalidArgument( "The analysis predictor supports CPU, GPU, NPU and XPU now.")); +#endif } } diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 9c48d822b4d0d..8edbc494ab886 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -332,6 +332,14 @@ struct PD_INFER_DECL AnalysisConfig { /// void EnableNpu(int device_id = 0); /// + /// \brief Turn on CustomDevice. + /// + /// \param device_type device_type the custom device to use. + /// + /// \param device_id device_id the custom device to use (default is 0). + /// + void EnableCustomDevice(const std::string& device_type, int device_id); + /// /// \brief Turn on ONNXRuntime. /// void EnableONNXRuntime(); @@ -366,6 +374,11 @@ struct PD_INFER_DECL AnalysisConfig { /// \return bool Whether the IPU is turned on. /// bool use_ipu() const { return use_ipu_; } + /// \brief A boolean state telling whether the CustomDevice is turned on. + /// + /// \return bool Whether the CustomDevice is turned on. + /// + bool use_custom_device() const { return use_custom_device_; } /// /// \brief A boolean state telling whether the ONNXRuntime is turned on. /// @@ -403,6 +416,17 @@ struct PD_INFER_DECL AnalysisConfig { /// int ipu_device_num() const { return ipu_device_num_; } /// + /// \brief Get the custom device id. + /// + /// \return int The custom device id. + /// + int custom_device_id() const { return custom_device_id_; } + /// \brief Get the custom device type. + /// + /// \return string The custom device type. + /// + std::string custom_device_type() const { return custom_device_type_; } + /// /// \brief Get the initial size in MB of the GPU memory pool. /// /// \return int The initial size in MB of the GPU memory pool. @@ -900,6 +924,11 @@ struct PD_INFER_DECL AnalysisConfig { bool use_npu_{false}; int npu_device_id_{0}; + // CustomDevice related + bool use_custom_device_{false}; + int custom_device_id_{0}; + std::string custom_device_type_; + // ONNXRuntime related bool use_onnxruntime_{false}; bool enable_ort_optimization_{false}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 01988d5f539dc..f59494628ad7e 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -282,7 +282,8 @@ void CpuPassStrategy::EnableMKLDNN() { "depthwise_conv_mkldnn_pass", // "conv_bn_fuse_pass", // Execute BN passes again to "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order - "conv_transpose_bn_fuse_pass", // + "conv_affine_channel_mkldnn_fuse_pass", // + "conv_transpose_bn_fuse_pass", // "conv_transpose_eltwiseadd_bn_fuse_pass", // "conv_bias_mkldnn_fuse_pass", // "conv_transpose_bias_mkldnn_fuse_pass", diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index db6bde62ddc7c..f01799c646077 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -166,6 +166,10 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \return A bool variable implying whether we are in ipu mode. bool use_ipu() const { return use_ipu_; } + /// \brief Check if we are using CustomDevice. + /// \return A bool variable implying whether we are in CustomDevice mode. 
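[Editor's note] Putting the EnableCustomDevice()/use_custom_device() additions in this patch together, a minimal usage sketch for the custom-device inference path might look like the following; the header path, device type string, and model file names are placeholders, and this assumes the usual paddle_infer::Config / CreatePredictor entry points rather than anything added here:

#include <memory>
#include "paddle_inference_api.h"  // assumed public inference header

std::shared_ptr<paddle_infer::Predictor> MakeCustomDevicePredictor() {
  paddle_infer::Config config("model.pdmodel", "model.pdiparams");
  config.EnableCustomDevice(/*device_type=*/"my_custom_dev", /*device_id=*/0);
  return paddle_infer::CreatePredictor(config);
}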
+ bool use_custom_device() const { return use_custom_device_; } + /// \brief Default destructor. virtual ~PassStrategy() = default; @@ -177,6 +181,7 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { bool use_ipu_{false}; bool use_mkldnn_{false}; bool use_gpu_fp16_{false}; + bool use_custom_device_{false}; /// \endcond }; @@ -291,6 +296,22 @@ class PD_INFER_DECL NpuPassStrategy final : public PassStrategy { } }; +/// \class CustomDevicePassStrategy +/// \brief The CustomDevice passes controller, it is used in AnalysisPredictor +/// with CustomDevice +/// mode. +class PD_INFER_DECL CustomDevicePassStrategy final : public PassStrategy { + public: + CustomDevicePassStrategy() : PassStrategy({}) { use_custom_device_ = true; } + + /// \brief Construct by copying another CustomDevicePassStrategy object. + /// \param[in] other The CustomDevicePassStrategy object we want to copy. + explicit CustomDevicePassStrategy(const CustomDevicePassStrategy &other) + : PassStrategy(other.AllPasses()) { + use_custom_device_ = true; + } +}; + /// \class IpuPassStrategy /// \brief The IPU passes controller, it is used in AnalysisPredictor with IPU /// mode. diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 3cd2df3aef639..11086b369fc15 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -54,7 +54,7 @@ enum DataType { // TODO(Superjomn) support more data types if needed. }; -enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU, kIPU }; +enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU, kIPU, kCUSTOM }; enum class DataLayout { kUNK = -1, kAny, kNHWC, kNCHW }; diff --git a/paddle/fluid/inference/paddle_inference.map b/paddle/fluid/inference/paddle_inference.map index 5bb9b8d75620b..05935701635d9 100644 --- a/paddle/fluid/inference/paddle_inference.map +++ b/paddle/fluid/inference/paddle_inference.map @@ -6,4 +6,3 @@ local: *; }; - diff --git a/paddle/fluid/inference/paddle_inference_custom_device.map b/paddle/fluid/inference/paddle_inference_custom_device.map new file mode 100644 index 0000000000000..52bc2870482e2 --- /dev/null +++ b/paddle/fluid/inference/paddle_inference_custom_device.map @@ -0,0 +1,10 @@ +{ + global: + *paddle*; + *Pass*; + *profile*; + *phi*; + *FLAGS_*; + local: + *; +}; diff --git a/paddle/fluid/operators/assign_op_mlu.cc b/paddle/fluid/operators/assign_op_mlu.cc new file mode 100644 index 0000000000000..85092c516955d --- /dev/null +++ b/paddle/fluid/operators/assign_op_mlu.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/fluid/operators/assign_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { +template +class AssignMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnl::Assign(ctx, x_desc.get(), GetBasePtr(x), out_desc.get(), + GetBasePtr(out)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(assign, ops::AssignMLUKernel, + ops::AssignMLUKernel, + ops::AssignMLUKernel, + ops::AssignMLUKernel) diff --git a/paddle/fluid/operators/assign_value_op_mlu.cc b/paddle/fluid/operators/assign_value_op_mlu.cc new file mode 100644 index 0000000000000..651e129ccb17a --- /dev/null +++ b/paddle/fluid/operators/assign_value_op_mlu.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/assign_value_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_MLU_KERNEL(assign_value, ops::AssignValueKernel, + ops::AssignValueKernel, + ops::AssignValueKernel, + ops::AssignValueKernel); diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index bc6cf9d831ff0..76e0f23df2168 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -156,7 +156,7 @@ REGISTER_OP_CPU_KERNEL( ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, - ops::CastOpKernel, + ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel>, diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index 96b27a833fba3..7a2a802382f6c 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -122,4 +122,5 @@ REGISTER_OP_CUDA_KERNEL(recv_v2, ops::RecvOpV2CUDAKernel, ops::RecvOpV2CUDAKernel, ops::RecvOpV2CUDAKernel, ops::RecvOpV2CUDAKernel, + ops::RecvOpV2CUDAKernel, ops::RecvOpV2CUDAKernel); diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index add352306fa28..57a3fe2e45d7e 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -109,4 +109,5 @@ REGISTER_OP_CUDA_KERNEL(send_v2, ops::SendOpV2CUDAKernel, ops::SendOpV2CUDAKernel, ops::SendOpV2CUDAKernel, ops::SendOpV2CUDAKernel, + ops::SendOpV2CUDAKernel, ops::SendOpV2CUDAKernel); diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc index 
2d7382f3dfd70..882630467a012 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc @@ -41,7 +41,6 @@ class CSendOpASCENDKernel : public framework::OpKernel { // Use ProcessGroup distributed::ProcessGroup* pg = map->get(ring_id); std::vector in_tensor; - auto x = ctx.Input("X"); in_tensor.push_back(*x); auto task = pg->Send(in_tensor, 1); return; diff --git a/paddle/fluid/operators/compat/fill_constant.pbtxt b/paddle/fluid/operators/compat/fill_constant.pbtxt index 26fecf623c19c..62701eeb396da 100644 --- a/paddle/fluid/operators/compat/fill_constant.pbtxt +++ b/paddle/fluid/operators/compat/fill_constant.pbtxt @@ -58,4 +58,8 @@ extra { name: "op_device" type: STRING } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } } diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index caad1b169a676..448f67a4bad7a 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -39,10 +39,10 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; +using paddle::platform::PADDLE_CUDA_NUM_THREADS; -const int CUDA_NUM_THREADS = 1024; static inline int GET_BLOCKS(const int N) { - return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; + return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; } template @@ -252,8 +252,8 @@ class DeformablePSROIPoolCUDAKernel : public framework::OpKernel { T* top_data = out->mutable_data(ctx.GetPlace()); T* top_count_data = top_count->mutable_data(ctx.GetPlace()); - DeformablePSROIPoolForwardKernel<<>>( + DeformablePSROIPoolForwardKernel<<< + GET_BLOCKS(count), PADDLE_CUDA_NUM_THREADS, 0, dev_ctx.stream()>>>( count, bottom_data, (T)spatial_scale, channels, height, width, pooled_height, pooled_width, bottom_rois, bottom_trans, no_trans, (T)trans_std, sample_per_part, output_dim, group_height, group_width, @@ -344,6 +344,19 @@ __global__ void DeformablePSROIPoolBackwardAccKernel( gw = min(max(gw, 0), group_width - 1); gh = min(max(gh, 0), group_height - 1); + int c = (ctop * group_height + gh) * group_width + gw; + int bottom_index_base = c * height * width; + int bottom_index = + roi_batch_ind * channels * height * width + bottom_index_base; + int trans_index_x = + (((n * num_classes + class_id) * 2) * part_height + part_h) * + part_width + + part_w; + int trans_index_y = + (((n * num_classes + class_id) * 2 + 1) * part_height + part_h) * + part_width + + part_w; + // sampling in each bin for (int ih = 0; ih < sample_per_part; ih++) { for (int iw = 0; iw < sample_per_part; iw++) { @@ -354,7 +367,6 @@ __global__ void DeformablePSROIPoolBackwardAccKernel( } w = min(max(w, 0.), width - 1.); h = min(max(h, 0.), height - 1.); - int c = (ctop * group_height + gh) * group_width + gw; int x0 = floor(w); int x1 = ceil(w); int y0 = floor(h); @@ -366,25 +378,20 @@ __global__ void DeformablePSROIPoolBackwardAccKernel( T q01 = (1 - dist_x) * dist_y; T q10 = dist_x * (1 - dist_y); T q11 = dist_x * dist_y; - int bottom_index_base = c * height * width; // compute gradient of input if (bottom_data_diff) { platform::CudaAtomicAdd( - bottom_data_diff + roi_batch_ind * channels * height * width + - bottom_index_base + y0 * width + x0, + bottom_data_diff + bottom_index + y0 * width + x0, q00 * diff_val); platform::CudaAtomicAdd( - bottom_data_diff + roi_batch_ind * channels * height * width + - bottom_index_base + y1 * width + x0, + 
bottom_data_diff + bottom_index + y1 * width + x0, q01 * diff_val); platform::CudaAtomicAdd( - bottom_data_diff + roi_batch_ind * channels * height * width + - bottom_index_base + y0 * width + x1, + bottom_data_diff + bottom_index + y0 * width + x1, q10 * diff_val); platform::CudaAtomicAdd( - bottom_data_diff + roi_batch_ind * channels * height * width + - bottom_index_base + y1 * width + x1, + bottom_data_diff + bottom_index + y1 * width + x1, q11 * diff_val); } @@ -405,19 +412,8 @@ __global__ void DeformablePSROIPoolBackwardAccKernel( u00 * (1 - dist_x)) * trans_std * diff_val; diff_y *= roi_height; - platform::CudaAtomicAdd( - bottom_trans_diff + - (((n * num_classes + class_id) * 2) * part_height + part_h) * - part_width + - part_w, - diff_x); - platform::CudaAtomicAdd( - bottom_trans_diff + - (((n * num_classes + class_id) * 2 + 1) * part_height + - part_h) * - part_width + - part_w, - diff_y); + platform::CudaAtomicAdd(bottom_trans_diff + trans_index_x, diff_x); + platform::CudaAtomicAdd(bottom_trans_diff + trans_index_y, diff_y); } } } @@ -520,8 +516,8 @@ class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel { memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream()); - DeformablePSROIPoolBackwardAccKernel<<>>( + DeformablePSROIPoolBackwardAccKernel<<< + GET_BLOCKS(count), PADDLE_CUDA_NUM_THREADS, 0, dev_ctx.stream()>>>( count, top_diff, top_count_data, num_rois, (T)spatial_scale, channels, height, width, pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff, bottom_data, bottom_rois, diff --git a/paddle/fluid/operators/dropout_op_mlu.cc b/paddle/fluid/operators/dropout_op_mlu.cc index b88974a51ceff..f4dbbae05532e 100644 --- a/paddle/fluid/operators/dropout_op_mlu.cc +++ b/paddle/fluid/operators/dropout_op_mlu.cc @@ -82,7 +82,7 @@ class DropoutMLUKernel : public framework::OpKernel { *x, ctx.GetPlace(), ctx.template device_context(), out); } else { - float scale = static_cast(1.0f - dropout_prob); + auto scale = static_cast(1.0f - dropout_prob); Tensor scale_tensor(x->dtype()); scale_tensor.mutable_data({1}, ctx.GetPlace()); MLUCnnlTensorDesc scale_desc(scale_tensor); diff --git a/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc b/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc new file mode 100644 index 0000000000000..e003a43b5c56b --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_heaviside_op.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
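The deformable_psroi_pooling hunk above does two things: it drops the file-local CUDA_NUM_THREADS in favour of the shared PADDLE_CUDA_NUM_THREADS, and it hoists the loop-invariant indices (c, bottom_index, trans_index_x/y) out of the per-bin sampling loops so each thread computes them once. The grid sizing itself is only a ceiling division; a minimal host-side sketch, where kThreadsPerBlock is an illustrative stand-in for PADDLE_CUDA_NUM_THREADS:

#include <cassert>

// Illustrative block size; Paddle's PADDLE_CUDA_NUM_THREADS plays this role.
constexpr int kThreadsPerBlock = 512;

// Ceiling division: enough blocks so that blocks * threads covers all n elements.
inline int NumBlocks(int n) {
  return (n + kThreadsPerBlock - 1) / kThreadsPerBlock;
}

int main() {
  assert(NumBlocks(1) == 1);                     // one element still needs one block
  assert(NumBlocks(kThreadsPerBlock) == 1);      // exactly one full block
  assert(NumBlocks(kThreadsPerBlock + 1) == 2);  // one extra element spills into a second block
  return 0;
}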
+ +#include +#include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { + +class ElementwiseHeavisideOpMaker : public ElementwiseOpMaker { + protected: + std::string GetName() const override { return "Heaviside"; } + std::string GetEquation() const override { return "Out = Heaviside(X, Y)"; } + + void AddInputX() override { + AddInput("X", + "(Tensor), The input tensor of Heaviside step function. " + "Its dtype can be int32, int64, float32 and float64"); + } + + void AddInputY() override { + AddInput("Y", + "(Tensor), The tensor determining a Heaviside step function, " + "which is the value when X = 0. Its dtype should be same as X."); + } + + std::string GetOpFuntionality() const override { + return "Computes the Heaviside step function determined by Y " + "for each element in X."; + } +}; + +template +class ElementwiseHeavisideGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("elementwise_heaviside_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Y", this->Input("Y")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); + op->SetAttrMap(this->Attrs()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + elementwise_heaviside, ops::ElementwiseOp, ops::ElementwiseHeavisideOpMaker, + ops::ElementwiseHeavisideGradOpMaker, + ops::ElementwiseHeavisideGradOpMaker); + +REGISTER_OPERATOR(elementwise_heaviside_grad, ops::ElementwiseOpGrad); diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op_xpu.cc new file mode 100644 index 0000000000000..156589384c0dd --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op_xpu.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
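The new elementwise_heaviside operator above computes Out = Heaviside(X, Y), where Y only supplies the function value at X == 0. A scalar reference of that convention (a sketch for illustration, not the registered kernel):

#include <cassert>

// heaviside(x, y): 0 for x < 0, y for x == 0, 1 for x > 0.
// y corresponds to the operator's second input and only matters at x == 0.
template <typename T>
T Heaviside(T x, T y) {
  if (x < static_cast<T>(0)) return static_cast<T>(0);
  if (x > static_cast<T>(0)) return static_cast<T>(1);
  return y;
}

int main() {
  assert(Heaviside(-2.0, 0.5) == 0.0);
  assert(Heaviside(0.0, 0.5) == 0.5);
  assert(Heaviside(3.0, 0.5) == 1.0);
  return 0;
}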
*/ + +#ifdef PADDLE_WITH_XPU +#include +#include +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" + +#include "paddle/fluid/operators/elementwise/elementwise_xpu.h" +#include "paddle/fluid/platform/device/device_wrapper.h" + +namespace paddle { +namespace operators { + +template +class ElementwiseModXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUElementwise(ctx, xpu::broadcast_mod); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL(elementwise_mod, ops::ElementwiseModXPUKernel, + ops::ElementwiseModXPUKernel, + ops::ElementwiseModXPUKernel, + ops::ElementwiseModXPUKernel); + +#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index adc0842fb3882..95753bb336354 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -103,11 +103,12 @@ class ElementwiseOp : public framework::OperatorWithKernel { std::vector out_dims_array(max_dim); #ifdef PADDLE_WITH_MKLDNN // (jczaja): Broadcasting of dims has to be done on Paddle shapes (NHWC) - // if model is using NHWC. + // if model is using NHWC and any of shapes in at least 3D bool should_rotate = ctx->IsRunMKLDNNKernel() && (platform::MKLDNNDeviceContext::tls().get_cur_paddle_data_layout() == - framework::DataLayout::kNHWC); + framework::DataLayout::kNHWC) && + (x_dims.size() >= 3 || y_dims.size() >= 3); if (should_rotate) { // Pick bigger shape and rotate this one bool x_over_y = (x_dims.size() > y_dims.size()); diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index ca46a3db1ecd5..2e924da283ab3 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -22,10 +22,10 @@ class FillConstantOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "FillConstant"); - auto& shape = ctx->Attrs().Get>("shape"); + auto &shape = ctx->Attrs().Get>("shape"); if (!ctx->HasInput("ShapeTensor") && !ctx->HasInputs("ShapeTensorList")) { for (size_t i = 0; i < shape.size(); ++i) { PADDLE_ENFORCE_GE( @@ -52,8 +52,8 @@ class FillConstantOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const framework::Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const override { + const std::string &var_name, const framework::Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const override { if (var_name == "ShapeTensor" || var_name == "ShapeTensorList") { return expected_kernel_type; } else { @@ -63,7 +63,7 @@ class FillConstantOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { framework::OpKernelType kt = 
framework::OpKernelType( framework::proto::VarType::Type(ctx.Attr("dtype")), ctx.GetPlace()); @@ -97,13 +97,24 @@ class FillConstantOp : public framework::OperatorWithKernel { } } +#ifdef PADDLE_WITH_MKLDNN + auto input_data_type = + framework::proto::VarType::Type(ctx.Attr("dtype")); + + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return kt; } }; class FillConstantOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext* ctx) const override { + void operator()(framework::InferVarTypeContext *ctx) const override { auto data_type = static_cast( BOOST_GET_CONST(int, ctx->GetAttr("dtype"))); ctx->SetOutputDataType("Out", data_type); @@ -156,6 +167,10 @@ class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { "3: XPUPlace. " "4: NPUPlace. ") .SetDefault(-1); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false) + .AsExtra(); AddOutput("Out", "(Tensor) Tensor of specified shape will be filled " "with the specified value"); diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index 79fcb780feb93..ab8829b7baf5f 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -33,5 +33,6 @@ USE_JITKERNEL_GEN(kHMax) USE_JITKERNEL_GEN(kHSum) USE_JITKERNEL_GEN(kEmbSeqPool) USE_JITKERNEL_GEN(kAdam) +USE_JITKERNEL_GEN(kAdamW) USE_JITKERNEL_GEN(kSgd) USE_JITKERNEL_GEN(kVBroadcast) diff --git a/paddle/fluid/operators/jit/gen/adam.cc b/paddle/fluid/operators/jit/gen/adam.cc index 7e8cb7f59eed6..38ef6772f01ad 100644 --- a/paddle/fluid/operators/jit/gen/adam.cc +++ b/paddle/fluid/operators/jit/gen/adam.cc @@ -80,7 +80,7 @@ void AdamJitCode::mainCode() { // beta2 * mom2 + (1 - beta2) * g * g vmulps(ymm7 | k1, ymm7, ymm7); vmulps(ymm7 | k1, ymm_one_sub_beta2, ymm7); - vfmadd231ps(ymm7 | k1, ymm1, ptr[reg_mom2_ptr + reg_offset]); + vfmadd231ps(ymm7 | k1, ymm_beta2, ptr[reg_mom2_ptr + reg_offset]); // store mom1 and mom2 vmovups(ptr[reg_mom1_out_ptr + reg_offset] | k1, ymm8); @@ -88,11 +88,11 @@ void AdamJitCode::mainCode() { // sqrt(mom2) + eps vsqrtps(ymm7 | k1, ymm7); - vaddps(ymm7 | k1, ymm7, ymm3); + vaddps(ymm7 | k1, ymm7, ymm_eps); // p + (-lr) * (mom1 / sqrt(mom2) + eps) vdivps(ymm7 | k1, ymm8, ymm7); - vfmadd213ps(ymm7 | k1, ymm2, ptr[reg_param_ptr + reg_offset]); + vfmadd213ps(ymm7 | k1, ymm_lr, ptr[reg_param_ptr + reg_offset]); // store p vmovups(ptr[reg_param_out_ptr + reg_offset] | k1, ymm7); diff --git a/paddle/fluid/operators/jit/gen/adamw.cc b/paddle/fluid/operators/jit/gen/adamw.cc new file mode 100644 index 0000000000000..b470143fb7d8d --- /dev/null +++ b/paddle/fluid/operators/jit/gen/adamw.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jit/gen/adamw.h" + +#include // offsetof + +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +void AdamWJitCode::loadArgs() { + static constexpr int32_t one_as_float = 0x3f800000; + static constexpr int32_t mask_all_ones = 0xFFFFFFFF; + static constexpr int64_t mask_8_divisible = 0xFFFFFFFFFFFFFFF8; + static constexpr int64_t abi_pushes_offset = num_g_abi_regs * 8; + + mov(reg_mom2_out_ptr, ptr[rsp + (abi_pushes_offset + 8)]); + mov(reg_param_out_ptr, ptr[rsp + (abi_pushes_offset + 16)]); + mov(eax, one_as_float); + movd(xmm_one, eax); + + vbroadcastss(ymm_one, xmm_one); // 1 + vbroadcastss(ymm_beta1, xmm_beta1); // beta1 + vbroadcastss(ymm_beta2, xmm_beta2); // beta2 + vbroadcastss(ymm_lr, xmm_lr); // -lr + vbroadcastss(ymm_eps, xmm_eps); // eps + vbroadcastss(ymm_old_lr, xmm_old_lr); // old lr + vbroadcastss(ymm_lr_ratio, xmm_lr_ratio); // lr_ratio + vbroadcastss(ymm_coeff, xmm_coeff); // coeff + vsubps(ymm_one_sub_beta1, ymm_one, ymm_beta1); // 1 - beta1 + vsubps(ymm_one_sub_beta2, ymm_one, ymm_beta2); // 1 - beta2 + + mov(reg_numel_without_tail, reg_numel); + and_(reg_numel_without_tail, mask_8_divisible); // make it 8-divisible + + shl(reg_numel_without_tail, 2); // * 4 to treat it as float offset + shl(reg_numel, 2); + + mov(eax, mask_all_ones); + kmovw(k1, eax); + + xor_(reg_offset, reg_offset); +} + +void AdamWJitCode::setTailOpmask() { + mov(r13, rcx); + + mov(rcx, reg_numel); + sub(rcx, reg_offset); // get tail numel as float size + shr(rcx, 2); // as elements + mov(r14, 1); + shl(r14, cl); // 2 ^ elements + dec(r14); // 2 ^ elements - 1, so numel first bits are set to 1 + kmovw(k1, r14d); + + mov(rcx, r13); +} + +void AdamWJitCode::mainCode() { + // load p + vmovups(ymm10 | k1, ptr[reg_param_ptr + reg_offset]); + + // ((lr * lr_ratio) * coeff) + vmulps(ymm11 | k1, ymm_old_lr, ymm_lr_ratio); + vmulps(ymm11 | k1, ymm11, ymm_coeff); + + // - (lr * lr_ratio) * coeff) * p + p + // p is stored in ymm11 + vfnmadd132ps(ymm11 | k1, ymm10, ymm10); + + // load grad + vmovups(ymm10 | k1, ptr[reg_grad_ptr + reg_offset]); + + // beta1 * mom1 + (1 - beta1) * g + vmulps(ymm12 | k1, ymm_one_sub_beta1, ymm10); + vfmadd231ps(ymm12 | k1, ymm_beta1, ptr[reg_mom1_ptr + reg_offset]); + + // beta2 * mom2 + (1 - beta2) * g * g + vmulps(ymm10 | k1, ymm10, ymm10); + vmulps(ymm10 | k1, ymm_one_sub_beta2, ymm10); + vfmadd231ps(ymm10 | k1, ymm_beta2, ptr[reg_mom2_ptr + reg_offset]); + + // store mom1 and mom2 + vmovups(ptr[reg_mom1_out_ptr + reg_offset] | k1, ymm12); + vmovups(ptr[reg_mom2_out_ptr + reg_offset] | k1, ymm10); + + // sqrt(mom2) + eps + vsqrtps(ymm10 | k1, ymm10); + vaddps(ymm10 | k1, ymm10, ymm_eps); + + // p + (-lr) * (mom1 / sqrt(mom2) + eps) + vdivps(ymm10 | k1, ymm12, ymm10); + vfmadd213ps(ymm10 | k1, ymm_lr, ymm11); + + // store p + vmovups(ptr[reg_param_out_ptr + reg_offset] | k1, ymm10); +} + +void AdamWJitCode::genCode() { + static constexpr int64_t main_loop_elems_size = + 8 * sizeof(float); // 8 floats in YMM + static constexpr int64_t offset_increment = main_loop_elems_size; + preCode(); + loadArgs(); + + cmp(reg_numel, main_loop_elems_size); + jl("process_tail"); + + L("main_loop"); + { + mainCode(); + add(reg_offset, offset_increment); + cmp(reg_numel_without_tail, reg_offset); + jg("main_loop"); + } + + 
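setTailOpmask above builds the AVX-512 opmask that keeps only the elements left over after the 8-float main loop: (1 << tail) - 1 sets exactly tail low bits of k1. The same arithmetic in plain C++ (names are illustrative):

#include <cassert>
#include <cstdint>

// For an 8-lane float vector, return the k-mask selecting only the tail
// elements of a buffer of `numel` floats (0xFF would enable all 8 lanes).
inline std::uint16_t TailMask(std::int64_t numel) {
  const std::int64_t tail = numel % 8;  // elements not covered by full 8-wide iterations
  return static_cast<std::uint16_t>((1u << tail) - 1u);
}

int main() {
  assert(TailMask(16) == 0x00);   // no tail; the JIT code jumps straight to "end"
  assert(TailMask(17) == 0x01);   // one leftover lane
  assert(TailMask(123) == 0x07);  // 123 = 15 * 8 + 3, so three lanes stay enabled
  return 0;
}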
cmp(reg_numel, reg_offset); + je("end", T_NEAR); // size between jmp and label is larger than 127 byte, + // T_NEAR allow long jump + + L("process_tail"); + { + setTailOpmask(); + mainCode(); + } + + L("end"); + postCode(); +} + +class AdamWCreator : public JitCodeCreator { + public: + bool CanBeUsed(const int& attr) const override { + return platform::MayIUse(platform::avx512f); + } + size_t CodeSize(const int& attr) const override { return 96 + 32 * 8; } + std::unique_ptr CreateJitCode(const int& attr) const override { + return make_unique(attr, CodeSize(attr)); + } +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(kAdamW, gen::AdamWCreator); diff --git a/paddle/fluid/operators/jit/gen/adamw.h b/paddle/fluid/operators/jit/gen/adamw.h new file mode 100644 index 0000000000000..759dcd62c8256 --- /dev/null +++ b/paddle/fluid/operators/jit/gen/adamw.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include + +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +class AdamWJitCode : public JitCode { + public: + explicit AdamWJitCode(const int& attr, size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr) { + this->genCode(); + } + + DECLARE_JIT_CODE(AdamJitCode); + void genCode() override; + void loadArgs(); + void setTailOpmask(); + void mainCode(); + + private: + reg64_t reg_numel{abi_param1}; + reg64_t reg_grad_ptr{abi_param2}; + reg64_t reg_mom1_ptr{abi_param3}; + reg64_t reg_mom2_ptr{abi_param4}; + reg64_t reg_param_ptr{abi_param5}; + reg64_t reg_mom1_out_ptr{abi_param6}; + + xmm_t xmm_beta1 = xmm_t(0); + xmm_t xmm_beta2 = xmm_t(1); + xmm_t xmm_lr = xmm_t(2); + xmm_t xmm_eps = xmm_t(3); + xmm_t xmm_old_lr = xmm_t(4); + xmm_t xmm_lr_ratio = xmm_t(5); + xmm_t xmm_coeff = xmm_t(6); + xmm_t xmm_one_sub_beta1 = xmm_t(7); + xmm_t xmm_one_sub_beta2 = xmm_t(8); + xmm_t xmm_one = xmm_t(9); + + ymm_t ymm_beta1 = ymm_t(0); + ymm_t ymm_beta2 = ymm_t(1); + ymm_t ymm_lr = ymm_t(2); + ymm_t ymm_eps = ymm_t(3); + ymm_t ymm_old_lr = ymm_t(4); + ymm_t ymm_lr_ratio = ymm_t(5); + ymm_t ymm_coeff = ymm_t(6); + ymm_t ymm_one_sub_beta1 = ymm_t(7); + ymm_t ymm_one_sub_beta2 = ymm_t(8); + ymm_t ymm_one = ymm_t(9); + + reg64_t reg_mom2_out_ptr{r10}; + reg64_t reg_param_out_ptr{r11}; + reg64_t reg_numel_without_tail{r12}; + reg64_t reg_offset{rax}; +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index 4bdb65030590f..46da6fba2e98a 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -59,6 +59,7 @@ const char* 
to_string(KernelType kt) { ONE_CASE(kMatMul); ONE_CASE(kHMax); ONE_CASE(kAdam); + ONE_CASE(kAdamW); ONE_CASE(kHSum); ONE_CASE(kStrideASum); ONE_CASE(kSoftmax); diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index 40ea04d3c2791..9a48d9c3c8d6c 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -25,6 +25,7 @@ typedef enum { kNone = 0, // sort by alphabet kAdam = 1, + kAdamW, kCRFDecoding, kEmbSeqPool, kGRUH1, @@ -285,6 +286,15 @@ struct AdamTuple { const T*, T*, T*, T*); }; +template +struct AdamWTuple { + static constexpr KernelType kernel_type = kAdamW; + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(T, T, T, T, T, T, T, int64_t, const T*, const T*, + const T*, const T*, T*, T*, T*); +}; + typedef struct matmul_attr_s { int m, n, k; void* packed_weight{nullptr}; diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index e4e3263e01eba..a1ee4508f7241 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -37,5 +37,6 @@ USE_JITKERNEL_REFER(kStrideASum) USE_JITKERNEL_REFER(kSoftmax) USE_JITKERNEL_REFER(kEmbSeqPool) USE_JITKERNEL_REFER(kAdam) +USE_JITKERNEL_REFER(kAdamW) USE_JITKERNEL_REFER(kSgd) USE_JITKERNEL_REFER(kVBroadcast) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index 8669bfe37232b..779d4c172b83c 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -56,6 +56,7 @@ REGISTER_REFER_KERNEL(StrideASum); REGISTER_REFER_KERNEL(Softmax); REGISTER_REFER_KERNEL(EmbSeqPool); REGISTER_REFER_KERNEL(Adam); +REGISTER_REFER_KERNEL(AdamW); REGISTER_REFER_KERNEL(Sgd); REGISTER_REFER_KERNEL(VBroadcast); diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 3545b35a703f8..79b2e174efc16 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -565,6 +565,21 @@ void Adam(T beta1, T beta2, T lr, T eps, int64_t numel, const T* grad_ptr, } } +template +void AdamW(T beta1, T beta2, T lr, T eps, T old_lr, T lr_ratio, T coeff, + int64_t numel, const T* grad_ptr, const T* mom1_ptr, + const T* mom2_ptr, const T* param_ptr, T* mom1_out_ptr, + T* mom2_out_ptr, T* param_out_ptr) { + for (int i = 0; i < numel; ++i) { + auto param_tmp = param_ptr[i] - old_lr * lr_ratio * coeff * param_ptr[i]; + mom1_out_ptr[i] = beta1 * mom1_ptr[i] + (1 - beta1) * grad_ptr[i]; + mom2_out_ptr[i] = + beta2 * mom2_ptr[i] + (1 - beta2) * grad_ptr[i] * grad_ptr[i]; + param_out_ptr[i] = + param_tmp + lr * (mom1_out_ptr[i] / (sqrt(mom2_out_ptr[i]) + eps)); + } +} + #define DECLARE_REFER_KERNEL(name) \ template \ class name##Kernel : public ReferKernel> { \ @@ -617,6 +632,7 @@ DECLARE_REFER_KERNEL(MatMul); DECLARE_REFER_KERNEL(Softmax); DECLARE_REFER_KERNEL(EmbSeqPool); DECLARE_REFER_KERNEL(Adam); +DECLARE_REFER_KERNEL(AdamW); DECLARE_REFER_KERNEL(Sgd); DECLARE_REFER_KERNEL(VBroadcast); diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 675db4a72bda3..74f2d62c64da9 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -907,6 +907,73 @@ void TestKernelAdam() { param, mom1_out, mom2_out, param_out); } +template +void TestKernelAdamW() { + using T = typename KernelTuple::data_type; + VLOG(10) << "Test JITKernel: " << 
jit::to_string(KernelTuple::kernel_type); + const T old_lr = 0.1; + const T beta1 = 0.99; + const T beta2 = 0.95; + const T beta1_pow = beta1 * beta1; + const T beta2_pow = beta2 * beta2; + + const T epsilon = 0.000001; + const int64_t numel = 123; + const T lr_ratio = 0.2; + const T coeff = 0.3; + + T learning_rate = old_lr * (sqrt(1 - beta2_pow) / (1 - beta1_pow)); + T eps = epsilon * sqrt(1 - beta2_pow); + + std::vector param(numel); + std::vector grad(numel); + std::vector mom1(numel); + std::vector mom2(numel); + + std::vector param_out(param.size()); + std::vector mom1_out(mom1.size()); + std::vector mom2_out(mom2.size()); + + RandomVec(numel, param.data(), 0.5f); + RandomVec(numel, grad.data(), 0.5f); + RandomVec(numel, mom1.data(), 0.5f); + RandomVec(numel, mom2.data(), 0.5f); + auto ref = jit::GetReferFunc(); + EXPECT_TRUE(ref != nullptr); + ref(beta1, beta2, -learning_rate, eps, old_lr, lr_ratio, coeff, numel, + grad.data(), mom1.data(), mom2.data(), param.data(), mom1_out.data(), + mom2_out.data(), param_out.data()); + + auto verifier = []( + const typename KernelTuple::func_type tgt, T beta1, T beta2, T lr, T eps, + T old_lr, T lr_ratio, T coeff, int64_t numel, const std::vector& grad, + const std::vector& mom1, const std::vector& mom2, + const std::vector& param, const std::vector& ref_mom1_out, + const std::vector& ref_mom2_out, const std::vector& ref_param_out) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(param.size(), static_cast(numel)); + EXPECT_EQ(grad.size(), static_cast(numel)); + EXPECT_EQ(mom1.size(), static_cast(numel)); + EXPECT_EQ(mom2.size(), static_cast(numel)); + + std::vector jit_mom1_out(ref_mom1_out.size()); + std::vector jit_mom2_out(ref_mom2_out.size()); + std::vector jit_param_out(ref_param_out.size()); + + tgt(beta1, beta2, -lr, eps, old_lr, lr_ratio, coeff, numel, grad.data(), + mom1.data(), mom2.data(), param.data(), jit_mom1_out.data(), + jit_mom2_out.data(), jit_param_out.data()); + + ExpectEQ(ref_mom1_out.data(), jit_mom1_out.data(), numel); + ExpectEQ(ref_mom2_out.data(), jit_mom2_out.data(), numel); + ExpectEQ(ref_param_out.data(), jit_param_out.data(), numel); + }; + + TestAllImpls( + 1, verifier, beta1, beta2, learning_rate, eps, old_lr, lr_ratio, coeff, + numel, grad, mom1, mom2, param, mom1_out, mom2_out, param_out); +} + template void TestKernelSgd() { using T = typename KernelTuple::data_type; @@ -1046,7 +1113,7 @@ TEST(JITKernel_pool, jitcreator) { #if defined(_WIN32) || defined(__APPLE__) || defined(__OSX__) EXPECT_EQ(jitcreators.size(), 0UL); #else - EXPECT_EQ(jitcreators.size(), 26UL); + EXPECT_EQ(jitcreators.size(), 27UL); #endif } @@ -1080,7 +1147,7 @@ TEST(JITKernel_pool, more) { TEST(JITKernel_pool, refer) { const auto& kers = jit::ReferKernelPool::Instance().AllKernels(); - EXPECT_EQ(kers.size(), 32UL); + EXPECT_EQ(kers.size(), 33UL); } // test helper @@ -1464,6 +1531,7 @@ TEST_CPU_KERNEL(EmbSeqPool); TEST_CPU_KERNEL(MatMul); TEST_CPU_KERNEL(Softmax); TEST_CPU_KERNEL(Adam); +TEST_CPU_KERNEL(AdamW); TEST_CPU_KERNEL(Sgd); TEST_CPU_KERNEL(VBroadcast); diff --git a/paddle/fluid/operators/layer_norm_op_mlu.cc b/paddle/fluid/operators/layer_norm_op_mlu.cc new file mode 100644 index 0000000000000..a368af86a3da6 --- /dev/null +++ b/paddle/fluid/operators/layer_norm_op_mlu.cc @@ -0,0 +1,234 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +template +class LayerNormMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + const auto epsilon = ctx.Attr("epsilon"); + const auto* x = ctx.Input("X"); + const auto* scale = ctx.Input("Scale"); + const auto* bias = ctx.Input("Bias"); + auto* y = ctx.Output("Y"); + auto* mean = ctx.Output("Mean"); + auto* variance = ctx.Output("Variance"); + + auto place = ctx.GetPlace(); + + y->mutable_data(place); + mean->mutable_data(place); + variance->mutable_data(place); + + const auto& x_dims = x->dims(); + std::vector scale_bias_axes; + std::vector mean_var_axes; + for (auto i = 0; i < x_dims.size(); ++i) { + if (i >= begin_norm_axis) { + scale_bias_axes.push_back(x_dims[i]); + } else { + mean_var_axes.push_back(x_dims[i]); + } + } + + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc y_desc(*y); + MLUCnnlTensorDesc mean_var_desc(mean_var_axes.size(), mean_var_axes.data(), + ToCnnlDataType()); + // cnnl only support both of scale and bias is NULL or not. + if (!scale && !bias) { + MLUCnnl::LayerNormForward( + ctx, begin_norm_axis, x_desc.get(), GetBasePtr(x), + nullptr /*scale_bias_desc*/, nullptr /*scale*/, nullptr /*bias*/, + epsilon, y_desc.get(), GetBasePtr(y), mean_var_desc.get(), + GetBasePtr(mean), GetBasePtr(variance)); + } else { + Tensor tmp_scale(x->dtype()); + if (!scale) { + tmp_scale.mutable_data(phi::make_ddim(scale_bias_axes), place); + FillMLUTensorWithHostValue(ctx, static_cast(1), &tmp_scale); + } else { + tmp_scale = *scale; + } + + Tensor tmp_bias(x->dtype()); + if (!bias) { + tmp_bias.mutable_data(phi::make_ddim(scale_bias_axes), place); + FillMLUTensorWithHostValue(ctx, static_cast(0), &tmp_bias); + } else { + tmp_bias = *bias; + } + + // scale and bias should have same type with x/y + MLUCnnlTensorDesc float32_desc(scale_bias_axes.size(), + scale_bias_axes.data(), CNNL_DTYPE_FLOAT); + MLUCnnlTensorDesc float16_desc(scale_bias_axes.size(), + scale_bias_axes.data(), CNNL_DTYPE_HALF); + cnnlCastDataType_t cast_type = GetCastDataType(VT::FP32, VT::FP16); + + Tensor final_scale(x->dtype()); + if (final_scale.dtype() == DataType::FLOAT16 && + tmp_scale.dtype() == DataType::FLOAT32) { + final_scale.mutable_data(phi::make_ddim(scale_bias_axes), place); + // cast scale to fp16 + MLUCnnl::Cast(ctx, cast_type, float32_desc.get(), + GetBasePtr(&tmp_scale), float16_desc.get(), + GetBasePtr(&final_scale)); + } else { + final_scale = tmp_scale; + } + + Tensor final_bias(x->dtype()); + if (final_bias.dtype() == DataType::FLOAT16 && + tmp_bias.dtype() == DataType::FLOAT32) { + final_bias.mutable_data(phi::make_ddim(scale_bias_axes), place); + // cast bias to fp16 + MLUCnnl::Cast(ctx, cast_type, float32_desc.get(), GetBasePtr(&tmp_bias), + float16_desc.get(), GetBasePtr(&final_bias)); + } else { + final_bias = tmp_bias; + } + + MLUCnnlTensorDesc 
scale_bias_desc( + scale_bias_axes.size(), scale_bias_axes.data(), ToCnnlDataType()); + MLUCnnl::LayerNormForward( + ctx, begin_norm_axis, x_desc.get(), GetBasePtr(x), + scale_bias_desc.get(), GetBasePtr(&final_scale), + GetBasePtr(&final_bias), epsilon, y_desc.get(), GetBasePtr(y), + mean_var_desc.get(), GetBasePtr(mean), GetBasePtr(variance)); + } + } +}; + +template +class LayerNormGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + const auto* x = ctx.Input("X"); + const auto* mean = ctx.Input("Mean"); + const auto* variance = ctx.Input("Variance"); + const auto* scale = ctx.Input("Scale"); + const auto* dy = ctx.Input(framework::GradVarName("Y")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dscale = ctx.Output(framework::GradVarName("Scale")); + auto* dbias = ctx.Output(framework::GradVarName("Bias")); + + auto place = ctx.GetPlace(); + dx->mutable_data(place); + + const auto& x_dims = x->dims(); + std::vector scale_bias_axes; + std::vector mean_var_axes; + for (auto i = 0; i < x_dims.size(); ++i) { + if (i >= begin_norm_axis) { + scale_bias_axes.push_back(x_dims[i]); + } else { + mean_var_axes.push_back(x_dims[i]); + } + } + + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc dy_desc(*dy); + MLUCnnlTensorDesc mean_var_desc(mean_var_axes.size(), mean_var_axes.data(), + ToCnnlDataType()); + MLUCnnlTensorDesc dx_desc(*dx); + + Tensor tmp_scale(x->dtype()); + if (!scale) { + tmp_scale.mutable_data(phi::make_ddim(scale_bias_axes), place); + FillMLUTensorWithHostValue(ctx, static_cast(1), &tmp_scale); + } else { + tmp_scale = *scale; + } + + MLUCnnlTensorDesc float32_desc(scale_bias_axes.size(), + scale_bias_axes.data(), CNNL_DTYPE_FLOAT); + MLUCnnlTensorDesc float16_desc(scale_bias_axes.size(), + scale_bias_axes.data(), CNNL_DTYPE_HALF); + cnnlCastDataType_t cast_fp32_to_fp16 = GetCastDataType(VT::FP32, VT::FP16); + cnnlCastDataType_t cast_fp16_to_fp32 = GetCastDataType(VT::FP16, VT::FP32); + + Tensor final_scale(x->dtype()); + if (final_scale.dtype() == DataType::FLOAT16 && + tmp_scale.dtype() == DataType::FLOAT32) { + final_scale.mutable_data(phi::make_ddim(scale_bias_axes), place); + // cast scale to fp16 + MLUCnnl::Cast(ctx, cast_fp32_to_fp16, float32_desc.get(), + GetBasePtr(&tmp_scale), float16_desc.get(), + GetBasePtr(&final_scale)); + } else { + final_scale = tmp_scale; + } + + Tensor tmp_dscale(x->dtype()); + if (dscale && (tmp_dscale.dtype() == dscale->dtype())) { + dscale->mutable_data(place); + tmp_dscale = *dscale; + } else { + tmp_dscale.mutable_data(phi::make_ddim(scale_bias_axes), place); + } + Tensor tmp_dbias(x->dtype()); + if (dbias && (tmp_dbias.dtype() == dbias->dtype())) { + dbias->mutable_data(place); + tmp_dbias = *dbias; + } else { + tmp_dbias.mutable_data(phi::make_ddim(scale_bias_axes), place); + } + + MLUCnnlTensorDesc scale_desc(scale_bias_axes.size(), scale_bias_axes.data(), + ToCnnlDataType()); + MLUCnnl::LayerNormBackward( + ctx, begin_norm_axis, x_desc.get(), GetBasePtr(x), dy_desc.get(), + GetBasePtr(dy), scale_desc.get(), GetBasePtr(&final_scale), + mean_var_desc.get(), GetBasePtr(mean), GetBasePtr(variance), + dx_desc.get(), GetBasePtr(dx), GetBasePtr(&tmp_dscale), + GetBasePtr(&tmp_dbias)); + + if (dscale && (tmp_dscale.dtype() == DataType::FLOAT16 && + dscale->dtype() == DataType::FLOAT32)) { + dscale->mutable_data(place); + MLUCnnl::Cast(ctx, cast_fp16_to_fp32, float16_desc.get(), + 
GetBasePtr(&tmp_dscale), float32_desc.get(), + GetBasePtr(dscale)); + } + if (dbias && (tmp_dbias.dtype() == DataType::FLOAT16 && + dbias->dtype() == DataType::FLOAT32)) { + dbias->mutable_data(place); + MLUCnnl::Cast(ctx, cast_fp16_to_fp32, float16_desc.get(), + GetBasePtr(&tmp_dbias), float32_desc.get(), + GetBasePtr(dbias)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(layer_norm, ops::LayerNormMLUKernel, + ops::LayerNormMLUKernel); +REGISTER_OP_MLU_KERNEL(layer_norm_grad, ops::LayerNormGradMLUKernel, + ops::LayerNormGradMLUKernel); diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 0811407466ddc..2540af5d472c4 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -585,6 +585,19 @@ class MatMulOp : public framework::OperatorWithKernel { auto dim_x = GetDimForInput(*context, "X"); auto dim_y = GetDimForInput(*context, "Y"); + +#ifdef PADDLE_WITH_MKLDNN + // (jczaja): For NHWC execution output shape needs + // to be computed like instead x*y we are to do y*x + bool channelwise_onednn = + context->IsRunMKLDNNKernel() && + (platform::MKLDNNDeviceContext::tls().get_cur_paddle_data_layout() == + framework::DataLayout::kNHWC); + if (channelwise_onednn) { + std::swap(dim_x, dim_y); + } +#endif + auto mat_dim_x = phi::funcs::CreateMatrixDescriptor( RowMatrixFromVector(dim_x), 0, context->Attrs().Get("transpose_X")); @@ -770,6 +783,21 @@ class MatMulOp : public framework::OperatorWithKernel { framework::TransToProtoVarType(tensor.dtype()), tensor.place(), tensor.layout()); } else { +#ifdef PADDLE_WITH_MKLDNN + // When matmul is first oneDNN op in a chain (there was some non oneDNN op + // previously) + // then we also need to rotate shape NHWC -> NCWH + if ((expected_kernel_type.data_layout_ == + framework::DataLayout::kMKLDNN) && + (tensor.layout() != framework::DataLayout::kMKLDNN) && + paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout() == + framework::DataLayout::kNHWC) { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), + framework::DataLayout::kNHWC); + } +#endif return framework::OpKernelType(expected_kernel_type.data_type_, tensor.place(), tensor.layout()); } diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 01fa01e3c6ed0..55294331a9c85 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -274,6 +274,22 @@ class MatMulV2Op : public framework::OperatorWithKernel { framework::TransToProtoVarType(tensor.dtype()), tensor.place(), tensor.layout()); } else { +#ifdef PADDLE_WITH_MKLDNN + // When matmul_v2 is first oneDNN op in a chain (there was some non oneDNN + // op + // previously) + // then we also need to rotate shape NHWC -> NCWH + if ((expected_kernel_type.data_layout_ == + framework::DataLayout::kMKLDNN) && + (tensor.layout() != framework::DataLayout::kMKLDNN) && + paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout() == + framework::DataLayout::kNHWC) { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), + framework::DataLayout::kNHWC); + } +#endif return framework::OpKernelType(expected_kernel_type.data_type_, tensor.place(), tensor.layout()); } diff --git a/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc new file 
mode 100644 index 0000000000000..cfc320da47fff --- /dev/null +++ b/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc @@ -0,0 +1,129 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +class FillConstantMKLDNNHandler + : public platform::MKLDNNHandlerNoCachingT { + public: + FillConstantMKLDNNHandler(Tensor* out, dnnl::engine engine, + platform::Place cpu_place) + : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { + const auto src0_md = dnnl::memory::desc( + {out->numel(), sizeof(T)}, platform::MKLDNNGetDataType(), + dnnl::memory::format_tag::ab); + + dnnl::primitive_attr attrs; + attrs.set_scales(DNNL_ARG_SRC_0, /* mask = */ 0, {0.0f}); + + this->AcquireForwardPrimitiveDescriptor(attrs, dnnl::algorithm::binary_add, + src0_md, src1_md, src0_md); + } + + static const dnnl::memory::desc src1_md; +}; + +template +const dnnl::memory::desc FillConstantMKLDNNHandler::src1_md( + {1, sizeof(T)}, platform::MKLDNNGetDataType(), + dnnl::memory::format_tag::ab); + +template +class FillConstantMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + this->RunKernel(ctx); + } + + void RunKernel(const framework::ExecutionContext& ctx) const { + const auto& dev_ctx = + ctx.template device_context(); + const auto& dnnl_engine = dev_ctx.GetEngine(); + + auto* out = ctx.Output("Out"); + T fill_value = CalculateFillValue(ctx); + + auto shape = GetShape(ctx); + out->Resize(shape); + + FillConstantMKLDNNHandler handler(out, dnnl_engine, ctx.GetPlace()); + + dnnl::memory constant_value_memory = + dnnl::memory(FillConstantMKLDNNHandler::src1_md, dnnl_engine, + reinterpret_cast(&fill_value)); + + auto src0_memory_p = handler.AcquireDstMemory(out); + auto fill_constant_p = handler.AcquireForwardPrimitive(); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + fill_constant_p->execute(astream, {{DNNL_ARG_SRC_0, *src0_memory_p}, + {DNNL_ARG_SRC_1, constant_value_memory}, + {DNNL_ARG_DST, *src0_memory_p}}); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetPlainMKLDNNFormat(out->dims().size())); + } + + T CalculateFillValue(const framework::ExecutionContext& ctx) const { + const auto str_value = ctx.Attr("str_value"); + const auto float_value = ctx.Attr("value"); + + T value; + + if (str_value.empty()) { + value = static_cast(float_value); + } else { + // handle NaN/Inf first, which cannot be read from stream + if (str_value == "inf") { + value = static_cast(std::numeric_limits::infinity()); + } else if (str_value == "-inf") { + value = static_cast(-std::numeric_limits::infinity()); + } else if (str_value == "nan") { + value = static_cast(std::numeric_limits::quiet_NaN()); + } else { + std::stringstream convert_stream(str_value); + double 
tmp_value; + convert_stream >> tmp_value; + value = static_cast(tmp_value); + } + } + + if (ctx.HasInput("ValueTensor")) { + const auto* value_tensor = ctx.Input("ValueTensor"); + PADDLE_ENFORCE_EQ( + value_tensor->numel(), 1, + platform::errors::InvalidArgument( + "When use Tensor as value to set Tensor value in fill_constant, " + "value input(ValueTensor) size must be 1, but got %d", + value_tensor->numel())); + value = value_tensor->data()[0]; + } + + return value; + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(fill_constant, MKLDNN, paddle::platform::CPUPlace, + ops::FillConstantMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake index c471ba62f609b..3ebfbdc50caab 100644 --- a/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake +++ b/paddle/fluid/operators/mkldnn/nhwc_op_tests.cmake @@ -1,2 +1 @@ -cc_test(test_mkldnn_op_nhwc SRCS mkldnn/test_mkldnn_op_nhwc.cc DEPS op_registry pool_op activation_op pooling transpose_op scope device_context enforce executor) - +cc_test(test_mkldnn_op_nhwc SRCS mkldnn/test_mkldnn_op_nhwc.cc DEPS op_registry pool_op shape_op activation_op pooling transpose_op scope device_context enforce executor) diff --git a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc index f04c73ec0b249..517f782e18758 100644 --- a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc @@ -32,6 +32,16 @@ class ShapeMKLDNNKernel : public framework::OpKernel { in_dims = in_var->Get().value().dims(); } else { in_dims = in_var->Get().dims(); + // Output of shape op is often fed as input to fill_constant ops + // and we need to rotate a shape otherwise Tensors of wrong shape may be + // allocated + if (platform::MKLDNNDeviceContext::tls().get_cur_paddle_data_layout() == + framework::DataLayout::kNHWC && + in_dims.size() >= 3) { + auto rdims = phi::vectorize(in_dims); + std::rotate(rdims.begin() + 1, rdims.begin() + 2, rdims.end()); + in_dims = phi::make_ddim(rdims); + } } auto* out_t = ctx.Output("Out"); out_t->Resize({in_dims.size()}); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 0e988557df626..4ff93ee3cd624 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -32,9 +32,12 @@ USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); +USE_OP_ITSELF(shape); +USE_OP_DEVICE_KERNEL(shape, MKLDNN); PD_DECLARE_KERNEL(pool2d, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(shape, CPU, ALL_LAYOUT); namespace paddle { namespace operators { @@ -154,5 +157,59 @@ TEST(test_pool2d_relu_relu_nhwc, cpu_place) { platform::errors::InvalidArgument( "Computed shape does not match expected shape")); } + +TEST(test_pool2d_shape_nhwc, cpu_place) { + framework::DDim dims({1, 4, 8, 512}); // NHWC shape + std::vector expected_dims{1, 3, 7, 512}; // NHWC expected shape + platform::CPUPlace p; + framework::Scope scope; + + InputVars input_name = {"x", + scope.Var("x")->GetMutable()}; + // Initialize input data + std::uniform_real_distribution dist(static_cast(10.0), + static_cast(20.0)); + std::mt19937 engine; + size_t numel = static_cast(phi::product(dims)); + input_name.tensor->Resize(dims); + auto data_ptr = 
input_name.tensor->mutable_data(p); + for (size_t i = 0; i < numel; ++i) { + data_ptr[i] = dist(engine); + } + + scope.Var("y")->GetMutable(); + auto *z = scope.Var("z")->GetMutable(); + + auto &pool = platform::DeviceContextPool::Instance(); + + // Make pool2d followed by shape. shape for NHWC should return + // as output tensor not-rotated shape of Pool ( + + auto ksize = std::vector(2, 2); + auto op_pool = framework::OpRegistry::CreateOp( + "pool2d", {{"X", {"x"}}}, {{"Out", {"y"}}}, + {{"pooling_type", {std::string("max")}}, + {"ksize", {ksize}}, + {"data_format", {std::string("NHWC")}}, + {"use_mkldnn", {true}}}); + + auto op_shape = framework::OpRegistry::CreateOp( + "shape", {{"Input", {"y"}}}, {{"Out", {"z"}}}, {{"use_mkldnn", {true}}}); + + op_pool->Run(scope, p); + op_shape->Run(scope, p); + + pool.Get(p)->Wait(); + + // repack tensor data into vector for easy comparison + auto *zdata = z->data(); + std::vector vzdata(zdata, zdata + z->numel()); + + // Verify shape of output + PADDLE_ENFORCE_EQ(vzdata, expected_dims, + platform::errors::InvalidArgument( + "Computed shape does not match expected shape")); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index eacab46800580..8c907ab0e8dec 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -805,17 +805,17 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { } /* static */ void MLUCnnl::ApplyAdam( - const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, - const void* grad, const void* lr, const void* beta1, const void* beta2, - const void* beta1_power, const void* beta2_power, const void* epsilon, - const bool use_nesterov, const cnnlTensorDescriptor_t var_desc, void* var, - const cnnlTensorDescriptor_t m_desc, void* m, - const cnnlTensorDescriptor_t v_desc, void* v) { + const ExecutionContext& ctx, const cnnlTensorDescriptor_t var_desc, + void* var, const cnnlTensorDescriptor_t m_desc, void* m, + const cnnlTensorDescriptor_t v_desc, void* v, + const cnnlTensorDescriptor_t grad_desc, const void* grad, const void* lr, + const void* beta1, const void* beta2, const void* beta1_power, + const void* beta2_power, const void* epsilon, const bool use_nesterov) { cnnlHandle_t handle = GetHandleFromCTX(ctx); PADDLE_ENFORCE_MLU_SUCCESS(cnnlApplyAdam( - handle, grad_desc, var, grad_desc, m, grad_desc, v, grad_desc, grad, lr, - beta1, beta2, beta1_power, beta2_power, epsilon, use_nesterov)); + handle, var_desc, var, m_desc, m, v_desc, v, grad_desc, grad, lr, beta1, + beta2, beta1_power, beta2_power, epsilon, use_nesterov)); } /* static */ void MLUCnnl::ApplyAdaMax( @@ -2077,6 +2077,45 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { } } +/* static */ void MLUCnnl::LayerNormForward( + const ExecutionContext& ctx, int axis, const cnnlTensorDescriptor_t x_desc, + const void* x, const cnnlTensorDescriptor_t weight_bias_desc, + const void* weight, const void* bias, float eps, + const cnnlTensorDescriptor_t y_desc, void* y, + const cnnlTensorDescriptor_t mean_rstd_desc, void* saved_mean, + void* saved_rstd) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetLayerNormOpWorkspaceSize(handle, axis, x_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + 
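The shape_mkldnn_op change above (exercised by the new test_pool2d_shape_nhwc test) makes the shape op report NHWC dims when the session layout is NHWC, by rotating the channel entry of the internally stored NCHW dims to the back. The rotation is a single std::rotate; a standalone sketch using the test's expected pool2d output dims:

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  // Dims as stored internally (NCHW order) while the data layout is NHWC.
  std::vector<int> dims{1, 512, 3, 7};
  // Move the channel entry (index 1) to the back: NCHW -> NHWC.
  std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end());
  assert((dims == std::vector<int>{1, 3, 7, 512}));
  return 0;
}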
PADDLE_ENFORCE_MLU_SUCCESS( + cnnlLayerNormForward(handle, x_desc, x, axis, weight_bias_desc, weight, + bias, eps, workspace_ptr, workspace_size, y_desc, y, + mean_rstd_desc, saved_mean, saved_rstd)); +} + +/* static */ void MLUCnnl::LayerNormBackward( + const ExecutionContext& ctx, int axis, const cnnlTensorDescriptor_t x_desc, + const void* x, const cnnlTensorDescriptor_t diff_z_desc, const void* diff_z, + const cnnlTensorDescriptor_t weight_bias_desc, const void* weight, + const cnnlTensorDescriptor_t mean_rstd_desc, const void* saved_mean, + const void* saved_rstd, const cnnlTensorDescriptor_t diff_x_desc, + void* diff_x, void* diff_weight, void* diff_bias) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlLayerNormBackward( + handle, x_desc, x, axis, diff_z_desc, diff_z, weight_bias_desc, weight, + mean_rstd_desc, saved_mean, saved_rstd, diff_x_desc, diff_x, diff_weight, + diff_bias)); +} + /* static */ void MLUCnnl::QuantizeParam( const ExecutionContext& ctx, const cnnlQuantizeMode_t mode, const int bitwidth, const cnnlTensorDescriptor_t input_desc, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 572b7aa2bbd01..24db6c760d78a 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -146,10 +146,8 @@ const std::map, cnnlCastDataType_t> {{VT::FP16, /*cast to*/ VT::BOOL}, CNNL_CAST_HALF_TO_BOOL}, {{VT::INT32, /*cast to*/ VT::FP32}, CNNL_CAST_INT32_TO_FLOAT}, {{VT::INT32, /*cast to*/ VT::FP16}, CNNL_CAST_INT32_TO_HALF}, - {{VT::INT32, /*cast to*/ VT::INT64}, CNNL_CAST_INT32_TO_INT64}, - {{VT::INT32, /*cast to*/ VT::INT16}, CNNL_CAST_INT32_TO_INT16}, {{VT::INT32, /*cast to*/ VT::INT8}, CNNL_CAST_INT32_TO_INT8}, - {{VT::INT32, /*cast to*/ VT::BOOL}, CNNL_CAST_INT32_TO_BOOL}, + {{VT::INT32, /*cast to*/ VT::INT16}, CNNL_CAST_INT32_TO_INT16}, {{VT::INT16, /*cast to*/ VT::FP32}, CNNL_CAST_INT16_TO_FLOAT}, {{VT::INT16, /*cast to*/ VT::FP16}, CNNL_CAST_INT16_TO_HALF}, {{VT::INT16, /*cast to*/ VT::INT32}, CNNL_CAST_INT16_TO_INT32}, @@ -158,12 +156,21 @@ const std::map, cnnlCastDataType_t> {{VT::INT8, /*cast to*/ VT::INT32}, CNNL_CAST_INT8_TO_INT32}, {{VT::UINT8, /*cast to*/ VT::FP32}, CNNL_CAST_UINT8_TO_FLOAT}, {{VT::UINT8, /*cast to*/ VT::FP16}, CNNL_CAST_UINT8_TO_HALF}, - {{VT::UINT8, /*cast to*/ VT::INT64}, CNNL_CAST_UINT8_TO_INT64}, - {{VT::UINT8, /*cast to*/ VT::INT32}, CNNL_CAST_UINT8_TO_INT32}, {{VT::BOOL, /*cast to*/ VT::FP32}, CNNL_CAST_BOOL_TO_FLOAT}, {{VT::BOOL, /*cast to*/ VT::FP16}, CNNL_CAST_BOOL_TO_HALF}, {{VT::BOOL, /*cast to*/ VT::INT32}, CNNL_CAST_BOOL_TO_INT32}, + {{VT::UINT8, /*cast to*/ VT::INT32}, CNNL_CAST_UINT8_TO_INT32}, + {{VT::INT32, /*cast to*/ VT::INT64}, CNNL_CAST_INT32_TO_INT64}, {{VT::INT64, /*cast to*/ VT::INT32}, CNNL_CAST_INT64_TO_INT32}, + {{VT::INT32, /*cast to*/ VT::BOOL}, CNNL_CAST_INT32_TO_BOOL}, + {{VT::UINT8, /*cast to*/ VT::INT64}, CNNL_CAST_UINT8_TO_INT64}, + {{VT::INT8, /*cast to*/ VT::INT16}, CNNL_CAST_INT8_TO_INT16}, + {{VT::FP32, /*cast to*/ VT::FP64}, CNNL_CAST_FLOAT_TO_DOUBLE}, + {{VT::FP64, /*cast to*/ VT::FP32}, CNNL_CAST_DOUBLE_TO_FLOAT}, + {{VT::INT64, /*cast to*/ VT::FP32}, CNNL_CAST_INT64_TO_FLOAT}, + {{VT::INT64, /*cast to*/ VT::FP16}, CNNL_CAST_INT64_TO_HALF}, + {{VT::FP32, /*cast to*/ VT::INT64}, CNNL_CAST_FLOAT_TO_INT64}, + {{VT::FP16, /*cast to*/ VT::INT64}, CNNL_CAST_HALF_TO_INT64}, }; cnnlCastDataType_t GetCastDataType(const VT::Type& src_type, @@ -496,14 +503,14 @@ class MLUCnnl { const 
cnnlTensorDescriptor_t mom_desc, void* mom); static void ApplyAdam(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t var_desc, void* var, + const cnnlTensorDescriptor_t m_desc, void* m, + const cnnlTensorDescriptor_t v_desc, void* v, const cnnlTensorDescriptor_t grad_desc, const void* grad, const void* lr, const void* beta1, const void* beta2, const void* beta1_power, const void* beta2_power, const void* epsilon, - const bool use_nesterov, - const cnnlTensorDescriptor_t var_desc, void* var, - const cnnlTensorDescriptor_t m_desc, void* m, - const cnnlTensorDescriptor_t v_desc, void* v); + const bool use_nesterov); static void ApplyAdaMax(const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, @@ -1103,6 +1110,24 @@ class MLUCnnl { const cnnlTensorDescriptor_t x_backprop_desc, void* x_backprop, void* scale_backprop, void* offset_backprop); + static void LayerNormForward(const ExecutionContext& ctx, int axis, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t weight_bias_desc, + const void* weight, const void* bias, float eps, + const cnnlTensorDescriptor_t y_desc, void* y, + const cnnlTensorDescriptor_t mean_rstd_desc, + void* saved_mean, void* saved_rstd); + + static void LayerNormBackward( + const ExecutionContext& ctx, int axis, + const cnnlTensorDescriptor_t x_desc, const void* x, + const cnnlTensorDescriptor_t diff_z_desc, const void* diff_z, + const cnnlTensorDescriptor_t weight_bias_desc, const void* weight, + const cnnlTensorDescriptor_t mean_rstd_desc, const void* saved_mean, + const void* saved_rstd, const cnnlTensorDescriptor_t diff_x_desc, + void* diff_x, void* diff_weight, void* diff_bias); + static void Transpose(const ExecutionContext& ctx, const std::vector perm, const int input_dim, const cnnlTensorDescriptor_t input_desc, @@ -1230,5 +1255,13 @@ inline void TransposeFromMLUTensor(const ExecutionContext& ctx, GetBasePtr(transformed_output)); } +template +inline void FillMLUTensorWithHostValue(const ExecutionContext& ctx, T value, + Tensor* out) { + MLUCnnlTensorDesc out_desc(*out); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value, out_desc.get(), + GetBasePtr(out)); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adam_op_mlu.cc b/paddle/fluid/operators/optimizers/adam_op_mlu.cc new file mode 100644 index 0000000000000..9d335021234eb --- /dev/null +++ b/paddle/fluid/operators/optimizers/adam_op_mlu.cc @@ -0,0 +1,285 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
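The reordered ApplyAdam wrapper above only forwards tensor descriptors to the vendor op; the AdamMLUKernel that follows mainly prepares lr, the betas, the beta powers, and epsilon as device-side scalars. For reference, a textbook scalar Adam step equivalent to what such a kernel delegates (illustrative only; the vendor op may fold the bias correction differently):

#include <cmath>

// One element of a plain Adam update with bias correction.
struct AdamState { float m, v, p; };

inline void AdamStep(AdamState& s, float g, float lr, float beta1, float beta2,
                     float beta1_pow, float beta2_pow, float eps) {
  s.m = beta1 * s.m + (1.f - beta1) * g;         // first moment
  s.v = beta2 * s.v + (1.f - beta2) * g * g;     // second moment
  const float m_hat = s.m / (1.f - beta1_pow);   // bias-corrected moments
  const float v_hat = s.v / (1.f - beta2_pow);
  s.p -= lr * m_hat / (std::sqrt(v_hat) + eps);  // parameter update
  // Callers then advance beta1_pow *= beta1 and beta2_pow *= beta2, which is
  // what the OpTensor multiplications in the kernel below implement.
}

int main() {
  AdamState s{0.f, 0.f, 1.f};
  AdamStep(s, /*g=*/0.5f, /*lr=*/1e-3f, 0.9f, 0.999f, 0.9f, 0.999f, 1e-8f);
  return s.p < 1.f ? 0 : 1;  // the parameter moved against the positive gradient
}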
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class AdamMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); + auto* param = ctx.Input("Param"); + auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + platform::errors::InvalidArgument( + "The Grad(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Grad").front(), + framework::ToTypeName(param_var->Type()))); + auto* grad = ctx.Input("Grad"); + auto* mom1 = ctx.Input("Moment1"); + auto* mom2 = ctx.Input("Moment2"); + auto* lr = ctx.Input("LearningRate"); + + auto* beta1_pow = ctx.Input("Beta1Pow"); + auto* beta2_pow = ctx.Input("Beta2Pow"); + + auto* param_out = ctx.Output("ParamOut"); + auto* mom1_out = ctx.Output("Moment1Out"); + auto* mom2_out = ctx.Output("Moment2Out"); + auto* beta1_pow_out = ctx.Output("Beta1PowOut"); + auto* beta2_pow_out = ctx.Output("Beta2PowOut"); + + bool skip_update = false; + if (ctx.HasInput("SkipUpdate")) { + auto* skip_update_tensor = ctx.Input("SkipUpdate"); + PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(SkipUpdate) size must be 1, but get %d", + skip_update_tensor->numel())); + std::vector skip_update_vec; + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); + skip_update = skip_update_vec[0]; + } + // skip_update=true, just copy input to output, and TensorCopy will call + // mutable_data + if (skip_update) { + VLOG(4) << "Adam skip update"; + framework::TensorCopy( + *param, ctx.GetPlace(), + ctx.template device_context(), param_out); + framework::TensorCopy( + *mom1, ctx.GetPlace(), + ctx.template device_context(), mom1_out); + framework::TensorCopy( + *mom2, ctx.GetPlace(), + ctx.template device_context(), mom2_out); + framework::TensorCopy( + *beta1_pow, beta1_pow->place(), + ctx.template device_context(), + beta1_pow_out); + framework::TensorCopy( + *beta2_pow, beta2_pow->place(), + ctx.template device_context(), + beta2_pow_out); + return; + } + + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + + param_out->ShareDataWith(*param); + mom1_out->ShareDataWith(*mom1); + mom2_out->ShareDataWith(*mom2); + + LoDTensor beta1_pow_tmp; + LoDTensor beta2_pow_tmp; + if (beta1_pow->place() == platform::CPUPlace()) { + T beta1 = *beta1_pow->data(); + beta1_pow_tmp.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc beta1_pow_tmp_desc(beta1_pow_tmp); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta1, + beta1_pow_tmp_desc.get(), GetBasePtr(&beta1_pow_tmp)); + beta1_pow = &beta1_pow_tmp; + } + if (beta2_pow->place() == platform::CPUPlace()) { + T beta2 = *beta2_pow->data(); + beta2_pow_tmp.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc beta2_pow_tmp_desc(beta2_pow_tmp); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta2, + beta2_pow_tmp_desc.get(), 
GetBasePtr(&beta2_pow_tmp)); + beta2_pow = &beta2_pow_tmp; + } + + VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() + << "beta2_pow.numel() : " << beta2_pow->numel(); + VLOG(3) << "param.numel(): " << param->numel(); + + PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "beta1 pow output size should be 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + const Tensor* beta1_tensor = nullptr; + const Tensor* beta2_tensor = nullptr; + const Tensor* epsilon_tensor = nullptr; + + Tensor beta1_tmp(experimental::DataType::FLOAT32); + Tensor beta2_tmp(experimental::DataType::FLOAT32); + Tensor epsilon_tmp(experimental::DataType::FLOAT32); + + if (ctx.HasInput("Beta1Tensor")) { + beta1_tensor = ctx.Input("Beta1Tensor"); + PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(Beta1Tensor) size must be 1, but get %d", + beta1_tensor->numel())); + } else { + T beta1 = static_cast(ctx.Attr("beta1")); + beta1_tmp.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc beta1_tmp_desc(beta1_tmp); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta1, beta1_tmp_desc.get(), + GetBasePtr(&beta1_tmp)); + beta1_tensor = &beta1_tmp; + } + + if (ctx.HasInput("Beta2Tensor")) { + beta2_tensor = ctx.Input("Beta2Tensor"); + PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(Beta2Tensor) size must be 1, but get %d", + beta2_tensor->numel())); + } else { + T beta2 = static_cast(ctx.Attr("beta2")); + beta2_tmp.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc beta2_tmp_desc(beta2_tmp); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &beta2, beta2_tmp_desc.get(), + GetBasePtr(&beta2_tmp)); + beta2_tensor = &beta2_tmp; + } + + if (ctx.HasInput("EpsilonTensor")) { + epsilon_tensor = ctx.Input("EpsilonTensor"); + PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(EpsilonTensor) size must be 1, but get %d", + epsilon_tensor->numel())); + } else { + T epsilon = static_cast(ctx.Attr("epsilon")); + epsilon_tmp.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc epsilon_tmp_desc(epsilon_tmp); + MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &epsilon, + epsilon_tmp_desc.get(), GetBasePtr(&epsilon_tmp)); + epsilon_tensor = &epsilon_tmp; + } + + MLUCnnlTensorDesc param_desc(*param); + MLUCnnlTensorDesc mom1_desc(*mom1); + MLUCnnlTensorDesc mom2_desc(*mom2); + MLUCnnlTensorDesc grad_desc(*grad); + MLUCnnl::ApplyAdam(ctx, param_desc.get(), GetBasePtr(param_out), + mom1_desc.get(), GetBasePtr(mom1_out), mom2_desc.get(), + GetBasePtr(mom2_out), grad_desc.get(), GetBasePtr(grad), + GetBasePtr(lr), GetBasePtr(beta1_tensor), + GetBasePtr(beta2_tensor), GetBasePtr(beta1_pow), + GetBasePtr(beta2_pow), GetBasePtr(epsilon_tensor), + /*use_nesterov*/ false); + + if (!use_global_beta_pow) { + beta1_pow_out->mutable_data(ctx.GetPlace()); + beta2_pow_out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc beta1_desc(*beta1_tensor); + MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), beta1_desc.get(), + GetBasePtr(beta1_pow), beta1_desc.get(), + GetBasePtr(beta1_tensor), beta1_desc.get(), + GetBasePtr(beta1_pow_out), ToCnnlDataType()); + + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), beta1_desc.get(), + 
GetBasePtr(beta2_pow), beta1_desc.get(), + GetBasePtr(beta2_tensor), beta1_desc.get(), + GetBasePtr(beta2_pow_out), ToCnnlDataType()); + } + } +}; + +template +class AdamWMLUKernel : public AdamMLUKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + VLOG(3) << "MLU AdamW Kernel"; + bool skip_update = false; + if (ctx.HasInput("SkipUpdate")) { + VLOG(3) << "Has SkipUpdate"; + auto* skip_update_tensor = ctx.Input("SkipUpdate"); + PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(SkipUpdate) size must be 1, but get %d", + skip_update_tensor->numel())); + std::vector skip_update_vec; + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); + skip_update = skip_update_vec[0]; + } + VLOG(3) << "Skip update" << skip_update; + bool with_decay = ctx.Attr("with_decay"); + if (!skip_update && with_decay) { + if (ctx.HasInput("MasterParam")) { + PADDLE_THROW(platform::errors::Unimplemented( + "Master Param is not supported on MLU")); + } else { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); + auto* param = ctx.Input("Param"); + auto* lr = ctx.Input("LearningRate"); + float coeff = ctx.Attr("coeff"); + + // update param with decay coeff: mul(-1 * lr, coeff * param) + param + MLUCnnlTensorDesc lr_desc(*lr); + MLUCnnlTensorDesc param_desc(*param); + MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), lr_desc.get(), GetBasePtr(lr), + param_desc.get(), GetBasePtr(param), param_desc.get(), + const_cast(GetBasePtr(param)), + ToCnnlDataType(), + /*alpha1*/ -1.f, /*alpha2*/ coeff, /*beta*/ 1.f); + } + } + AdamMLUKernel::Compute(ctx); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(adam, ops::AdamMLUKernel, + ops::AdamMLUKernel); + +REGISTER_OP_MLU_KERNEL(adamw, ops::AdamWMLUKernel, + ops::AdamWMLUKernel); diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index 90cf4128aae94..a9ee03e44738f 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -134,7 +134,7 @@ inline void StridedMemcpyWithAxis0( for (size_t i = 0; i < outputs->size(); ++i) { auto out_stride = stride_numel(shape_refer[i]->dims()); auto out = outputs->at(i); - if (out != nullptr && out->initialized()) { + if (out != nullptr && out->initialized() && out->numel() > 0) { StridedNumelCopyWithAxis(dev_ctx, axis, out->data(), out_stride, input.data() + input_offset, in_stride, out_stride[axis]); diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index eb82389702ca4..6da5d1244fbed 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -50,11 +50,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_bool(enable_cublas_tensor_op_math); DECLARE_uint64(gpu_memory_limit_mb); -#ifdef PADDLE_WITH_TESTING PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, false, "Whether to print the message of gpu memory usage " "at exit, mainly used for UT and CI."); -#endif 
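The AdamW kernel above folds decoupled weight decay into a single OpTensor multiply with alpha1 = -1, alpha2 = coeff and beta = 1. Assuming the usual OpTensor semantics out = (alpha1 * A) * (alpha2 * B) + beta * out for CNNL_OP_TENSOR_MUL, that call computes param <- param - lr * coeff * param before the regular Adam step. A minimal scalar reference of the same update:

#include <vector>

// Scalar reference for the fused decay step above: each parameter is scaled
// by (1 - lr * coeff) before the ordinary Adam update runs.
void DecoupledWeightDecay(std::vector<float>* param, float lr, float coeff) {
  for (float& p : *param) {
    p -= lr * coeff * p;
  }
}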
+PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb, true, + "Whether to print the message of gpu memory usage " + "MB as a unit of measurement."); constexpr static float fraction_reserve_gpu_memory = 0.05f; @@ -145,25 +146,32 @@ class RecordedGpuMallocHelper { mtx_.reset(new std::mutex()); } -#ifdef PADDLE_WITH_TESTING if (FLAGS_enable_gpu_memory_usage_log) { // A fake UPDATE to trigger the construction of memory stat instances, // make sure that they are destructed after RecordedGpuMallocHelper. MEMORY_STAT_UPDATE(Reserved, dev_id, 0); + MEMORY_STAT_UPDATE(Allocated, dev_id, 0); } -#endif } DISABLE_COPY_AND_ASSIGN(RecordedGpuMallocHelper); public: ~RecordedGpuMallocHelper() { -#ifdef PADDLE_WITH_TESTING if (FLAGS_enable_gpu_memory_usage_log) { - std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : " - << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) << std::endl; + if (FLAGS_enable_gpu_memory_usage_log_mb) { + std::cout << "[Memory Usage (MB)] gpu " << dev_id_ << " : Reserved = " + << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) / 1048576.0 + << ", Allocated = " + << MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) / 1048576.0 + << std::endl; + } else { + std::cout << "[Memory Usage (Byte)] gpu " << dev_id_ << " : Reserved = " + << MEMORY_STAT_PEAK_VALUE(Reserved, dev_id_) + << ", Allocated = " + << MEMORY_STAT_PEAK_VALUE(Allocated, dev_id_) << std::endl; + } } -#endif } static RecordedGpuMallocHelper *Instance(int dev_id) { diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index 4301ef4bcf126..61ea0fd3cd293 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -50,6 +50,8 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { return ncclInt64; } else if (type == framework::proto::VarType::FP16) { return ncclFloat16; + } else if (type == framework::proto::VarType::INT8) { + return ncclInt8; } else { PADDLE_THROW(platform::errors::Unimplemented( "This datatype in nccl is not supported.")); diff --git a/paddle/fluid/platform/device/ipu/ipu_device.cc b/paddle/fluid/platform/device/ipu/ipu_device.cc index b7a83b2ef1a61..2d0381cb8b3ea 100644 --- a/paddle/fluid/platform/device/ipu/ipu_device.cc +++ b/paddle/fluid/platform/device/ipu/ipu_device.cc @@ -39,7 +39,8 @@ const bool GetBoolEnv(const std::string& str) { int GetNumDevices() { bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); - if (ipu_model) { + bool compile_only = GetBoolEnv("IPU_COMPILE_ONLY"); + if (ipu_model || compile_only) { return 1; } int num_devices = @@ -52,7 +53,8 @@ int GetNumDevices() { std::vector GetDeviceIds() { bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); - if (ipu_model) { + bool compile_only = GetBoolEnv("IPU_COMPILE_ONLY"); + if (ipu_model || compile_only) { return {0}; } std::vector device_ids; diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index b020e4f219743..96c2b4f9a9ded 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -96,6 +96,7 @@ Executor::~Executor() { void Executor::Prepare(const std::string &proto) { VLOG(10) << "enter Executor::Prepare"; + compile_only_ = GetBoolEnv("IPU_COMPILE_ONLY"); AcquireDevice(); executor_resources_ = std::make_unique(); @@ -122,9 +123,18 @@ void Executor::Prepare(const std::string &proto) { } VLOG(10) << "Creating session from Onnx Model...done"; - VLOG(10) << "Preparing session device..."; - 
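IPU_COMPILE_ONLY and POPLAR_IPUMODEL are read through the existing GetBoolEnv helper before Prepare() decides between compiling an offline cache and preparing a real device. Below is a minimal sketch of that kind of boolean environment lookup; the accepted spellings are an assumption, and the in-tree helper may recognize other values.

#include <cstdlib>
#include <string>

// Minimal boolean env lookup in the spirit of GetBoolEnv above. Anything
// unset or unrecognized is treated as false.
static bool GetBoolEnvSketch(const std::string& name) {
  const char* raw = std::getenv(name.c_str());
  if (raw == nullptr) return false;
  const std::string value(raw);
  return value == "1" || value == "true" || value == "True";
}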
session_->prepareDevice(); - VLOG(10) << "Preparing session device...done"; + if (compile_only_) { + LOG(INFO) + << "Save the offline cache as offline_cache.popart in current path."; + VLOG(10) << "Compile only..."; + session_->compileAndExport("./offline_cache.popart"); + VLOG(10) << "Compile only...done"; + return; + } else { + VLOG(10) << "Preparing session device..."; + session_->prepareDevice(); + VLOG(10) << "Preparing session device...done"; + } SetWeightsIO(); @@ -141,6 +151,11 @@ void Executor::Prepare(const std::string &proto) { void Executor::Run(const std::vector &inputs, const std::vector &outputs, const framework::ExecutionContext &ctx) { + if (compile_only_) { + LOG(INFO) << "If IPU_COMPILE_ONLY=True, skip exe.run"; + return; + } + VLOG(10) << "enter Executor::Run"; // inputs std::map popart_inputs; @@ -222,6 +237,7 @@ void Executor::AcquireDevice() { bool use_ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); bool enable_distribution = ipu_strategy_->enable_distribution; if (use_ipu_model) { + VLOG(10) << "Create IPU model device..."; std::map deviceOpts{ { "numIPUs", std::to_string(ipu_strategy_->num_ipus), @@ -230,7 +246,21 @@ void Executor::AcquireDevice() { }; device_ = popart::DeviceManager::createDeviceManager().createIpuModelDevice( deviceOpts); + VLOG(10) << "Create IPU model device...done"; + } else if (compile_only_) { + VLOG(10) << "Create offline device..."; + std::map deviceOpts{ + { + "numIPUs", std::to_string(ipu_strategy_->num_ipus), + }, + {"ipuVersion", "ipu2"}, + }; + device_ = + popart::DeviceManager::createDeviceManager().createOfflineIPUDevice( + deviceOpts); + VLOG(10) << "Create offline device...done"; } else if (enable_distribution) { + VLOG(10) << "Create distribution device..."; auto ipus_per_replica = ipu_strategy_->num_ipus / ipu_strategy_->popart_options.replicatedGraphCount; auto device_id = popdist_get_device(ipus_per_replica); @@ -240,13 +270,16 @@ void Executor::AcquireDevice() { device_, errors::Unavailable("Can't attach IPU in distribution, ipu_num = %d.", RequestIpus(ipu_strategy_->num_ipus))); + VLOG(10) << "Create distribution device...done"; } else { + VLOG(10) << "Create IPU device..."; device_ = popart::DeviceManager::createDeviceManager().acquireAvailableDevice( RequestIpus(ipu_strategy_->num_ipus)); PADDLE_ENFORCE_NOT_NULL( device_, errors::Unavailable("Can't attach IPU, ipu_num = %d.", RequestIpus(ipu_strategy_->num_ipus))); + VLOG(10) << "Create IPU device...done"; } VLOG(10) << "leave Executor::AcquireDevice"; } diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.h b/paddle/fluid/platform/device/ipu/ipu_executor.h index c03a52a77a9d7..70c9477e69bab 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.h +++ b/paddle/fluid/platform/device/ipu/ipu_executor.h @@ -91,6 +91,7 @@ class Executor { const Scope *scope_ = nullptr; const IpuStrategy *ipu_strategy_ = nullptr; CompilerResources *compiler_resources_ = nullptr; + bool compile_only_ = false; // Deviceinfo for popart session std::shared_ptr device_; diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc index c9ac081f920da..74f262be8477a 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc @@ -57,14 +57,21 @@ Node *checkpointoutput_handler(Graph *graph, Node *node) { Node *custom_nll_loss_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto reduction = 
BOOST_GET_CONST(int, op->GetAttr("reduction")); - auto ignoreIndex = BOOST_GET_CONST(int, op->GetAttr("ignoreIndex")); + auto ignoreIndex = BOOST_GET_CONST(std::string, op->GetAttr("ignoreIndex")); auto inputIsLogProbability = BOOST_GET_CONST(bool, op->GetAttr("inputIsLogProbability")); - return CreateBaseOp(graph, node, "popart_nllloss_v2", node->inputs, - node->outputs, - {{"reduction", reduction}, - {"ignoreIndex", ignoreIndex}, - {"inputIsLogProbability", inputIsLogProbability}}); + if (ignoreIndex == "None") { + return CreateBaseOp(graph, node, "popart_nllloss_v2", node->inputs, + node->outputs, + {{"reduction", reduction}, + {"inputIsLogProbability", inputIsLogProbability}}); + } else { + return CreateBaseOp(graph, node, "popart_nllloss_v2", node->inputs, + node->outputs, + {{"reduction", reduction}, + {"ignoreIndex", std::atoi(ignoreIndex.c_str())}, + {"inputIsLogProbability", inputIsLogProbability}}); + } } Node *identity_handler(Graph *graph, Node *node) { diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 583014b6f4773..0dcab845bc9ca 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -130,6 +130,11 @@ XPUOpMap& get_kl2_ops() { {"elementwise_sub", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, + {"elementwise_mod", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, {"equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, @@ -302,11 +307,13 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::FP16, XPUPlace())})}, {"reshape2_grad", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 17736a87409af..94c0124440ea9 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -78,13 +78,6 @@ tf_pd MKLDNNBwdPrimitiveDesc(const Engine& e, const Primitive& p, inline void MatchShapeToLayout(framework::Tensor* tensor_in, framework::DataLayout from, framework::DataLayout to) { - // In these data layouts, channel dimension is either on 2nd position: nChw or - // at last nhwC, so for dim==2 these layouts are the same and nothing should - // be done. Similarly for dim==1 when you have just one possible combination. - if (tensor_in->dims().size() < 3) { - return; - } - auto print_dims = [](const std::vector& dims) { std::ostringstream oss; @@ -101,6 +94,15 @@ inline void MatchShapeToLayout(framework::Tensor* tensor_in, return oss.str(); }; + // In these data layouts, channel dimension is either on 2nd position: nChw or + // at last nhwC, so for dim==2 these layouts are the same and nothing should + // be done. Similarly for dim==1 when you have just one possible combination. 
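The custom_nll_loss handler above now carries ignoreIndex as a string attribute so that a Python-side None can be told apart from a genuine index: "None" means the popart attribute is omitted entirely, anything else is parsed with std::atoi. A small sketch of that convention; the helper name is illustrative, not part of the change.

#include <cstdlib>
#include <optional>
#include <string>

// "None" -> no ignore index; any other value -> a base-10 integer index.
static std::optional<int> ParseIgnoreIndex(const std::string& attr) {
  if (attr == "None") return std::nullopt;
  return std::atoi(attr.c_str());
}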
+ if (tensor_in->dims().size() < 3) { + VLOG(3) << "Keeping kMKLDNN/kNHWC/kNDHWC output_shape" + << print_dims(phi::vectorize(tensor_in->dims())); + return; + } + switch (from) { case framework::DataLayout::kMKLDNN: if ((to == framework::DataLayout::kNHWC) || @@ -571,6 +573,12 @@ inline void RegisterModelLayout( std::vector>& ops, const platform::Place& place) { if (platform::is_cpu_place(place)) { + // If there is already registered NHWC then quit this call + // not to overwrite setting with analysis of internal "while" op block + if (platform::MKLDNNDeviceContext::tls().get_cur_paddle_data_layout() == + framework::DataLayout::kNHWC) + return; + VLOG(4) << "RegisterModelLayout for mkldnn"; auto check_attrib = [](std::unique_ptr& op, const std::string& attrib_name) -> bool { diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 7af221b9ac82e..0473c29a3342b 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -37,6 +37,13 @@ extern PyTypeObject* p_tensor_type; PyObject* tensor_properties_get_name(TensorObject* self, void* closure) { EAGER_TRY + // NOTE(dev): [why not use egr::Controller::Instance::GernerateUniqueName()?] + // Beacause Controller must holder a tracer, but 'tensor.name' maybe called + // everywhere such as static mode in @to_static, which means tracer is None. + static egr::UniqueNameGenerator name_generator; + if (self->tensor.name().empty()) { + self->tensor.set_name(name_generator.Generate()); + } return ToPyObject(self->tensor.name()); EAGER_CATCH_AND_THROW_RETURN_NULL } diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index e10ae8254a79e..b195ed1aefadc 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -162,7 +162,7 @@ cc_library(context_pool SRCS context_pool.cc DEPS phi_context phi_enforce place) cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory context_pool) cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor) -cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform) +cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel copy_kernel tensor) cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform) cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform tensor_copy) diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 58827a98503ce..b00311061c9d0 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -15,12 +15,14 @@ limitations under the License. 
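tensor_properties_get_name above now assigns a name lazily from a function-local static egr::UniqueNameGenerator, because tensor.name can be queried in places where no tracer exists (for example under @to_static). Below is a minimal sketch of that lazy-naming pattern; the counter-based generator and the name prefix are illustrative assumptions, not the real UniqueNameGenerator.

#include <atomic>
#include <cstdint>
#include <string>

// Function-local static generator: thread-safe, needs no tracer, and only
// runs when a caller actually asks for a name.
static std::string GenerateUniqueNameSketch() {
  static std::atomic<uint64_t> counter{0};
  return "generated_tensor_" + std::to_string(counter.fetch_add(1));
}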
*/ #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/utils/storage.h" +#include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/transfer_layout_kernel.h" -#include "paddle/fluid/framework/data_device_transform.h" +#include "paddle/fluid/framework/tensor_util.h" namespace paddle { namespace experimental { @@ -139,9 +141,8 @@ inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor, VLOG(3) << "DataTypeTransform src_dtype: " << tensor.dtype() << " dst_dtype: " << dtype; - phi::DenseTensor out( - phi::make_intrusive(tensor.place()), - {dtype, tensor.dims(), tensor.layout()}); + DefaultAllocator alloc(tensor.place()); + phi::DenseTensor out(&alloc, {dtype, tensor.dims(), tensor.layout()}); if (platform::is_cpu_place(tensor.place())) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); @@ -158,6 +159,51 @@ inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor, return out; } +inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor, + Place dst_place) { + VLOG(3) << "DeviceTransform in, src_place " << tensor.place() + << " dst_place: " << dst_place; + + DefaultAllocator alloc(dst_place); + phi::DenseTensor out(&alloc, + {tensor.dtype(), tensor.dims(), tensor.layout()}); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + auto& pool = paddle::platform::DeviceContextPool::Instance(); + // NOTE(yy): TransDataPlace should wait for computation of input. + if (!platform::is_cuda_pinned_place(tensor.place())) { + pool.Get(tensor.place())->Wait(); + pool.Get(dst_place)->Wait(); + } else if (platform::is_gpu_place(dst_place)) { + auto* dev_ctx = static_cast(pool.Get(dst_place)); + phi::Copy(*dev_ctx, tensor, dst_place, false, &out); + + // Note: This is an empty callback, the only way is to "reference" + // tensor, so it will not be destructed until the kernels launched at + // current + // stream of given place is finished. + auto callback = [tensor, dst_place]() { + VLOG(4) << "Run callback of tensor:" << &tensor << " at place " + << dst_place; + }; + dev_ctx->AddStreamCallback(callback); + return out; + } +#endif + + // FIXME(zcd): TransDataPlace is used to transform data from GPU to CPU and + // the enforced checkings have been done in GetDeviceContext, so the + // `dev_ctx->Wait()` is necessary. But `dev_ctx->Wait()` will make the program + // slow, especially when the number of elements is little, for example, + // the elements of learning rate are one and it's CPU side. + // One solution is to use a CUDA kernel to complete the copy operation when + // the transforming is from CPU to GPU and the number of elements is little. + // But the embarrassment is that this solution this solution makes training + // slower. 
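The empty callback registered in TransDataPlace above exists purely to keep the source tensor alive: capturing it by value inside a callback queued behind the asynchronous copy pins its allocation until the stream has consumed it. A generic sketch of that capture-for-lifetime idiom, with a plain callback queue standing in for AddStreamCallback:

#include <functional>
#include <memory>
#include <vector>

// Stand-in for the device context's stream-callback queue.
static std::vector<std::function<void()>> stream_callbacks;

void KeepAliveUntilStreamDone(std::shared_ptr<std::vector<float>> src) {
  // ... an asynchronous copy reading from *src would be enqueued here ...
  stream_callbacks.push_back([src] {
    // Intentionally empty: the by-value capture alone extends src's lifetime.
  });
}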
+ paddle::framework::TensorCopySync(tensor, dst_place, &out); + return out; +} + phi::DenseTensor TransformData(const phi::DenseTensor& tensor, const phi::TensorArgDef& target_args_def, const TransformFlag& transform_flag) { @@ -174,10 +220,7 @@ phi::DenseTensor TransformData(const phi::DenseTensor& tensor, if (NeedTransformPlace( out.place(), target_args_def.backend, transform_flag)) { - phi::DenseTensor result; - framework::TransDataDevice( - out, phi::TransToPhiPlace(target_args_def.backend), &result); - out = result; + out = TransDataPlace(out, phi::TransToPhiPlace(target_args_def.backend)); } return out; } diff --git a/paddle/phi/kernels/cpu/adamw_kernel.cc b/paddle/phi/kernels/cpu/adamw_kernel.cc index 3a7869a062cf1..f2c98fded4d4f 100644 --- a/paddle/phi/kernels/cpu/adamw_kernel.cc +++ b/paddle/phi/kernels/cpu/adamw_kernel.cc @@ -17,6 +17,7 @@ #include #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" @@ -92,41 +93,101 @@ void AdamwDenseKernel(const Context& dev_ctx, return; } - auto* param_ = - master_param.is_initialized() ? master_param.get_ptr() : ¶m; + T beta1_ = beta1.to(); + T beta2_ = beta2.to(); + T epsilon_ = epsilon.to(); T coeff_ = static_cast(coeff); T lr_ratio_ = static_cast(lr_ratio); - funcs::AdamWFunctor functor( - coeff_, - lr_ratio_, - learning_rate.data(), - const_cast(param_->data())); - functor(param_->numel()); - - AdamDenseKernel(dev_ctx, - param, - grad, - learning_rate, - moment1, - moment2, - beta1_pow, - beta2_pow, - master_param, - skip_update, - beta1, - beta2, - epsilon, - lazy_mode, - min_row_size_to_use_multithread, - multi_precision, - use_global_beta_pow, - param_out, - moment1_out, - moment2_out, - beta1_pow_out, - beta2_pow_out, - master_param_outs); + VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel(); + VLOG(3) << "beta2_pow.numel() : " << beta2_pow.numel(); + VLOG(3) << "param.numel(): " << param.numel(); + + PADDLE_ENFORCE_EQ( + beta1_pow_out->numel(), + 1, + errors::InvalidArgument("beta1 pow output size should be 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ( + beta2_pow_out->numel(), + 1, + errors::InvalidArgument("beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + T beta1_p = beta1_pow.data()[0]; + T beta2_p = beta2_pow.data()[0]; + + if (!use_global_beta_pow) { + dev_ctx.template Alloc(beta1_pow_out)[0] = beta1_ * beta1_p; + dev_ctx.template Alloc(beta2_pow_out)[0] = beta2_ * beta2_p; + } + + T* param_out_ptr = dev_ctx.template Alloc(param_out); + T* mom1_out_ptr = dev_ctx.template Alloc(moment1_out); + T* mom2_out_ptr = dev_ctx.template Alloc(moment2_out); + T old_lr = learning_rate.data()[0]; + T learning_rate_ = + learning_rate.data()[0] * (sqrt(1 - beta2_p) / (1 - beta1_p)); + T eps = epsilon_ * sqrt(1 - beta2_p); + + int64_t numel = param.numel(); + + const T* param_ptr = param.data(); + const T* mom1_ptr = moment1.data(); + const T* mom2_ptr = moment2.data(); + const T* grad_ptr = grad.data(); + + auto adamw = + paddle::operators::jit::KernelFuncs, + phi::CPUPlace>::Cache() + .At(1); + + static constexpr int64_t chunk_size = 512; + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int64_t i = 0; i < numel / chunk_size; ++i) { + const int64_t offset = i * chunk_size; + adamw(beta1_, + beta2_, + -learning_rate_, + eps, + old_lr, + lr_ratio_, + 
coeff_, + chunk_size, + grad_ptr + offset, + mom1_ptr + offset, + mom2_ptr + offset, + param_ptr + offset, + mom1_out_ptr + offset, + mom2_out_ptr + offset, + param_out_ptr + offset); + } + + if (numel % chunk_size != 0) { + const int64_t offset = (numel / chunk_size) * chunk_size; + const int64_t tail_numel = numel % chunk_size; + adamw(beta1_, + beta2_, + -learning_rate_, + eps, + old_lr, + lr_ratio_, + coeff_, + tail_numel, + grad_ptr + offset, + mom1_ptr + offset, + mom2_ptr + offset, + param_ptr + offset, + mom1_out_ptr + offset, + mom2_out_ptr + offset, + param_out_ptr + offset); + } } } // namespace phi diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index 3f5e0b8a4d8ee..ee384cc75193c 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -88,6 +88,16 @@ PD_REGISTER_KERNEL(minimum_grad, int, int64_t, phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(elementwise_heaviside_grad, + CPU, + ALL_LAYOUT, + phi::ElementwiseHeavisideGradKernel, + float, + double, + int, + int64_t) {} + PD_REGISTER_KERNEL(elementwise_pow_grad, CPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc index 7478f69d915f1..286b0d0ffaad9 100644 --- a/paddle/phi/kernels/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc @@ -95,6 +95,18 @@ void ElementwisePowRawKernel(const Context& dev_ctx, dev_ctx, x, y, axis, funcs::ElementwisePowFunctor(), out); } +template +void ElementwiseHeavisideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + // allocate memory for out + dev_ctx.template Alloc(out); + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::ElementwiseHeavisideFunctor(), out); +} + } // namespace phi using complex64 = ::phi::dtype::complex; @@ -149,3 +161,11 @@ PD_REGISTER_KERNEL(elementwise_pow_raw, double, int, int64_t) {} +PD_REGISTER_KERNEL(elementwise_heaviside_raw, + CPU, + ALL_LAYOUT, + phi::ElementwiseHeavisideRawKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/elementwise_grad_kernel.h b/paddle/phi/kernels/elementwise_grad_kernel.h index 6f2f2915ecf9e..b1e6ecaee6746 100644 --- a/paddle/phi/kernels/elementwise_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_grad_kernel.h @@ -55,6 +55,15 @@ void MinimumGradKernel(const Context& dev_ctx, DenseTensor* dx, DenseTensor* dy); +template +void ElementwiseHeavisideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + template void ElementwisePowGradKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/elementwise_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc index 9d608cd86a6f7..5e29eb5ace675 100644 --- a/paddle/phi/kernels/elementwise_kernel.cc +++ b/paddle/phi/kernels/elementwise_kernel.cc @@ -64,6 +64,15 @@ void ElementwisePowKernel(const Context& dev_ctx, ElementwisePowRawKernel(dev_ctx, x, y, axis, out); } +template +void ElementwiseHeavisideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + ElementwiseHeavisideRawKernel(dev_ctx, x, y, axis, out); +} + } // namespace phi using complex64 = ::phi::dtype::complex; @@ -91,6 +100,14 @@ PD_REGISTER_KERNEL( modulo, CPU, ALL_LAYOUT, phi::ModuloKernel, float, double, int, 
int64_t) {} PD_REGISTER_KERNEL( floor_divide, CPU, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {} +PD_REGISTER_KERNEL(elementwise_heaviside, + CPU, + ALL_LAYOUT, + phi::ElementwiseHeavisideKernel, + float, + double, + int, + int64_t) {} PD_REGISTER_KERNEL(elementwise_pow, CPU, ALL_LAYOUT, @@ -126,6 +143,14 @@ PD_REGISTER_KERNEL( modulo, GPU, ALL_LAYOUT, phi::ModuloKernel, float, double, int, int64_t) {} PD_REGISTER_KERNEL( floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int, int64_t) {} +PD_REGISTER_KERNEL(elementwise_heaviside, + GPU, + ALL_LAYOUT, + phi::ElementwiseHeavisideKernel, + float, + double, + int, + int64_t) {} PD_REGISTER_KERNEL(elementwise_pow, KPS, ALL_LAYOUT, diff --git a/paddle/phi/kernels/elementwise_kernel.h b/paddle/phi/kernels/elementwise_kernel.h index 37fe895d4051f..a39da52e7e3b5 100644 --- a/paddle/phi/kernels/elementwise_kernel.h +++ b/paddle/phi/kernels/elementwise_kernel.h @@ -98,6 +98,19 @@ void ElementwisePowKernel(const Context& dev_ctx, const DenseTensor& y, DenseTensor* out); +template +void ElementwiseHeavisideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void ElementwiseHeavisideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + template DenseTensor Maximum(const Context& dev_ctx, const DenseTensor& x, @@ -142,6 +155,17 @@ DenseTensor FloorDivide(const Context& dev_ctx, return dense_out; } +template +DenseTensor ElementwiseHeaviside(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + ElementwiseHeavisideKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + template DenseTensor ElementwisePow(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index aafa40a3d01bf..38cd41d3b6130 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -242,6 +242,27 @@ __device__ __forceinline__ void LoadData( } } +template +__device__ __forceinline__ void LoadData( + T *dst, + const _ptr_ T *src, + uint32_t block_offset, + const kps::details::BroadcastConfig &config, + int numel, + int num, + int need_broadcast, + int read_lens) { + // numel : whole num of output + // num: how many data will be deal with in this time + if (need_broadcast) { + kps::ReadDataBc( + dst, src, block_offset, config, numel, read_lens); + } else { + kps::ReadData( + dst, src + block_offset, num, read_lens); + } +} + template , Arity> &configs, int num, int block_offset, + int read_lens, Functor func) { - InT args[Arity][VecSize]; - ConditionalT result[VecSize]; + __simd__ InT args[Arity][VecSize]; + __simd__ ConditionalT result[VecSize]; #pragma unroll for (int i = 0; i < Arity; i++) { - kps::Init(args[i], static_cast(1.0f)); + kps::Init(args[i], static_cast(1.0f), read_lens); LoadData(args[i], ins[i], block_offset, configs[i], numel, num, - use_broadcast[i]); + use_broadcast[i], + read_lens); } constexpr bool kCallElementwiseAny = paddle::platform::FunctionTraits::has_pointer_args; @@ -281,10 +304,10 @@ __device__ void VectorizedBroadcastKernelImpl( Functor, Arity, kCallElementwiseAny>()( - func, args, result); - - phi::funcs::ElementwiseWriteDataCaller()( - outs, result, block_offset, num); + func, args, result, read_lens); + phi::funcs:: + 
ElementwiseWriteDataCallerBc()( + outs, result, block_offset, num, read_lens); } template , Arity> configs, int main_offset, int tail_tid, + int read_lens, Functor func) { - int block_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; - int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + int block_offset = BLOCK_ID_X * BLOCK_NUM_X * read_lens; + int stride = BLOCK_NUM_X * GRID_NUM_X * read_lens; #ifdef PADDLE_WITH_XPU_KP for (; block_offset < main_offset; block_offset += stride) { @@ -320,8 +344,9 @@ __global__ void VectorizedBroadcastKernel( use_broadcast, numel, configs, - BLOCK_NUM_X * VecSize, + BLOCK_NUM_X * read_lens, block_offset, + read_lens, func); } int num = numel - block_offset; @@ -333,8 +358,15 @@ __global__ void VectorizedBroadcastKernel( NumOuts, VecSize, Rank, - true>( - ins, outs, use_broadcast, numel, configs, num, block_offset, func); + true>(ins, + outs, + use_broadcast, + numel, + configs, + num, + block_offset, + read_lens, + func); } #else if (block_offset < main_offset) { @@ -352,6 +384,7 @@ __global__ void VectorizedBroadcastKernel( configs, BLOCK_NUM_X * VecSize, block_offset, + read_lens, func); } else { VectorizedBroadcastKernelImpl( - ins, outs, use_broadcast, numel, configs, tail_tid, block_offset, func); + true>(ins, + outs, + use_broadcast, + numel, + configs, + tail_tid, + block_offset, + read_lens, + func); } #endif } @@ -392,6 +432,19 @@ void LaunchBroadcastKernel(const KPDevice &ctx, for (int i = 0; i < Arity; i++) { use_broadcast[i] = (ins[i]->numel() != numel); ins_data[i] = (const _ptr_ InT *)(ins[i]->data()); +#ifdef PADDLE_WITH_XPU_KP + if (i == 0) { + configs[i] = kps::details::BroadcastConfig(merge_dims.out_dims, + merge_dims.in_dims[0], + merge_dims.in_dims[1], + merge_dims.dim_size); + } else if (i == 1) { + configs[i] = kps::details::BroadcastConfig(merge_dims.out_dims, + merge_dims.in_dims[1], + merge_dims.in_dims[0], + merge_dims.dim_size); + } +#else if (use_broadcast[i]) { // get the broadcast config, // if data shape is[m, n], then you should set data_dim = {n, m} @@ -399,28 +452,50 @@ void LaunchBroadcastKernel(const KPDevice &ctx, configs[i] = kps::details::BroadcastConfig( merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size); } +#endif } #ifdef PADDLE_WITH_XPU_KP const int threads = 64; const int blocks = 8; - int main_offset = (numel / (VecSize * threads)) * VecSize * threads; - int tail_tid = numel % (VecSize * threads); + int read_lens = configs[0].buf_len; + int main_offset = (numel / (read_lens * threads)) * read_lens * threads; + int tail_tid = numel % (read_lens * threads); auto stream = ctx.x_context()->xpu_stream; - VectorizedBroadcastKernel<<>>(ins_data, - outs_data, - use_broadcast, - numel, - configs, - main_offset, - tail_tid, - func); + if (configs[0].cmp_type != kps::details::OptType::CanNotOptimize) { + main_offset = numel; + VectorizedBroadcastKernel<<>>(ins_data, + outs_data, + use_broadcast, + numel, + configs, + main_offset, + tail_tid, + read_lens, + func); + } else { + VectorizedBroadcastKernel<<>>(ins_data, + outs_data, + use_broadcast, + numel, + configs, + main_offset, + tail_tid, + read_lens, + func); + } #else const int threads = 256; int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads; @@ -440,6 +515,7 @@ void LaunchBroadcastKernel(const KPDevice &ctx, configs, main_offset, tail_tid, + VecSize, func); #endif } diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 332ec0b0312da..4ee46facc7913 100644 --- 
a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -577,14 +577,16 @@ template struct ElementwisePrimitiveCaller { __device__ inline void operator()(Functor func, InT (*args)[VecSize], - OutT *result) { + OutT *result, + int read_lens) { kps::ElementwiseAny( result, args, func); } @@ -594,7 +596,8 @@ template struct ElementwisePrimitiveCaller { __device__ inline void operator()(Functor func, InT (*args)[VecSize], - OutT *result) { + OutT *result, + int read_lens) { kps::ElementwiseConstant(result, func); } }; @@ -603,7 +606,8 @@ template struct ElementwisePrimitiveCaller { __device__ inline void operator()(Functor func, InT (*args)[VecSize], - OutT *result) { + OutT *result, + int read_lens) { kps::ElementwiseUnary( result, args[0], func); } @@ -613,9 +617,10 @@ template struct ElementwisePrimitiveCaller { __device__ inline void operator()(Functor func, InT (*args)[VecSize], - OutT *result) { + OutT *result, + int read_lens) { kps::ElementwiseBinary( - result, args[0], args[1], func); + result, args[0], args[1], func, read_lens); } }; @@ -623,7 +628,8 @@ template struct ElementwisePrimitiveCaller { __device__ inline void operator()(Functor func, InT (*args)[VecSize], - OutT *result) { + OutT *result, + int read_lens) { kps::ElementwiseTernary( result, args[0], args[1], args[2], func); } @@ -696,6 +702,42 @@ struct ElementwiseWriteDataCaller { } }; +template +struct ElementwiseWriteDataCallerBc { + __device__ __forceinline__ void operator()( + phi::Array<_ptr_ OutT *, NumOuts> outs, + ConditionalT src[VecSize], + int block_offset, + int num, + int read_lens) { + OutT dst[NumOuts][VecSize]; +#pragma unroll + for (int i = 0; i < read_lens; ++i) { +#pragma unroll + for (int j = 0; j < NumOuts; ++j) { + dst[j][i] = (src[i])[j]; + } + } +#pragma unroll + for (int i = 0; i < NumOuts; ++i) { + kps::WriteData( + outs[i] + block_offset, dst[i], num, read_lens); + } + } +}; + +template +struct ElementwiseWriteDataCallerBc { + __device__ __forceinline__ void operator()(phi::Array<_ptr_ OutT *, 1> outs, + OutT src[VecSize], + int block_offset, + int num, + int read_lens) { + kps::WriteData( + outs[0] + block_offset, src, num, read_lens); + } +}; + template +struct ElementwiseHeavisideFunctor { + inline HOSTDEVICE T operator()(const T a, const T b) const { + return a == static_cast(0) ? 
b : static_cast(a > 0); + } +}; + template struct FloorDivideFunctor { inline HOSTDEVICE T operator()(const T a, const T b) const { diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index 42fee14488373..df14b0a21f24d 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -473,7 +473,11 @@ struct ReduceConfig { bool not_higher = x_dim[0] >= max_grid_z; #endif if (reduce_last_dim && (reduce_rank == 1)) { +#ifdef PADDLE_WITH_XPU_KP + reduce_type = static_cast(ReduceType::kReduceAny); +#else reduce_type = static_cast(ReduceType::kReduceLastDim); +#endif } else if (reduce_rank == 1) { reduce_type = static_cast(ReduceType::kReduceHigherDim); if (rank == 3 && not_higher) { @@ -588,7 +592,7 @@ struct ReduceConfig { void SetBlockDim() { // init should_reduce_again = false; - dim3 block_dim; + dim3 block_dim(1, 1, 1); dim3 grid_dim(left_num, 1, 1); blocking_size = reduce_num; diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index c814e7b3bb63d..3e7430fd84eaf 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -128,6 +128,16 @@ PD_REGISTER_KERNEL(minimum_grad, int64_t, phi::dtype::float16, phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(elementwise_heaviside_grad, + GPU, + ALL_LAYOUT, + phi::ElementwiseHeavisideGradKernel, + float, + double, + int, + int64_t) {} + PD_REGISTER_KERNEL(elementwise_pow_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/impl/einsum_grad_impl.h b/paddle/phi/kernels/impl/einsum_grad_impl.h index bd0143379ce15..2b087f8dcae09 100644 --- a/paddle/phi/kernels/impl/einsum_grad_impl.h +++ b/paddle/phi/kernels/impl/einsum_grad_impl.h @@ -148,14 +148,16 @@ void EinsumGradKernel(const Context& dev_ctx, right = splits[1].substr(1); auto equation_for_A = - right + "," + ops[1] + "->" + gather_labels_except_reduction(ops[0]); + ops[1] + "," + right + "->" + gather_labels_except_reduction(ops[0]); auto equation_for_B = right + "," + ops[0] + "->" + gather_labels_except_reduction(ops[1]); auto operands_for_A = std::vector(); auto operands_for_B = std::vector(); DenseTensor dA, dB; - operands_for_A.push_back(&out_grad); + // dA = einsum(B, dC) operands_for_A.push_back(x[1]); + operands_for_A.push_back(&out_grad); + // dB = einsum(dC, A) operands_for_B.push_back(&out_grad); operands_for_B.push_back(x[0]); diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h index 73940a45cbde2..901147734b29f 100644 --- a/paddle/phi/kernels/impl/einsum_impl.h +++ b/paddle/phi/kernels/impl/einsum_impl.h @@ -13,6 +13,7 @@ // limitations under the License. 
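For reference, the elementwise_heaviside functor defined just above computes out = y when x == 0 and 1 when x > 0 (0 otherwise), and the gradient functors added later in this change return dL/dx = 0 and dL/dy = dout only where x == 0. A scalar sketch of the forward rule:

#include <cassert>

template <typename T>
T HeavisideRef(T x, T y) {
  // y is only used exactly at the step, where the function value is y.
  return x == static_cast<T>(0) ? y : static_cast<T>(x > static_cast<T>(0));
}

int main() {
  assert(HeavisideRef(-2.0, 0.5) == 0.0);
  assert(HeavisideRef(0.0, 0.5) == 0.5);
  assert(HeavisideRef(3.0, 0.5) == 1.0);
  return 0;
}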
#pragma once +#include #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/reduce_sum_kernel.h" @@ -55,7 +56,8 @@ inline static void ValidationCheck(const std::string& equation) { enum LabelType { ALL_TYPE = 0, Batch = 1, // ABO - Free, // AO, BO + AO, // AO -- free label + BO, // BO -- free label Contraction, // AB Reduction, // A, B }; @@ -125,18 +127,32 @@ inline std::vector union_labels(const std::vector& a, return res; } +// Apply transforms to all_labels and get another all_labels +inline std::vector TransformLabelsOrder( + const std::vector& all_labels, + const LabelMap& type, + std::vector new_order) { + std::vector ret; + for (auto cnt_type : new_order) { + std::vector tmp; + for (int c : all_labels) { + if (type[c] == cnt_type) tmp.push_back(c); + std::sort(tmp.begin(), tmp.end()); + } + ret.insert(ret.end(), tmp.begin(), tmp.end()); + } + return ret; +} + inline static void GlobalInfo(const std::vector& op_labels, const std::string& right, LabelMap* label2type, std::vector* sorted_labels) { - // sorted_labels: ['.', , ] - VLOG(5) << "GlobalInfo: " - << paddle::string::join_strings(*sorted_labels, ","); std::vector all; LabelMap counter(0); for (auto& ch : right) { // char int c = ch; - (*label2type)[c] = LabelType::Free; + (*label2type)[c] = LabelType::BO; } for (auto& op : op_labels) { @@ -146,39 +162,36 @@ inline static void GlobalInfo(const std::vector& op_labels, all.push_back(ch); } counter[c] += 1; - if ((*label2type)[c] != LabelType::Free && counter[c] == 2) + if ((*label2type)[c] != LabelType::BO && counter[c] == 2) (*label2type)[c] = LabelType::Contraction; else if (counter[c] == 2) (*label2type)[c] = LabelType::Batch; } } + + // BO is represent Free, so we need find the AO. + for (int c : op_labels[0]) { + if ((*label2type)[c] == LabelType::BO) (*label2type)[c] = LabelType::AO; + } + (*label2type)['.'] = LabelType::Batch; - std::for_each(all.begin(), all.end(), [sorted_labels, label2type](int c) { - if ((*label2type)[c] == LabelType::Batch) - sorted_labels->push_back(static_cast(c)); - }); - std::for_each(all.begin(), all.end(), [sorted_labels, label2type](int c) { - if ((*label2type)[c] == LabelType::Free) - sorted_labels->push_back(static_cast(c)); - }); - std::for_each(all.begin(), all.end(), [sorted_labels, label2type](int c) { - if ((*label2type)[c] == LabelType::Contraction) - sorted_labels->push_back(static_cast(c)); - }); - std::for_each(all.begin(), all.end(), [&sorted_labels, label2type](int c) { - if ((*label2type)[c] == LabelType::Reduction) - sorted_labels->push_back(static_cast(c)); - }); - VLOG(5) << "GlobalInfo: sorted_labels before: " - << paddle::string::join_strings(*sorted_labels, ","); + + *sorted_labels = TransformLabelsOrder(all, + *label2type, + {LabelType::Batch, + LabelType::AO, + LabelType::BO, + LabelType::Contraction, + LabelType::Reduction}); + if (counter[static_cast('.')] > 0) { std::vector tmp; tmp.push_back('.'); // push '.' 
in the front *sorted_labels = union_labels(tmp, *sorted_labels); - VLOG(5) << "GlobalInfo: sorted_labels after: " - << paddle::string::join_strings(*sorted_labels, ","); } + VLOG(5) << "GlobalInfo: sorted_labels after: " + << paddle::string::join_strings(*sorted_labels, ","); } inline static void InferLabelShape(const std::vector& op_labels, @@ -289,17 +302,20 @@ inline static void ParseEinsumEquation( *right = results[1].substr(1); ReplaceEllipsis(*right); auto op_labels = paddle::string::split_string(left, ","); + // split_string("i,") -> ["i"], we expect 2 op_labels. + if (left[left.size() - 1] == ',') op_labels.push_back(""); std::for_each(op_labels.begin(), op_labels.end(), ReplaceEllipsis); GlobalInfo(op_labels, *right, labeltype, all_labels); InferLabelShape(op_labels, inputs, labelshape, ellipsis_dims, broadcast_dims); - VLOG(5) << "Einsum Infershape: right:" << right; - VLOG(5) << "Einsum Infershape: op_labels:" - << paddle::string::join_strings(op_labels, "\n"); + VLOG(5) << "Einsum Infershape: right:" << *right; + VLOG(5) << "Einsum Infershape: left :" + << paddle::string::join_strings(op_labels, '\n'); InferOutputDims(*right, *broadcast_dims, *labelshape, output_dims); for (size_t i = 0; i < inputs.size(); ++i) { InferLabelPerm( op_labels[i], ellipsis_dims->at(i).size(), &((*label2perms)[i])); } + VLOG(5) << "Einsum Infershape: end"; } template @@ -327,10 +343,12 @@ std::vector GetShapeByType(const std::vector& all_labels, const LabelMap& perm, const LabelMap& label2shape, const std::vector& ellipsis, - LabelType filter) { + std::set filter) { std::vector res; for (T c : all_labels) { - if ((filter == LabelType::ALL_TYPE || type[c] == filter) && perm[c] != -1) { + if ((filter.count(LabelType::ALL_TYPE) || + filter.count(LabelType(type[c]))) && + perm[c] != -1) { if (c == '.') res.insert(res.end(), ellipsis.begin(), ellipsis.end()); else @@ -390,7 +408,8 @@ DenseTensor PerformContraction( const LabelMap& label2type, const LabelMap& label2shape, const std::vector>& ellipsis_dims, - const std::vector& broadcast_dims) { + const std::vector& broadcast_dims, + std::vector cache) { // Get All the Batches, so perm is auto all_valid = LabelMap(1); auto recover_dim = GetShapeByType(all_labels, @@ -398,36 +417,74 @@ DenseTensor PerformContraction( all_valid, label2shape, broadcast_dims, - LabelType::Batch); + {LabelType::Batch}); auto preprocess = [&](const DenseTensor& t, const LabelMap& perm, - const std::vector& ellipsis) -> DenseTensor { - auto frees = GetShapeByType( - all_labels, label2type, perm, label2shape, ellipsis, LabelType::Free); + const std::vector& ellipsis, + int operand_idx) -> DenseTensor { + // reshape + auto frees = GetShapeByType(all_labels, + label2type, + perm, + label2shape, + ellipsis, + {LabelType::AO, LabelType::BO}); auto conts = GetShapeByType(all_labels, label2type, perm, label2shape, ellipsis, - LabelType::Contraction); - auto trans_t = PerformTranspose( - dev_ctx, t, perm, all_labels, ellipsis, label2type); - auto mul_dims = GetShapeByType( - all_labels, label2type, perm, label2shape, ellipsis, LabelType::Batch); + {LabelType::Contraction}); + std::vector reordered_all_labels = all_labels; + if (operand_idx == 1) { + reordered_all_labels = TransformLabelsOrder(all_labels, + label2type, + {LabelType::Batch, + LabelType::Contraction, + LabelType::AO, + LabelType::BO, + LabelType::Reduction}); + } + // reduction + DenseTensor trans_t; + if (cache[operand_idx]->IsInitialized()) { + trans_t.ShareBufferWith(*(cache[operand_idx])); + } else { + auto reduct_t = 
PerformReduction( + dev_ctx, t, perm, all_labels, ellipsis, label2type); + trans_t = PerformTranspose( + dev_ctx, reduct_t, perm, reordered_all_labels, ellipsis, label2type); + cache[operand_idx]->ShareBufferWith(trans_t); + } + auto mul_dims = GetShapeByType(all_labels, + label2type, + perm, + label2shape, + ellipsis, + {LabelType::Batch}); recover_dim.insert(recover_dim.end(), frees.begin(), frees.end()); - mul_dims.push_back( - std::accumulate(frees.begin(), frees.end(), 1, std::multiplies())); - mul_dims.push_back( - std::accumulate(conts.begin(), conts.end(), 1, std::multiplies())); + if (operand_idx == 0) { + mul_dims.push_back(std::accumulate( + frees.begin(), frees.end(), 1, std::multiplies())); + mul_dims.push_back(std::accumulate( + conts.begin(), conts.end(), 1, std::multiplies())); + } else { + mul_dims.push_back(std::accumulate( + conts.begin(), conts.end(), 1, std::multiplies())); + mul_dims.push_back(std::accumulate( + frees.begin(), frees.end(), 1, std::multiplies())); + } VLOG(5) << "PerformContraction: mul_dims: " << paddle::string::join_strings(mul_dims, ","); trans_t.Resize(make_ddim(mul_dims)); return trans_t; }; - auto trans_a = preprocess(A, label2perm[0], ellipsis_dims[0]); - auto trans_b = preprocess(B, label2perm[1], ellipsis_dims[1]); + + // Reduction, Reshape and Matmul + auto trans_a = preprocess(A, label2perm[0], ellipsis_dims[0], 0); + auto trans_b = preprocess(B, label2perm[1], ellipsis_dims[1], 1); auto after_contraction = - Matmul(dev_ctx, trans_a, trans_b, false, true); + Matmul(dev_ctx, trans_a, trans_b, false, false); VLOG(5) << "PerformContraction: recover_dim: " << paddle::string::join_strings(recover_dim, ","); after_contraction.Resize(make_ddim(recover_dim)); @@ -465,10 +522,11 @@ void TransposeToOutput(const Context& dev_ctx, } template -void EinsumKernel(const Context& dev_ctx, - const std::vector& inputs, - const std::string& equation, - DenseTensor* out) { +void EinsumKernelImpl(const Context& dev_ctx, + const std::vector& inputs, + const std::string& equation, + DenseTensor* out, + std::vector cache) { ValidationCheck(equation); // collect the following informations to prepare einsum. 
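After the label bookkeeping above, the two-operand einsum reduces to a single batched matmul: A is flattened to [batch, free_A, contract] and B to [batch, contract, free_B], so Matmul can be called without transposes. A shape-only sketch of that flattening, using the same accumulate-with-multiplies pattern as mul_dims above:

#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

static int64_t Product(const std::vector<int64_t>& dims) {
  return std::accumulate(dims.begin(), dims.end(), static_cast<int64_t>(1),
                         std::multiplies<int64_t>());
}

// Result shape [batch..., M, N], where M and N flatten the A-only and B-only
// labels and the contraction labels are summed away by the matmul.
std::vector<int64_t> ContractionOutShape(const std::vector<int64_t>& batch,
                                         const std::vector<int64_t>& free_a,
                                         const std::vector<int64_t>& free_b) {
  std::vector<int64_t> out = batch;
  out.push_back(Product(free_a));
  out.push_back(Product(free_b));
  return out;
}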
LabelMap labelshape(0); @@ -498,22 +556,18 @@ void EinsumKernel(const Context& dev_ctx, if (inputs.size() == 2) { auto& A = inputs[0]; auto& B = inputs[1]; - // Reduce Procedure - auto reduce_A = PerformReduction( - dev_ctx, *A, label2perms[0], all_labels, ellipsis_dims[0], labeltype); - auto reduce_B = PerformReduction( - dev_ctx, *B, label2perms[1], all_labels, ellipsis_dims[1], labeltype); - // Contract Procedure + // Reduction and Contract Procedure dev_ctx.template Alloc(out); auto after_contraction = PerformContraction(dev_ctx, - reduce_A, - reduce_B, + *A, + *B, label2perms, all_labels, labeltype, labelshape, ellipsis_dims, - broadcast_dims); + broadcast_dims, + cache); TransposeToOutput(dev_ctx, after_contraction, right, @@ -545,4 +599,18 @@ void EinsumKernel(const Context& dev_ctx, } } +template +void EinsumKernel(const Context& dev_ctx, + const std::vector& inputs, + const std::string& equation, + DenseTensor* out) { + std::vector cache(inputs.size()); // set empty; TA, TB, TdC + std::vector cache_tensor( + inputs.size()); // set empty; TA, TB, TdC + for (size_t i = 0; i < inputs.size(); ++i) { + cache_tensor[i] = &cache[i]; + } + EinsumKernelImpl(dev_ctx, inputs, equation, out, cache_tensor); +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index fa1f15672b903..5d365786001a3 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -683,6 +683,43 @@ struct MinGradDy { } }; +template +struct HeavisideGradDx { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast(0); + } +}; + +template +struct HeavisideGradDy { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast(x == static_cast(0)); + } +}; + +template +void ElementwiseHeavisideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + funcs::ElementwiseGradPreProcess(dout, dx); + phi::funcs:: + ElemwiseGradCompute, HeavisideGradDy>( + dev_ctx, + x, + y, + dout, + dout, + axis, + dx, + dy, + HeavisideGradDx(), + HeavisideGradDy()); +} + template struct PowGradDX { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu index 821fda52ab102..d387096a70b75 100644 --- a/paddle/phi/kernels/kps/elementwise_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -54,6 +54,8 @@ void FloorDivideKernel(const Context& dev_ctx, int axis = -1; FloorDivideRawKernel(dev_ctx, x, y, axis, out); } +// Create the definition of Heaviside +DEFINE_CUDA_ELEMENTWISE_OP(ElementwiseHeaviside) // Create the definition of Pow DEFINE_CUDA_ELEMENTWISE_OP(ElementwisePow) template @@ -130,6 +132,14 @@ PD_REGISTER_KERNEL(floor_divide_raw, phi::FloorDivideRawKernel, int, int64_t) {} +PD_REGISTER_KERNEL(elementwise_heaviside_raw, + KPS, + ALL_LAYOUT, + phi::ElementwiseHeavisideRawKernel, + float, + double, + int, + int64_t) {} PD_REGISTER_KERNEL(elementwise_pow_raw, KPS, ALL_LAYOUT, diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h index e02f4450a8bab..fabc6c0d13e7c 100644 --- a/paddle/phi/kernels/primitive/compute_primitives.h +++ b/paddle/phi/kernels/primitive/compute_primitives.h @@ -271,6 +271,20 @@ __device__ __forceinline__ void 
ElementwiseBinary(OutT* out, } } +template +__device__ __forceinline__ void ElementwiseBinary( + OutT* out, const InT* in1, const InT* in2, OpFunc compute, int read_lens) { +#pragma unroll + for (int idx = 0; idx < NX * NY; ++idx) { + out[idx] = static_cast(compute(in1[idx], in2[idx])); + } +} + /** * @brief Ternary calculation according to OpFunc. Shape of input and output * are the same. diff --git a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h index 4d65dd6dd5d87..eb45def836edc 100644 --- a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h @@ -17,6 +17,7 @@ #include "xpu/kernel/cluster_header.h" #include "xpu/kernel/debug.h" #include "xpu/kernel/math.h" +#include "xpu/kernel/simd_header.h" namespace phi { namespace kps { @@ -158,6 +159,19 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, } } +template +__device__ __forceinline__ void ElementwiseBinary( + OutT* out, const InT* in1, const InT* in2, OpFunc compute, int read_lens) { + for (int idx = 0; idx < read_lens; ++idx) { + out[idx] = static_cast(compute(in1[idx], in2[idx])); + } +} + /** * @brief Ternary calculation according to OpFunc. Shape of input and output * are the same. @@ -329,14 +343,12 @@ __device__ __forceinline__ void Reduce(T* out, ReduceFunctor reducer, bool reduce_last_dim) { if (Mode == details::kGlobalMode) { + if (reduce_last_dim) { #pragma unroll - for (int i = 0; i < NY; ++i) { -#pragma unroll - for (int j = 0; j < NX; ++j) { - out[i] = reducer(out[i], in[i * NX + j]); + for (int i = 0; i < NY * NX; i++) { // reduce along blockDim.x + details::BlockXReduce(&out[i], reducer); } } - details::BlockXReduce(out, reducer); } else { // else kLocalMode #pragma unroll for (int i = 0; i < NY; ++i) { diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h index 993349f2d9e14..ea1a830f89ab5 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives.h +++ b/paddle/phi/kernels/primitive/datamover_primitives.h @@ -246,6 +246,14 @@ __device__ __forceinline__ void Init(T* dst, T init_data) { } } +template +__device__ __forceinline__ void Init(T* dst, T init_data, int read_lens) { +#pragma unroll + for (int i = 0; i < NX; i++) { + dst[i] = init_data; + } +} + /** * The difference from the above function is that * it supports different data types of inputs. @@ -311,6 +319,38 @@ __device__ __forceinline__ void ReadData(T* dst, } } +template +__device__ __forceinline__ void ReadData(T* dst, + const T* __restrict__ src, + int num, + int read_lens) { + if (IsBoundary) { // blockDim.x * NX > num + int thread_offset = threadIdx.x * NX; +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + if (idx + thread_offset < num) { + dst[idx] = src[thread_offset + idx]; + } + } + } else { // blockDim,x * NX < num + constexpr int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1; + constexpr int kVectorsPerThread = NX / kVectorSize; + int thread_offset = threadIdx.x * kVectorsPerThread; + + using VecType = details::VectorType; + const VecType* vec_input = reinterpret_cast(src); + VecType vec_temp[kVectorsPerThread]; + +#pragma unroll + for (int i = 0; i < kVectorsPerThread; ++i) { + vec_temp[i] = vec_input[thread_offset + i]; +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + dst[idx] = *(reinterpret_cast(vec_temp) + idx); + } + } + } +} /** * @brief Read 1D data from global memory to register. 
The difference * from the above function is that it supports different data types of inputs. @@ -576,6 +616,36 @@ __device__ __forceinline__ void WriteData(T* dst, } } +template +__device__ __forceinline__ void WriteData(T* dst, + T* __restrict__ src, + int num, + int read_lens) { + if (IsBoundary) { + int thread_offset = threadIdx.x * NX; +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + if ((thread_offset + idx) < num) { + dst[thread_offset + idx] = src[idx]; + } + } + } else { + // Vector type + constexpr int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1; + constexpr int kVectorsPerThread = NX / kVectorSize; + + int thread_offset = threadIdx.x * kVectorsPerThread; + using VecType = details::VectorType; + VecType* vec_dst = reinterpret_cast(dst); + VecType vec_temp[kVectorsPerThread]; +#pragma unroll + for (int idx = 0; idx < kVectorsPerThread; ++idx) { + vec_temp[idx] = *(reinterpret_cast(src) + idx); + vec_dst[thread_offset + idx] = vec_temp[idx]; + } + } +} + /** * @brief Write 2D data from register to global memory according to Tx type, and * store it as Ty type. @@ -749,6 +819,40 @@ __device__ __forceinline__ void ReadDataBc( } } +template +__device__ __forceinline__ void ReadDataBc( + T* dst, + const T* __restrict__ src, + uint32_t block_offset, + details::BroadcastConfig config, + int total_num_output, + int read_lens) { + uint32_t thread_offset = block_offset + threadIdx.x * NX; + uint32_t index_src = 0; + +#pragma unroll + for (uint32_t nx = 0; nx < NX; ++nx) { + uint32_t index_output = thread_offset + nx; + index_src = 0; + if (IsBoundary) { + if (index_output >= total_num_output) { + break; + } + } +#pragma unroll + for (int i = 0; i < Rank; ++i) { + auto fast_divmoder = config.divmoders[i].Divmod(index_output); + index_output = fast_divmoder.val[0]; + index_src += fast_divmoder.val[1] * config.strides[i]; + } + dst[nx] = src[index_src]; + } +} /** * @brief Initialize register with data index. 
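The ReadDataBc overload just above maps every linear output index back to a linear input index under broadcasting; the kernel does this with precomputed fast divmods and input strides. The host-side Python sketch below only illustrates the arithmetic of that mapping with plain strides (helper names are illustrative, not part of the kernel API), using the in[1, 35] -> out[32, 35] example from the surrounding doc comments.

# Host-side sketch of the broadcast index mapping computed by ReadDataBc.
# For each linear output index, peel off one coordinate per dimension,
# wrap it with the input dimension size (size-1 dims map to coordinate 0),
# and accumulate input strides. The real kernel uses fast divmods instead.
def row_major_strides(shape):
    strides = [1] * len(shape)
    for i in range(len(shape) - 2, -1, -1):
        strides[i] = strides[i + 1] * shape[i + 1]
    return strides

def broadcast_index(index_output, out_shape, in_shape):
    strides_out = row_major_strides(out_shape)
    strides_in = row_major_strides(in_shape)
    index_src = 0
    for i in range(len(out_shape)):
        coord, index_output = divmod(index_output, strides_out[i])
        index_src += (coord % in_shape[i]) * strides_in[i]
    return index_src

# in[1, 35] broadcast against out[32, 35]: every output row reads the same
# 35 input elements.
assert broadcast_index(0, [32, 35], [1, 35]) == 0
assert broadcast_index(35, [32, 35], [1, 35]) == 0   # row 1, col 0
assert broadcast_index(36, [32, 35], [1, 35]) == 1   # row 1, col 1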
* diff --git a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h index a18fc7cbb3119..eb25632378a58 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h @@ -21,6 +21,39 @@ namespace phi { namespace kps { namespace details { +enum class OptType { // Optimize type of calc after input shape compressed + CanNotOptimize = -1, // can not optimize, broadcast first + N_1, // just like {1} op {100} or {100} op {1} + MN_N, // just like {100} op {3, 100} or {3, 100} op {100} + MN_M, // just like {3} op {3, 100} or {3, 100} op {3} + MNK_1N1, // just like {3} op {2, 3, 100} or {2, 3, 100} op {3} + MNK_M1K, // just like {2, 1, 100} op {2, 3, 100} or {2, 3, 100} op {2, 1, + // 100} +}; + +// Rules to determine whether dimensions can be merged +// rule 0 - xshape[idx] == yshape[idx] +// rule 1 - xshape[idx] == 1 && yshape[idx] != 1 +// rule 2 - xshape[idx] != 1 && yshape[idx] == 1 +static int judge_case(int a, int b) { + if (a == b) { + return 0; + } else if (a == 1 && b != 1) { + return 1; + } else if (a != 1 && b == 1) { + return 2; + } + return -1; +} + +static bool case_is_same(int case_front, int case_back) { + if (case_front == case_back) { + return true; + } else { + return false; + } +} + template struct alignas(sizeof(T) * VecSize) VectorType { T val[VecSize]; @@ -37,11 +70,20 @@ struct BroadcastConfig { int strides_in[phi::DDim::kMaxRank]; int strides_out[phi::DDim::kMaxRank]; int in_dim[phi::DDim::kMaxRank]; + int dim_after_cmp[phi::DDim::kMaxRank]; + int dim_size_after_cmp = 0; + int cmp_res = 0; + OptType cmp_type = OptType::CanNotOptimize; + int m = 1; + int n = 1; + int k = 1; + int buf_len = 0; HOSTDEVICE BroadcastConfig() {} HOSTDEVICE BroadcastConfig(const std::vector& out_dims, const std::vector& in_dims, + const std::vector& another_in_dims, int dim_size) { std::vector strides_in_tmp; std::vector strides_out_tmp; @@ -61,18 +103,187 @@ struct BroadcastConfig { memcpy(strides_in, strides_in_tmp.data(), kDims * sizeof(int)); memcpy(strides_out, strides_out_tmp.data(), kDims * sizeof(int)); memcpy(in_dim, dim_tmp.data(), kDims * sizeof(int)); + + cmp_res = get_mnk_for_broadcast_ops(in_dims, another_in_dims); + get_opt_type(another_in_dims); + buf_len = get_buf_len(); + } + + int get_buf_len() { + if (cmp_type == OptType::CanNotOptimize) { + return 256; + } + int max_buf_len = 512; + int buf_len = m / 16 * 16; + if (buf_len == 0) { + buf_len = m; + } + return std::min(max_buf_len, buf_len); } __device__ inline int operator()(int index_output) const { int index_src = 0; -#pragma unroll - for (int i = kDims - 1; i >= 0; --i) { - int tmp_index = (index_output / strides_out[i]); - index_output = index_output - tmp_index * strides_out[i]; - index_src += (tmp_index % in_dim[i]) * strides_in[i]; + + switch (cmp_type) { + int div, mod, tmp_index; + case OptType::MNK_M1K: + div = index_output / (m * n); + mod = index_output % (m * n) % m; + index_src = div * m + mod; + break; + case OptType::MNK_1N1: + // index_src = index_output / m % n; + index_src = index_output % (m * n) / m; + break; + case OptType::N_1: + index_src = 0; + break; + case OptType::MN_N: + index_src = index_output / m; + break; + case OptType::MN_M: + index_src = index_output % m; + break; + case OptType::CanNotOptimize: + for (int i = kDims - 1; i >= 0; --i) { + tmp_index = (index_output / strides_out[i]); + index_output = index_output - tmp_index * strides_out[i]; + index_src += 
(tmp_index % in_dim[i]) * strides_in[i]; + } + break; } return index_src; } + + void get_opt_type(const std::vector& y_dim_after_cmp) { + if (dim_size_after_cmp == 1) { + if (dim_after_cmp[0] == 1 && y_dim_after_cmp[0] != 1) { // {1} op {n} + n = y_dim_after_cmp[0]; + cmp_type = OptType::N_1; + } else if (dim_after_cmp[0] != 1 && + y_dim_after_cmp[0] == 1) { // {n} op {1} + n = dim_after_cmp[0]; + cmp_type = OptType::N_1; + } else { + cmp_type = OptType::CanNotOptimize; // xshape == yshape + } + } + if (dim_size_after_cmp == 2) { + if (dim_after_cmp[0] == 1 && dim_after_cmp[1] != 1 && + y_dim_after_cmp[0] != 1 && + y_dim_after_cmp[1] != 1) { // {n} op {m, n} + m = y_dim_after_cmp[0]; + n = y_dim_after_cmp[1]; + cmp_type = OptType::MN_N; + } else if (dim_after_cmp[0] != 1 && dim_after_cmp[1] == 1 && + y_dim_after_cmp[0] != 1 && + y_dim_after_cmp[1] != 1) { // {m} op {m, n} + m = y_dim_after_cmp[0]; + n = y_dim_after_cmp[1]; + cmp_type = OptType::MN_M; + } else if (dim_after_cmp[0] != 1 && dim_after_cmp[1] != 1 && + y_dim_after_cmp[0] == 1 && + y_dim_after_cmp[1] != 1) { // {m, n} op {n} + m = dim_after_cmp[0]; + n = dim_after_cmp[1]; + cmp_type = OptType::MN_N; + } else if (dim_after_cmp[0] != 1 && dim_after_cmp[1] != 1 && + y_dim_after_cmp[0] != 1 && + y_dim_after_cmp[1] == 1) { // {m, n} op {m} + m = dim_after_cmp[0]; + n = dim_after_cmp[1]; + cmp_type = OptType::MN_M; + } else { + cmp_type = OptType::CanNotOptimize; + } + } + if (dim_size_after_cmp == 3) { + if (dim_after_cmp[0] == 1 && dim_after_cmp[1] != 1 && + dim_after_cmp[2] == 1 && y_dim_after_cmp[0] != 1 && + y_dim_after_cmp[1] != 1 && + y_dim_after_cmp[2] != 1) { // {1, n, 1} op {m, n, k} + m = y_dim_after_cmp[0]; + n = y_dim_after_cmp[1]; + k = y_dim_after_cmp[2]; + cmp_type = OptType::MNK_1N1; + } else if (dim_after_cmp[0] != 1 && dim_after_cmp[1] != 1 && + dim_after_cmp[2] != 1 && y_dim_after_cmp[0] == 1 && + y_dim_after_cmp[1] != 1 && + y_dim_after_cmp[2] == 1) { // {m, n, k} op {1, n, 1} + m = dim_after_cmp[0]; + n = dim_after_cmp[1]; + k = dim_after_cmp[2]; + cmp_type = OptType::MNK_1N1; + } else if (dim_after_cmp[0] != 1 && dim_after_cmp[1] == 1 && + dim_after_cmp[2] != 1 && y_dim_after_cmp[0] != 1 && + y_dim_after_cmp[1] != 1 && + y_dim_after_cmp[2] != 1) { // {m, 1, k} op {m, n, k} + m = y_dim_after_cmp[0]; + n = y_dim_after_cmp[1]; + k = y_dim_after_cmp[2]; + cmp_type = OptType::MNK_M1K; + } else if (dim_after_cmp[0] != 1 && dim_after_cmp[1] != 1 && + dim_after_cmp[2] != 1 && y_dim_after_cmp[0] != 1 && + y_dim_after_cmp[1] == 1 && + y_dim_after_cmp[2] != 1) { // {m, n, k} op {m, 1, k} + m = dim_after_cmp[0]; + n = dim_after_cmp[1]; + k = dim_after_cmp[2]; + cmp_type = OptType::MNK_M1K; + } else { + cmp_type = OptType::CanNotOptimize; + } + } + } + + int get_mnk_for_broadcast_ops(const std::vector& xshape, + const std::vector& yshape) { + int idx = 0; + int cmp_x = 0; + int cmp_y = 0; + bool is_same = false; + std::vector xshape_after_remove_ones = xshape; + std::vector yshape_after_remove_ones = yshape; + // first step: remove excess ones + std::vector::iterator x_iter = xshape_after_remove_ones.begin(); + std::vector::iterator y_iter = yshape_after_remove_ones.begin(); + for (; x_iter != xshape_after_remove_ones.end();) { + if (*x_iter == 1 && *y_iter == 1) { + x_iter = xshape_after_remove_ones.erase(x_iter); + y_iter = yshape_after_remove_ones.erase(y_iter); + } else { + x_iter++; + y_iter++; + } + } + // second step: compress dims + int after_cmp_idx = 0; + for (int i = 0; i < 3; i++) { + cmp_x = 
xshape_after_remove_ones[idx]; + cmp_y = yshape_after_remove_ones[idx]; + while ((idx + 1) < xshape_after_remove_ones.size()) { + is_same = case_is_same(judge_case(xshape_after_remove_ones[idx], + yshape_after_remove_ones[idx]), + judge_case(xshape_after_remove_ones[idx + 1], + yshape_after_remove_ones[idx + 1])); + if (is_same) { + cmp_x = cmp_x * xshape_after_remove_ones[idx + 1]; + cmp_y = cmp_y * yshape_after_remove_ones[idx + 1]; + idx++; + } else { + break; + } + } + idx = idx + 1; + dim_after_cmp[after_cmp_idx] = cmp_x; + after_cmp_idx++; + if (idx == xshape_after_remove_ones.size()) { + dim_size_after_cmp = after_cmp_idx; + return 0; + } + } + return -1; // can not compress dims + } }; #pragma pack() @@ -199,6 +410,14 @@ __device__ __inline__ void Init(T* dst, T init_data) { } } +template +__device__ __inline__ void Init(T* dst, T init_data, int read_lens) { +#pragma unroll + for (int i = 0; i < read_lens; i++) { + dst[i] = init_data; + } +} + /** * The difference from the above function is that * it supports different data types of inputs. @@ -251,6 +470,26 @@ __device__ __inline__ void ReadData(T* dst, } } +template +__device__ __inline__ void ReadData(T* dst, + const T _global_ptr_* src, + int num, + int read_lens) { + int thread_offset = core_id() * read_lens; + __local__ T in_temp[1]; + if (IsBoundary) { // core_num() * read_lens > num +#pragma unroll + for (int idx = 0; idx < read_lens; ++idx) { + if (idx + thread_offset < num) { + GM2LM(src + thread_offset + idx, in_temp, sizeof(T)); + dst[idx] = in_temp[0]; + } + } + } else { // core_num() * read_lens < num + GM2LM(src + thread_offset, dst, read_lens * sizeof(T)); + } +} + /** * @brief Read 1D data from global memory to register. The difference * from the above function is that it supports different data types of inputs. @@ -479,10 +718,32 @@ __device__ __forceinline__ void ReadDataReduce( * size: The current block needs to load size elements continuously. */ +template +__device__ void WriteData(T _global_ptr_* dst, + const T* src, + int num, + int read_lens) { + int thread_offset = core_id() * read_lens; + __local__ T in_temp[1]; + + if (IsBoundary) { // core_num() * read_lens > num +#pragma unroll + for (int idx = 0; idx < read_lens; ++idx) { + if (idx + thread_offset < num) { + in_temp[0] = src[idx]; + LM2GM(in_temp, dst + idx + thread_offset, sizeof(T)); + } + } + } else { // core_num() * read_lens < num + LM2GM(src, dst + thread_offset, read_lens * sizeof(T)); + } +} + template __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) { int thread_offset = core_id() * NX; __local__ T in_temp[1]; + if (IsBoundary) { // core_num() * NX > num #pragma unroll for (int idx = 0; idx < NX; ++idx) { @@ -675,6 +936,331 @@ __device__ __inline__ void ReadDataBc( } } +/** + * @brief Read data from global memory to local memory with broadcast + * {m, 1, k}-> {m, n, k} form. + * + * @template paraments + * T: Data type of register. + * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. + * + * @param: + * dst: The register pointer of the thread, the size is NX. + * src: The original input data pointer of kernel. + * thread_offset: The data offset of this thread. + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * read_lens: The number of data continuously loaded by each thread. 
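The shape compression above works in two steps: drop axes where both operands are 1, then greedily merge neighbouring axes that fall under the same broadcast case (equal, x==1, or y==1), so the whole problem collapses to at most three dims (m, n, k). A rough Python sketch of the same two steps follows; it assumes both shapes are already padded to the same rank and returns both compressed shapes, which is a simplification of the real get_mnk_for_broadcast_ops.

# Sketch of the two-step compression behind get_mnk_for_broadcast_ops.
# The kernel additionally caps the result at three dims; more than three
# means the CanNotOptimize fallback.
def judge_case(a, b):
    if a == b:
        return 0
    if a == 1 and b != 1:
        return 1
    if a != 1 and b == 1:
        return 2
    return -1  # not broadcast-compatible

def compress_dims(xshape, yshape):
    xs = [x for x, y in zip(xshape, yshape) if not (x == 1 and y == 1)]
    ys = [y for x, y in zip(xshape, yshape) if not (x == 1 and y == 1)]
    x_out, y_out, i = [], [], 0
    while i < len(xs):
        cx, cy, case = xs[i], ys[i], judge_case(xs[i], ys[i])
        while i + 1 < len(xs) and judge_case(xs[i + 1], ys[i + 1]) == case:
            cx *= xs[i + 1]
            cy *= ys[i + 1]
            i += 1
        x_out.append(cx)
        y_out.append(cy)
        i += 1
    return x_out, y_out

assert compress_dims([1, 100], [3, 100]) == ([1, 100], [3, 100])            # MN_N-like
assert compress_dims([2, 3, 4], [2, 3, 4]) == ([24], [24])                  # fully merged
assert compress_dims([2, 1, 100], [2, 3, 100]) == ([2, 1, 100], [2, 3, 100])  # MNK_M1K-like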
+ */ +template +__device__ __inline__ void ReadDataBcM1kMnk( + T* dst, + const T _global_ptr_* src, + int thread_offset, + const details::BroadcastConfig& config, + int read_lens) { + int index_output = thread_offset; + int index_base = config(index_output); + int m = config.m; + int n = config.n; + + int m_pos = index_base % m; + if ((m - m_pos) < read_lens) { + int last_col = m - m_pos; + GM2LM(src + index_base, dst, last_col * sizeof(T)); + int n_pos = index_output % (m * n) / m; + int next_part_index = 0; + if (n_pos != config.n - 1) { + next_part_index = index_base / m * m; + } else { + next_part_index = (index_base / m + 1) * m; + } + GM2LM(src + next_part_index, + dst + last_col, + (read_lens - last_col) * sizeof(T)); + } else { + GM2LM(src + index_base, dst, read_lens * sizeof(T)); + } +} + +/** + * @brief Read data from global memory to local memory with broadcast + * {m, 1}-> {m, n} form. + * + * @template paraments + * T: Data type of register. + * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. + * + * @param: + * dst: The register pointer of the thread, the size is NX. + * src: The original input data pointer of kernel. + * thread_offset: The data offset of this thread. + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * read_lens: The number of data continuously loaded by each thread. + */ +template +__device__ __inline__ void ReadDataBcM1Mn( + T* dst, + const T _global_ptr_* src, + int thread_offset, + const details::BroadcastConfig& config, + int read_lens) { + int index_output = thread_offset; + int index_base = config(index_output); + int m = config.m; + int n = config.n; + + int m_pos = index_base % m; + if ((m - m_pos) < read_lens) { + int last_col = m - m_pos; + GM2LM(src + index_base, dst, last_col * sizeof(T)); + GM2LM(src, dst + last_col, (read_lens - last_col) * sizeof(T)); + } else { + GM2LM(src + index_base, dst, read_lens * sizeof(T)); + } +} + +/** + * @brief Read data from global memory to local memory with broadcast + * {1, n}-> {m, n} form. + * + * @template paraments + * T: Data type of register. + * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. + * + * @param: + * dst: The register pointer of the thread, the size is NX. + * src: The original input data pointer of kernel. + * thread_offset: The data offset of this thread. + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * read_lens: The number of data continuously loaded by each thread. 
+ */ +template +__device__ __inline__ void ReadDataBc1NMn( + T* dst, + const T _global_ptr_* src, + int thread_offset, + const details::BroadcastConfig& config, + int read_lens) { + int index_output = thread_offset; + int index_base = config(index_output); + int m = config.m; + int n = config.n; + T in_temp; + + int m_pos = index_output % m; + if ((m - m_pos) < read_lens) { + int last_col = m - m_pos; + GM2LM(src + index_base, &in_temp, sizeof(T)); + for (int i = 0; i < last_col; i++) { + dst[i] = in_temp; + } + GM2LM(src + index_base + 1, &in_temp, sizeof(T)); + for (int i = 0; i < read_lens - last_col; i++) { + dst[last_col + i] = in_temp; + } + } else { + GM2LM(src + index_base, &in_temp, sizeof(T)); + for (int i = 0; i < read_lens; i++) { + dst[i] = in_temp; + } + } +} + +/** + * @brief Read data from global memory to local memory with broadcast + * {1, n, 1}-> {m, n, k} form. + * + * @template paraments + * T: Data type of register. + * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. + * + * @param: + * dst: The register pointer of the thread, the size is NX. + * src: The original input data pointer of kernel. + * thread_offset: The data offset of this thread. + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * read_lens: The number of data continuously loaded by each thread. + */ +template +__device__ __inline__ void ReadDataBc1N1Mnk( + T* dst, + const T _global_ptr_* src, + int thread_offset, + const details::BroadcastConfig& config, + int read_lens) { + int index_output = thread_offset; + int index_base = config(index_output); + int m = config.m; + int n = config.n; + T in_temp; + + int m_pos = index_output % m; + if ((m - m_pos) < read_lens) { + int last_col = m - m_pos; + GM2LM(src + index_base, &in_temp, sizeof(T)); + for (int i = 0; i < last_col; i++) { + dst[i] = in_temp; + } + int n_pos = index_output % (m * n) / m; + int next_part_index = 0; + if (n_pos != n - 1) { + next_part_index = n_pos + 1; + } else { + next_part_index = 0; + } + GM2LM(src + next_part_index, &in_temp, sizeof(T)); + for (int i = 0; i < read_lens - last_col; i++) { + dst[last_col + i] = in_temp; + } + } else { + GM2LM(src + index_base, &in_temp, sizeof(T)); + for (int i = 0; i < read_lens; i++) { + dst[i] = in_temp; + } + } +} + +/** + * @brief Read data from global memory to local memory with broadcast + * {1}-> {n} form. + * + * @template paraments + * T: Data type of register. + * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. + * + * @param: + * dst: The register pointer of the thread, the size is NX. + * src: The original input data pointer of kernel. + * thread_offset: The data offset of this thread. + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * read_lens: The number of data continuously loaded by each thread. + */ +template +__device__ __inline__ void ReadDataBc1N( + T* dst, + const T _global_ptr_* src, + int thread_offset, + const details::BroadcastConfig& config, + int read_lens) { + int index_output = thread_offset; + int index_base = config(index_output); + T in_temp; + + GM2LM(src + index_base, &in_temp, sizeof(T)); + for (int i = 0; i < read_lens; i++) { + dst[i] = in_temp; + } +} + +/** + * @brief Read data from global memory to local memory with broadcast + * form which can not compress. 
+ * + * @template paraments + * T: Data type of register. + * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. + * + * @param: + * dst: The register pointer of the thread, the size is NX. + * src: The original input data pointer of kernel. + * thread_offset: The data offset of this thread. + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * total_num_output: Total number of original output. + * read_lens: The number of data continuously loaded by each thread. + */ +template +__device__ __inline__ void ReadDataBcCanNotCmp( + T* dst, + const T _global_ptr_* src, + int thread_offset, + const details::BroadcastConfig& config, + int total_num_output, + int read_lens) { + int index_output = thread_offset; + int index_base = config(index_output); + T in_temp; + int cache_size = 256; + __local__ T src_temp[cache_size]; + GM2LM(src + index_base, src_temp, cache_size * sizeof(T)); + + for (int nx = 0; nx < read_lens; ++nx) { + index_output = thread_offset + nx; + if (IsBoundary) { + if (index_output >= total_num_output) { + break; + } + } + int index_src = config(index_output); + if (index_src >= index_base && index_src < index_base + cache_size) { + in_temp = src_temp[index_src - index_base]; + } else { + GM2LM(src + index_src, &in_temp, sizeof(T)); + } + dst[nx] = in_temp; + } +} + +/** + * @brief Read 1D data from global memory to register with broadcast form. + * + * @template paraments + * T: The type of data stored in the global memory. + * NX: The number of data continuously loaded by each thread. + * NY: The number of data rows loaded by each thread, only NY = 1 was supported. + * BlockSize: Identifies the current device thread index method. For xpu, + * core_id() is used as the index. + * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. + * IsBoundary: Indicates whether to perform block access storage out-of-bounds + * judgment. When the number of data processed by the block is less than + * NX x NY x core_num(), boundary judgment is required to avoid memory access + * crossing the boundary. + * + * @param: + * dst: The register pointer of the thread, the size is NX * NY. + * src: The original input data pointer of kernel. + * block_offset: The data offset of this block, core_num() * blockIdx.x * NX; + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * read_lens: The number of data continuously loaded by each thread. + * total_num_output: Total number of original output. 
+ */ +template +__device__ __inline__ void ReadDataBc( + T* dst, + const T _global_ptr_* src, + uint32_t block_offset, + const details::BroadcastConfig& config, + int total_num_output, + int read_lens) { + int thread_offset = block_offset + core_id() * read_lens; + + if (config.cmp_type == details::OptType::MNK_M1K) { + ReadDataBcM1kMnk(dst, src, thread_offset, config, read_lens); + } else if (config.cmp_type == details::OptType::N_1) { + ReadDataBc1N(dst, src, thread_offset, config, read_lens); + } else if (config.cmp_type == details::OptType::MN_M) { + ReadDataBcM1Mn(dst, src, thread_offset, config, read_lens); + } else if (config.cmp_type == details::OptType::MN_N) { + ReadDataBc1NMn(dst, src, thread_offset, config, read_lens); + } else if (config.cmp_type == details::OptType::MNK_1N1) { + ReadDataBc1N1Mnk(dst, src, thread_offset, config, read_lens); + } else { + ReadDataBcCanNotCmp( + dst, src, thread_offset, config, total_num_output, read_lens); + } +} + /** * @brief Initialize register with data index. * diff --git a/paddle/phi/kernels/primitive/kernel_primitives.h b/paddle/phi/kernels/primitive/kernel_primitives.h index b5a1e88acc32b..ea5846c3a2418 100644 --- a/paddle/phi/kernels/primitive/kernel_primitives.h +++ b/paddle/phi/kernels/primitive/kernel_primitives.h @@ -46,6 +46,7 @@ #define KPStream gpuStream_t #define KPDevice phi::GPUContext #define _ptr_ +#define __simd__ #define THREAD_ID_X threadIdx.x #define THREAD_ID_Y threadIdx.y diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index e54e39f5541d5..c19bf67be2611 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -22,7 +22,7 @@ limitations under the License. */ #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" @@ -203,38 +203,19 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } // 4. 
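Each specialised reader above corresponds to one closed-form index mapping on the compressed (m, n, k) shape, and ReadDataBc simply dispatches on cmp_type, with the stride walk as the fallback. The Python check below verifies those closed forms against numpy broadcasting; it assumes, as the formulas read, that m is the fastest-varying compressed axis (so the numpy shapes are written with m last), and the shapes and names are illustrative only.

import numpy as np

# Check the closed forms in BroadcastConfig::operator() against numpy
# broadcasting. Assumption: m is the fastest-varying axis, then n, then k,
# which is why the numpy shapes below end with m.
m, n, k = 4, 3, 2

def check(x, out_shape, closed_form):
    ref = np.broadcast_to(x, out_shape).reshape(-1)
    flat = x.reshape(-1)
    for o in range(ref.size):
        assert ref[o] == flat[closed_form(o)]

# MN_M:    {m} op {m, n}        -> index_src = o % m
check(np.arange(m).reshape(1, m), (n, m), lambda o: o % m)
# MN_N:    {n} op {m, n}        -> index_src = o // m
check(np.arange(n).reshape(n, 1), (n, m), lambda o: o // m)
# MNK_1N1: {1, n, 1} op {m, n, k} -> index_src = o % (m * n) // m
check(np.arange(n).reshape(1, n, 1), (k, n, m), lambda o: o % (m * n) // m)
# MNK_M1K: {m, 1, k} op {m, n, k} -> index_src = o // (m * n) * m + o % (m * n) % m
check(np.arange(k * m).reshape(k, 1, m), (k, n, m),
      lambda o: o // (m * n) * m + o % (m * n) % m)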
scatter - // x_grad->ResizeAndAllocate(x.non_zero_elements().dims()); - DenseTensorMeta index_meta(DataType::INT32, {rulebook_len}, DataLayout::NCHW); - DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); - DenseTensor unique_key = phi::Empty( - dev_ctx, - DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), - {rulebook_len}, - DataLayout::NCHW)); - DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); - - SortedAndUniqueIndex(dev_ctx, - rulebook_ptr + rulebook_len, - rulebook_len, - &out_index, - &unique_key, - &unique_value); - config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, rulebook_len * in_channels, 1); - phi::funcs::sparse::ScatterKernel<<>>( + phi::funcs::ScatterCUDAKernel<<>>( d_x_features_ptr, - unique_value.data(), - out_index.data(), - x.nnz(), + rulebook_ptr + rulebook_len, + x_grad_values_ptr, rulebook_len, in_channels, - x_grad_values_ptr, - subm); + false); } template diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc index 13a5a6fd4a449..c760c966b0647 100644 --- a/paddle/phi/ops/compat/elementwise_sig.cc +++ b/paddle/phi/ops/compat/elementwise_sig.cc @@ -95,6 +95,16 @@ KernelSignature ElementwiseFloorDivOpArgumentMapping( return KernelSignature("floor_divide_raw", {"X", "Y"}, {"axis"}, {"Out"}); } +KernelSignature ElementwiseHeavisideOpArgumentMapping( + const ArgumentMappingContext& ctx) { + int axis = paddle::any_cast(ctx.Attr("axis")); + if (axis == -1) { + return KernelSignature("elementwise_heaviside", {"X", "Y"}, {}, {"Out"}); + } + return KernelSignature( + "elementwise_heaviside_raw", {"X", "Y"}, {"axis"}, {"Out"}); +} + KernelSignature ElementwisePowOpArgumentMapping( const ArgumentMappingContext& ctx) { int axis = paddle::any_cast(ctx.Attr("axis")); @@ -208,6 +218,15 @@ KernelSignature ElementwiseMinGradOpArgumentMapping( return KernelSignature( "minimum_grad", {"X", "Y", "Out@GRAD"}, {"axis"}, {"X@GRAD", "Y@GRAD"}); } + +KernelSignature ElementwiseHeavisideGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("elementwise_heaviside_grad", + {"X", "Y", "Out@GRAD"}, + {"axis"}, + {"X@GRAD", "Y@GRAD"}); +} + KernelSignature ElementwisePowGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("elementwise_pow_grad", @@ -258,6 +277,8 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_mod, phi::ElementwiseModOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_floordiv, phi::ElementwiseFloorDivOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elementwise_heaviside, + phi::ElementwiseHeavisideOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_pow, phi::ElementwisePowOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_add_grad, @@ -292,5 +313,7 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_max_grad, phi::ElementwiseMaxGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_min_grad, phi::ElementwiseMinGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elementwise_heaviside_grad, + phi::ElementwiseHeavisideGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_pow_grad, phi::ElementwisePowGradOpArgumentMapping); diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index f4a09436d86ce..36154b23f3f12 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -57,7 +57,6 @@ rem ------initialize common variable------ if not defined GENERATOR set GENERATOR="Visual Studio 15 2017 Win64" if not defined WITH_TENSORRT set WITH_TENSORRT=ON if not 
defined TENSORRT_ROOT set TENSORRT_ROOT=D:/TensorRT -if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=Auto if not defined WITH_GPU set WITH_GPU=ON if not defined WITH_MKL set WITH_MKL=ON if not defined WITH_AVX set WITH_AVX=ON @@ -229,6 +228,7 @@ set MSVC_STATIC_CRT=OFF set ON_INFER=OFF set WITH_TENSORRT=ON set WITH_INFERENCE_API_TEST=OFF +if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=Auto call :cmake || goto cmake_error call :build || goto build_error @@ -243,6 +243,7 @@ set WITH_GPU=OFF set WITH_AVX=OFF set MSVC_STATIC_CRT=ON set ON_INFER=OFF +if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=Auto call :cmake || goto cmake_error call :build || goto build_error @@ -260,6 +261,7 @@ set ON_INFER=ON set WITH_TENSORRT=ON set WITH_INFERENCE_API_TEST=ON set WITH_ONNXRUNTIME=ON +if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=Auto call :cmake || goto cmake_error call :build || goto build_error @@ -274,7 +276,7 @@ rem ------Build windows avx whl package------ :CASE_build_avx_whl set WITH_AVX=ON set ON_INFER=OFF -set CUDA_ARCH_NAME=All +if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=All call :cmake || goto cmake_error call :build || goto build_error @@ -285,7 +287,7 @@ rem ------Build windows no-avx whl package------ :CASE_build_no_avx_whl set WITH_AVX=OFF set ON_INFER=OFF -set CUDA_ARCH_NAME=All +if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=All call :cmake || goto cmake_error call :build || goto build_error @@ -296,7 +298,8 @@ rem ------Build windows inference library------ :CASE_build_inference_lib set ON_INFER=ON set WITH_PYTHON=OFF -set CUDA_ARCH_NAME=All +if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=All + python %work_dir%\tools\remove_grad_op_and_kernel.py if %errorlevel% NEQ 0 exit /b 1 diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 0e1d0660322bd..3a2c51fe72b20 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -959,7 +959,7 @@ function check_whl_size() { whldiffSize=`echo $(($pr_whl_size - $dev_whl_size))` if [ ${whldiffSize} -gt 10 ]; then - approval_line=`curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` + approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 22334008 22361972` echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "FALSE" ]; then @@ -1053,13 +1053,13 @@ function generate_api_spec() { function check_approvals_of_unittest() { set +x - if [ "$GIT_PR_ID" == "" ]; then + if [ "$GITHUB_API_TOKEN" == "" ] || [ "$GIT_PR_ID" == "" ]; then return 0 fi # approval_user_list: XiaoguangHu01 46782768,luotao1 6836917,phlrain 43953930,lanxianghit 47554610, zhouwei25 52485244, kolinwei 22165420 check_times=$1 if [ $check_times == 1 ]; then - approval_line=`curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` + approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` if [ "${approval_line}" != "" ]; then APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 22165420 52485244` echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" @@ -1073,7 +1073,7 @@ function check_approvals_of_unittest() { elif [ $check_times == 2 ]; then unittest_spec_diff=`python 
${PADDLE_ROOT}/tools/diff_unittest.py ${PADDLE_ROOT}/paddle/fluid/UNITTEST_DEV.spec ${PADDLE_ROOT}/paddle/fluid/UNITTEST_PR.spec` if [ "$unittest_spec_diff" != "" ]; then - approval_line=`curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` + approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 22165420 52485244 32428676 45041955` echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "FALSE" ]; then @@ -1106,7 +1106,7 @@ function check_approvals_of_unittest() { EOF if [ `echo "20 < $AllDiffSize"|bc` -eq 1 ] ; then - approval_line=`curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` + approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 39303645 328693` echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "FALSE" ]; then @@ -2853,7 +2853,7 @@ function test_op_benchmark() { # The PR will pass quickly when get approval from specific person. # Xreki 12538138, luotao1 6836917, ZzSean 32410583 set +x - approval_line=$(curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000) + approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000) if [ "${approval_line}" != "" ]; then APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32410583 12538138 6836917) echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" @@ -2905,11 +2905,11 @@ function summary_check_problems() { function reuse_so_cache() { get_html="https://api.github.com/repos/PaddlePaddle/Paddle" - curl -X GET ${get_html}/commits >tmp.txt + curl -X GET ${get_html}/commits -H "authorization: token ${GITHUB_API_TOKEN}" >tmp.txt merge_commit=`grep "sha" tmp.txt| awk -F \" 'NR==1{print $(NF-1)}'| sed 's# ##g'` - curl -X GET ${get_html}/commits/${merge_commit} >tmp.txt + curl -X GET ${get_html}/commits/${merge_commit} -H "authorization: token ${GITHUB_API_TOKEN}" >tmp.txt merge_pr=`grep -oP -m 1 '(#[0-9]*)' tmp.txt| sed 's/#//g'` - curl -X GET ${get_html}/pulls/${merge_pr}/commits >tmp.txt + curl -X GET ${get_html}/pulls/${merge_pr}/commits -H "authorization: token ${GITHUB_API_TOKEN}" >tmp.txt pr_commit=`grep "sha" tmp.txt |tail -3|head -1|awk -F : '{print $NF}'|sed 's#"##g'|sed 's#,##g'| sed 's# ##g'` set +e wget -q https://xly-devops.bj.bcebos.com/PR/Paddle/${merge_pr}/${pr_commit}/workspace/Paddle/build/proto_so.tar.gz @@ -3003,7 +3003,7 @@ function check_coverage_build() { set +x if [ ${diff_coverage_build_size} -gt 3 ]; then - approval_line=`curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` + approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 29832297 6836917 43953930` echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "FALSE" ]; then diff --git a/python/paddle/__init__.py 
b/python/paddle/__init__.py index cb0135d9b4c29..8c2ec1acf072a 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -269,6 +269,7 @@ from .tensor.math import fmin # noqa: F401 from .tensor.math import inner # noqa: F401 from .tensor.math import outer # noqa: F401 +from .tensor.math import heaviside # noqa: F401 from .tensor.math import frac # noqa: F401 from .tensor.random import bernoulli # noqa: F401 @@ -635,4 +636,5 @@ 'renorm', 'take_along_axis', 'put_along_axis', + 'heaviside', ] diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index 5132f23079f1f..96a94d898467f 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -54,25 +54,25 @@ def auto_cast(enable=True, with paddle.amp.auto_cast(): conv = conv2d(data) - print(conv.dtype) # FP16 + print(conv.dtype) # paddle.float32 with paddle.amp.auto_cast(enable=False): conv = conv2d(data) - print(conv.dtype) # FP32 + print(conv.dtype) # paddle.float32 with paddle.amp.auto_cast(custom_black_list={'conv2d'}): conv = conv2d(data) - print(conv.dtype) # FP32 + print(conv.dtype) # paddle.float32 a = paddle.rand([2,3]) b = paddle.rand([2,3]) with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}): c = a + b - print(c.dtype) # FP16 + print(c.dtype) # paddle.float32 with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}, level='O2'): d = a + b - print(d.dtype) # FP16 + print(d.dtype) # paddle.float32 """ return amp_guard(enable, custom_white_list, custom_black_list, level, dtype) diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 408a1fdaafeef..8c286c02015bf 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -123,6 +123,19 @@ def merge_process_mesh_two(pm1, pm2): return merged_process_mesh +def _validate_dims_mapping(dims_mapping, process_mesh): + if dims_mapping is None: + return False + for i in range(len(dims_mapping)): + if dims_mapping[i] < -1 or dims_mapping[i] >= len( + process_mesh.topology): + return False + for i in range(len(process_mesh.topology)): + if dims_mapping.count(i) > 1: + return False + return True + + class Completer: def __init__(self, dist_context): assert dist_context is not None @@ -161,6 +174,9 @@ def _update_tensor_node_dims_mapping(self, tensor_node, fwd=True): dims_mapping_list.append(tensor_dims_mapping) compatible_dims_mapping = compute_compatible_dims_mapping( dims_mapping_list) + if not _validate_dims_mapping(compatible_dims_mapping, + tensor_dist_attr.process_mesh): + return False if (compatible_dims_mapping is not None) and \ (compatible_dims_mapping != tensor_dims_mapping): tensor_dist_attr.dims_mapping = compatible_dims_mapping @@ -182,6 +198,9 @@ def _update_tensor_node_dims_mapping(self, tensor_node, fwd=True): dims_mapping_list.append(tensor_dims_mapping) compatible_dims_mapping = compute_compatible_dims_mapping( dims_mapping_list) + if not _validate_dims_mapping(compatible_dims_mapping, + tensor_dist_attr.process_mesh): + return False if (compatible_dims_mapping is not None) and \ (compatible_dims_mapping != tensor_dims_mapping): tensor_dist_attr.dims_mapping = compatible_dims_mapping @@ -196,10 +215,12 @@ def _update_op_node_dims_mapping(self, op_node, fwd=True): op_desc = op_node.op() if op_desc.type() == "create_py_reader" \ or op_desc.type() == "create_double_buffer_reader" \ + or op_desc.type() == "while" \ or op_desc.type() == "read": return False dist_op 
= self._dist_context.get_dist_op_for_graph(op_node) op_dist_attr = dist_op.dist_attr + original_op_dist_attr = copy.deepcopy(op_dist_attr) if fwd: for tensor_node in op_node.inputs: if tensor_node.is_var() and tensor_node.var() is not None: @@ -223,18 +244,34 @@ def _update_op_node_dims_mapping(self, op_node, fwd=True): tensor_desc.name(), compatible_dims_mapping) changed = True # Find the most compatible implemenetations from the distributed operator - op_dist_impl = find_best_compatible_distributed_operator_impl( + op_dist_impls = find_best_compatible_distributed_operator_impl( dist_op, fwd=True) - if op_dist_impl is not None: - dim_changed = op_dist_impl.update_dims_mapping(dist_op) - if dim_changed: - changed = True - if op_dist_impl.is_auto_compatible(dist_op): - if op_dist_impl.type == "elementwise": - op_dist_attr.impl_type = "default" + if op_dist_impls is not None: + not_compatible = True + backup_op_dist_attr = copy.deepcopy(op_dist_attr) + backup_changed = changed + for op_dist_impl in op_dist_impls: + dim_changed = op_dist_impl.update_dims_mapping(dist_op) + if dim_changed: + changed = True + if op_dist_impl.is_auto_compatible(dist_op): + if op_dist_impl.type == "elementwise": + op_dist_attr.impl_type = "default" + else: + op_dist_attr.impl_type = op_dist_impl.type + # op_dist_attr.impl_type = op_dist_impl.type + op_dist_attr.impl_idx = op_dist_impl.idx + not_compatible = False + break else: - op_dist_attr.impl_type = op_dist_impl.type - op_dist_attr.impl_idx = op_dist_impl.idx + dist_op.dist_attr = backup_op_dist_attr + changed = backup_changed + if not_compatible: + dist_op.dist_attr = original_op_dist_attr + changed = False + else: + dist_op.dist_attr = original_op_dist_attr + changed = False else: for tensor_node in op_node.outputs: if tensor_node.is_var() and tensor_node.var() is not None: @@ -258,18 +295,35 @@ def _update_op_node_dims_mapping(self, op_node, fwd=True): tensor_desc.name(), compatible_dims_mapping) changed = True # Find the most compatible implemenetations from the distributed operator - op_dist_impl = find_best_compatible_distributed_operator_impl( + op_dist_impls = find_best_compatible_distributed_operator_impl( dist_op, fwd=False) - if op_dist_impl is not None: - dim_changed = op_dist_impl.update_dims_mapping(dist_op) - if dim_changed: - changed = True - if op_dist_impl.is_auto_compatible(dist_op): - if op_dist_impl.type == "elementwise": - op_dist_attr.impl_type = "default" + if op_dist_impls is not None: + not_compatible = True + backup_op_dist_attr = copy.deepcopy(op_dist_attr) + backup_changed = changed + for op_dist_impl in op_dist_impls: + dim_changed = op_dist_impl.update_dims_mapping(dist_op) + if dim_changed: + changed = True + if op_dist_impl.is_auto_compatible(dist_op): + not_compatible = False + if op_dist_impl.type == "elementwise": + op_dist_attr.impl_type = "default" + else: + op_dist_attr.impl_type = op_dist_impl.type + # op_dist_attr.impl_type = op_dist_impl.type + op_dist_attr.impl_idx = op_dist_impl.idx + not_compatible = False + break else: - op_dist_attr.impl_type = op_dist_impl.type - op_dist_attr.impl_idx = op_dist_impl.idx + dist_op.dist_attr = backup_op_dist_attr + changed = backup_changed + if not_compatible: + dist_op.dist_attr = original_op_dist_attr + changed = False + else: + dist_op.dist_attr = original_op_dist_attr + changed = False return changed def _update_dims_mapping_between_graphs(self): @@ -279,17 +333,22 @@ def _update_dims_mapping_between_graphs(self): parent_node) child_node_dist_attr = 
self._dist_context.get_dist_attr_for_graph( child_node) + if parent_node_dist_attr.process_mesh != child_node_dist_attr.process_mesh: + continue parent_node_dims_mapping = parent_node_dist_attr.dims_mapping child_node_dims_mapping = child_node_dist_attr.dims_mapping compatible_dims_mapping = compute_compatible_dims_mapping( [parent_node_dims_mapping, child_node_dims_mapping]) + if not _validate_dims_mapping(compatible_dims_mapping, + parent_node_dist_attr.process_mesh): + return False if (compatible_dims_mapping is not None) \ and (compatible_dims_mapping != parent_node_dims_mapping): parent_node_dist_attr.dims_mapping = compatible_dims_mapping changed = True if (compatible_dims_mapping is not None) \ and (compatible_dims_mapping != child_node_dims_mapping): - parent_node_dist_attr.dims_mapping = compatible_dims_mapping + child_node_dist_attr.dims_mapping = compatible_dims_mapping changed = True return changed @@ -351,7 +410,7 @@ def _update_process_mesh_by_nearest(self, op_node, nearest_op_node): if compatible_process_mesh is not None \ and tensor_dist_attr.process_mesh != compatible_process_mesh: tensor_dist_attr.process_mesh = compatible_process_mesh - # Set the process mesh of the op node's outputs + # Set the process mesh of the op node's outputs for tensor_node in op_node.outputs: if tensor_node.is_var() and tensor_node.var() is not None: tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( @@ -389,7 +448,8 @@ def _find_nodes_related_to_cond(source_node): if _node_id(cur) in visited: continue # TODO: need more restrictions - for node in cur.inputs: + neighbors = cur.inputs + cur.outputs + for node in neighbors: if node.is_var() and node.var() is not None: if node.var().type() != core.VarDesc.VarType.READER \ and len(node.var().shape()) == 1: @@ -421,10 +481,29 @@ def _find_nodes_related_to_cond(source_node): visited.add(_node_id(cur)) return related_nodes + def _make_dims_mapping_replicate(dist_attr): + if isinstance(dist_attr, TensorDistributedAttribute): + for i, _ in enumerate(dist_attr.dims_mapping): + dist_attr.dims_mapping[i] = -1 + if isinstance(dist_attr, OperatorDistributedAttribute): + for arg_name in dist_attr.inputs_dist_attrs.keys(): + new_dims_mapping = [] + dims_mapping = dist_attr.get_input_dims_mapping(arg_name) + for _ in dims_mapping: + new_dims_mapping.append(-1) + dist_attr.set_input_dims_mapping(arg_name, new_dims_mapping) + for arg_name in dist_attr.outputs_dist_attrs.keys(): + new_dims_mapping = [] + dims_mapping = dist_attr.get_output_dims_mapping(arg_name) + for _ in dims_mapping: + new_dims_mapping.append(-1) + dist_attr.set_output_dims_mapping(arg_name, + new_dims_mapping) + # Amend the process meshes related to while_op for while_op_node, while_op_node_idx in self._while_op_nodes.values(): sub_graph_id = while_op_node.op()._block_attr_id("sub_block") - sub_graph = self._dist_context._serial_graph.get_sub_graph( + sub_graph = self._dist_context.serial_graph.get_sub_graph( sub_graph_id) sub_graph_nodes = list(sub_graph.all_nodes()) while_dist_op = self._dist_context.get_dist_op_for_graph( @@ -440,6 +519,7 @@ def _find_nodes_related_to_cond(source_node): merged_process_mesh = merge_process_mesh_two( merged_process_mesh, dist_attr.process_mesh) while_op_dist_attr.process_mesh = merged_process_mesh + _make_dims_mapping_replicate(while_op_dist_attr) # Step 2: set the related nodes of while_op to the process mesh of while_op # Step 2.1: Find related nodes of cond var the graph of while_op @@ -480,6 +560,7 @@ def 
_find_nodes_related_to_cond(source_node): tensor_dist_attr = self._dist_context.get_dist_attr_for_graph( node) tensor_dist_attr.process_mesh = merged_process_mesh + _make_dims_mapping_replicate(tensor_dist_attr) # Step 3: set the process meshes of the inputs in while_op to the process meshes of the outside input nodes while_op_inputs_dist_attrs = while_op_dist_attr.inputs_dist_attrs @@ -519,6 +600,25 @@ def _find_nodes_related_to_cond(source_node): dist_attr = self._dist_context.get_dist_attr_for_graph( array_node) dist_attr.process_mesh = merged_process_mesh + _make_dims_mapping_replicate(dist_attr) + + def _update_process_mesh_between_graphs(self): + for parent_node, child_node in self._node_pairs_between_graphs: + parent_node_dist_attr = self._dist_context.get_dist_attr_for_graph( + parent_node) + child_node_dist_attr = self._dist_context.get_dist_attr_for_graph( + child_node) + parent_node_dist_attr.process_mesh = child_node_dist_attr.process_mesh + compatible_process_mesh = compute_compatible_process_mesh([ + parent_node_dist_attr.process_mesh, + child_node_dist_attr.process_mesh + ]) + if compatible_process_mesh is not None \ + and parent_node_dist_attr.process_mesh != compatible_process_mesh: + parent_node_dist_attr.process_mesh = compatible_process_mesh + if compatible_process_mesh is not None \ + and child_node_dist_attr.process_mesh != compatible_process_mesh: + child_node_dist_attr.process_mesh = compatible_process_mesh def _update_process_mesh(self): ordered_op_nodes = self._dist_context._serial_ordered_op_nodes @@ -569,7 +669,7 @@ def _update_process_mesh(self): return None for idx, op_node in enumerate(ordered_op_nodes[ idx_of_first_op_node_has_process_mesh + 1:]): - original_idx = idx_of_first_op_node_has_process_mesh + +idx + 1 + original_idx = idx_of_first_op_node_has_process_mesh + idx + 1 nearest_op_node = ordered_op_nodes[original_idx - 1] nearest_op_dist_attr = self._dist_context.get_dist_attr_for_graph( nearest_op_node) @@ -585,6 +685,9 @@ def _update_process_mesh(self): # Step 3: adjust the process meshes for special ops self._update_process_mesh_for_specials() + # Step 4: adjust the process meshes between graphs + self._update_process_mesh_between_graphs() + def _prepare(self): self._while_op_nodes = {} self._array_nodes = {} @@ -620,7 +723,7 @@ def _prepare(self): self._node_pairs_between_graphs.append( (after_node, node)) - def complete_forward_annotation(self, serial_main_program): + def complete_forward_annotation(self, serial_main_program=None): """ Complete annotation for the partial annotated serial_main_program. Arguments: serial_main_program: partial annotated serial_main_program. @@ -628,15 +731,12 @@ def complete_forward_annotation(self, serial_main_program): serial_main_program: completed annotated serial_main_program. 
""" - # Use the default distribted context for completeion if there is no one - self._dist_context.serial_program = serial_main_program - - # Initialize distributed attributes for all var and op node in serial_main_program - self._dist_context.init_dist_attr_for_program() - # print_program_with_dist_attr(serial_main_program, self._dist_context) + if serial_main_program is None: + serial_main_program = self._dist_context.serial_main_program + else: + self._dist_context.serial_main_program = serial_main_program - # Initialize distributed attributes for all var and op node in graph - self._dist_context.init_dist_attr_for_graph() + self._dist_context.initialize() self._prepare() @@ -646,10 +746,9 @@ def complete_forward_annotation(self, serial_main_program): # Copy the corresponding distributed attribute from graph to serial_main_program self._dist_context.copy_dist_attr_from_graph_to_program() - self._dist_context.clear_dist_info_for_graph() # NOTE:[HighOrderGrad] update vars and ops distributed attribute in high order gradient - self.complete_high_order_grad_annotation(serial_main_program) + self._complete_high_order_grad_annotation(serial_main_program) # Do the validation check and amend some completion self._dist_context.amend_dist_attr_for_program() @@ -658,7 +757,7 @@ def complete_forward_annotation(self, serial_main_program): return serial_main_program - def complete_high_order_grad_annotation(self, serial_main_program): + def _complete_high_order_grad_annotation(self, serial_main_program): """ NOTE: [HighOrderGrad] Complete the annotation of vars and ops only for high order gradient. @@ -818,6 +917,10 @@ def _get_op_by_id(ops, id): def complete_backward_annotation(self, serial_main_program): """Complete the annotation of vars and ops in the backward phase for parallel program.""" + if serial_main_program is None: + serial_main_program = self._dist_context.serial_main_program + else: + self._dist_context.serial_main_program = serial_main_program def _is_grad_var_name(name): if "@GRAD" in name: @@ -1036,8 +1139,12 @@ def _get_op_by_id(ops, id): self._dist_context.set_op_dist_attr_for_program( grad_op, grad_op_dist_attr) - def complete_update_annotation(self, serial_main_program): + def complete_update_annotation(self, serial_main_program=None): """Complete the annotation of vars and ops in the update phase for parallel program.""" + if serial_main_program is None: + serial_main_program = self._dist_context.serial_main_program + else: + self._dist_context.serial_main_program = serial_main_program ops = list(serial_main_program.global_block().ops) vars = serial_main_program.global_block().vars learning_rate_completed = False diff --git a/python/paddle/distributed/auto_parallel/cost/__init__.py b/python/paddle/distributed/auto_parallel/cost/__init__.py index 7bc8a81b79f8e..9ea58d6979527 100644 --- a/python/paddle/distributed/auto_parallel/cost/__init__.py +++ b/python/paddle/distributed/auto_parallel/cost/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License -from .base_cost import OP_COST_FACTORY +from .base_cost import _g_op_cost_factory from .base_cost import Cost from .comm_op_cost import AllreduceSumCost from .comp_op_cost import MatmulV2OpCost diff --git a/python/paddle/distributed/auto_parallel/cost/base_cost.py b/python/paddle/distributed/auto_parallel/cost/base_cost.py index c4ebd836129e2..cb16d522bc9e3 100644 --- a/python/paddle/distributed/auto_parallel/cost/base_cost.py +++ 
b/python/paddle/distributed/auto_parallel/cost/base_cost.py @@ -19,7 +19,7 @@ "send_v2", "recv_v2", "c_broadcast", "c_allgather", "c_allreduce_sum" ] NON_COMP_TYPE = ["while"] + COMM_OP_TYPE -OP_COST_FACTORY = {} +_g_op_cost_factory = {} def _parse_op_to_desc(op, dist_context=None): @@ -126,66 +126,136 @@ class CommContext: _instance = None _has_instance = False - def __init__(self, cluster): - if CommContext._has_instance: - return - self.cluster = cluster - self._alpha_base_ring = 8.4 - self._alpha_base_tree = 0 - self._alpha_inter = None - self._alpha_intra - self._beta = {} - def __new__(cls, *args, **kwargs): if cls._instance is None: - cls._instance = super().__new__(cls, *args, **kwargs) + cls._instance = super().__new__(cls) _has_instance = True return cls._instance - @property - def alpha_inter(self): - if self._alpha_inter is None: - if cluster.alpha.inter == "NVL": - self._alpha_inter = 3.4 - elif cluster.alpha.inter == "PHB": - self._alpha_inter = 5.7 - return self._alpha_inter - - @property - def alpha_intra(self): - if self._alpha_intra is None: - if cluster.alpha.intra == "NVL": - self._alpha_intra = 28 - elif cluster.alpha.intra == "PHB": - self._alpha_intra = 28 - return self._alpha_intra - - @property - def alpha_base_ring(self): - return self._alpha_base_ring - - @property - def alpha_base_tree(self): - return self._alpha_base_tree - - def get_beta(self, ranks): + def __init__(self, cluster): + if CommContext._has_instance: + return + self.beta = {} + self.hops = {} + self.cluster = cluster + # if cluster has no info about those vars, it will be set by default + self.base_ring = None + self.base_tree = None + # self.base_inter_ring = None + # self.base_inter_tree = None + self.intra_ring = None + self.intra_tree = None + self.inter_ring = None + self.inter_tree = None + self.switch = None + self._post_init() + + def _post_init(self): + alpha_latency = self.cluster.alpha_latency + if alpha_latency is None: + # set default + self.base_ring = 8.4 + self.base_tree = 0. + # NVL in default + self.intra_ring = 3.4 + self.intra_tree = 28 + # NET in default + self.inter_ring = 9.6 + self.inter_tree = 28 + self.switch = 10.0 + else: + base_ring = alpha_latency.base_ring + self.base_ring = base_ring if base_ring is not None else 8.4 + + base_tree = alpha_latency.base_tree + self.base_tree = base_tree if base_tree is not None else 0. 
+ + intra_ring = alpha_latency.intra_ring + if intra_ring == LinkType.NVL: + self.intra_ring = 3.4 + elif intra_ring == LinkType.PHB: + self.intra_ring = 5.7 + elif intra_ring is not None: + self.intra_ring = intra_ring + else: + # NVL Default + self.intra_ring = 3.4 + + intra_tree = alpha_latency.intra_tree + if intra_tree == LinkType.NVL: + self.intra_tree = 28 + elif intra_tree == LinkType.PHB: + self.intra_tree = 28 + elif intra_tree is not None: + self.intra_tree = intra_tree + else: + # NVL Default + self.intra_tree = 28 + + inter_ring = alpha_latency.inter_ring + if inter_ring == LinkType.NET: + self.inter_ring = 9.6 + elif inter_ring is not None: + self.inter_ring = inter_ring + else: + # NET Default + self.inter_ring = 9.6 + + inter_tree = alpha_latency.inter_tree + if inter_tree == LinkType.NET: + self.inter_tree = 28 + elif inter_tree is not None: + self.inter_tree = inter_tree + else: + # NET Default + self.inter_tree = 28 + + switch = alpha_latency.switch + self.switch = switch if switch is not None else 10 + + assert self.base_ring is not None + assert self.base_tree is not None + assert self.intra_ring is not None + assert self.intra_tree is not None + assert self.inter_ring is not None + assert self.inter_tree is not None + assert self.switch is not None + + def get_max_beta(self, ranks): + # NOTE: Get beta by ring, even in the case of tree such as tree broadcast + ranks = self.cluster.convert_rank_to_device_id(ranks) key = ','.join(map(str, sorted(ranks))) max_beta = None - if key in self._beta.keys: - max_beta = self._beta[key] + if key in self.beta: + max_beta = self.beta[key] else: for i in range(len(ranks)): for j in range(i + 1, len(ranks)): - if min_beta == None: - min_beta = cluster.get_beta(ranks[i], ranks[j]) + forward_order_beta = self.cluster.get_beta(ranks[i], + ranks[j]) + backward_order_beta = self.cluster.get_beta(ranks[j], + ranks[i]) + beta = forward_order_beta if forward_order_beta > backward_order_beta else backward_order_beta + if max_beta == None: + max_beta = beta else: - beta = cluster.get_beta(ranks[i], ranks[j]) if beta > max_beta: max_beta = beta - self._beta[key] = max_beta + self.beta[key] = max_beta return max_beta + def get_hops(self, ranks): + key = ','.join(map(str, sorted(ranks))) + hops = 0 + for i in range(len(ranks)): + for j in range(i + 1, len(ranks)): + hop = self.cluster.get_hop(ranks[i], ranks[j]) + hops += hop + self.hops[key] = hops + + return hops + class Cost: def __init__(self, time=0, memory=0, flops=0): @@ -198,11 +268,13 @@ def _check_time(self, val): def _check_memory(self, val): assert isinstance( - val, int) and val >= 0, "Memory must be int and greater than 0." + val, + int) and val >= 0, "Memory must be int and greater than equal to 0." def _check_flops(self, val): assert isinstance( - val, int) and val >= 0, "FLOPs must be int and greater than 0." + val, + int) and val >= 0, "FLOPs must be int and greater than equal to 0." 
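get_max_beta and get_hops above reduce a communication group to two scalars: the worst pairwise beta over all rank pairs, taking whichever direction is slower, and the summed pairwise hop count. A small Python sketch of those two reductions over a stub cluster is shown below; the dict-based cluster, the caching left out, and the rank-to-device-id conversion left out are all simplifications, not the real Cluster API.

from itertools import combinations

# Stub "cluster": beta[(i, j)] is the cost of sending from rank i to rank j,
# hops[(i, j)] the hop count; the two directions may differ.
beta = {(0, 1): 1.0, (1, 0): 1.5, (0, 2): 2.0, (2, 0): 2.0, (1, 2): 0.5, (2, 1): 0.7}
hops = {(0, 1): 1, (1, 0): 1, (0, 2): 3, (2, 0): 3, (1, 2): 2, (2, 1): 2}

def get_max_beta(ranks):
    # slower direction for every pair, then the worst pair overall
    return max(max(beta[(i, j)], beta[(j, i)]) for i, j in combinations(ranks, 2))

def get_hops(ranks):
    return sum(hops[(i, j)] for i, j in combinations(ranks, 2))

print(get_max_beta([0, 1, 2]))  # 2.0, the 0<->2 pair dominates
print(get_hops([0, 1, 2]))      # 6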
@property def time(self): @@ -254,7 +326,7 @@ def __init__(self, op=None, op_desc=None): op_desc is not None) self._op = op self._op_desc = op_desc - self._cost = self.calc_cost() + self._cost = None @property def op(self): @@ -264,6 +336,18 @@ def op(self): def op_desc(self): return self._op_desc + @property + def time(self): + return self.cost.time + + @property + def memory(self): + return self.cost.memory + + @property + def flops(self): + return self.cost.flops + @property def cost(self): return self._cost @@ -284,6 +368,40 @@ def calc_cost(self): cost = Cost(time, memory, flops) return cost + def __add__(self, rhs): + assert isinstance(rhs, (OpCost, Cost)) + time = 0 + memory = 0 + flops = 0 + if isinstance(rhs, OpCost): + time = self.cost.time + rhs.cost.time + memory = self.cost.memory + rhs.cost.memory + flops = self.cost.flops + rhs.cost.flops + assert (time >= 0 and memory >= 0 and flops >= 0) + elif isinstance(rhs, Cost): + time = self.time + rhs.time + memory = self.memory + rhs.memory + flops = self.flops + rhs.flops + assert (time >= 0 and memory >= 0 and flops >= 0) + return Cost(time, memory, flops) + + def __sub__(self, rhs): + assert isinstance(rhs, (OpCost, Cost)) + time = 0 + memory = 0 + flops = 0 + if isinstance(rhs, OpCost): + time = self.cost.time - rhs.cost.time + memory = self.cost.memory - rhs.cost.memory + flops = self.cost.flops - rhs.cost.flops + assert (time >= 0 and memory >= 0 and flops >= 0) + elif isinstance(rhs, Cost): + time = self.time - rhs.time + memory = self.memory - rhs.memory + flops = self.flops - rhs.flops + assert (time >= 0 and memory >= 0 and flops >= 0) + return Cost(time, memory, flops) + class CommOpCost(OpCost): OP_TYPE = "COMM" @@ -292,11 +410,83 @@ def __init__(self, op=None, op_desc=None, comm_context=None): super(CommOpCost, self).__init__(op=op, op_desc=op_desc) self._check_comm_op_type() self._comm_context = comm_context + self._group_ranks = None + self._comm_count = None + self._hops = None + self._rank_count = len(self.group_ranks) + self._machine_count = None + self._cost = self.calc_cost() @property def comm_context(self): return self._comm_context + @property + def comm_count(self): + if self._comm_count is None: + dtype = None + shape = None + if self.op is not None: + vars = self.op.block.vars + # NOTE: The tensor communicated input_name is "X" in default. 
Otherwise, this function should be overrided + var_name = self.op.input("X")[0] + var = vars[var_name] + dtype = var.dtype + shape = var.shape + elif self.op_desc is not None: + dtype = self.op_desc["inputs"]["X"][0][0] + shape = self.op_desc["inputs"]["X"][0][1] + + factor = None + if dtype == paddle.float32 or dtype == paddle.int32: + factor = 4 + elif dtype == paddle.int64: + factor = 8 + elif dtype == paddle.uint8: + factor = 1 + elif dtype == paddle.float16: + factor = 2 + else: + raise TypeError("This dtype {} is not supported now".format( + dtype)) + comm_count = reduce(lambda x, y: x * y, shape) * factor + self._comm_count = comm_count + + return self._comm_count + + @property + def rank_count(self): + return self._rank_count + + @property + def machine_count(self): + if self._machine_count is None: + cluster = self._comm_context.cluster + self._machine_count = cluster.get_involved_machine_count( + self.group_ranks) + return self._machine_count + + @property + def hops(self): + if self._hops is None: + self._hops = self.comm_context.get_hops(self.group_ranks) + return self._hops + + @property + def group_ranks(self): + if self._group_ranks is None: + if self.op_desc is not None: + self._group_ranks = self.op_desc["group_ranks"] + elif self.op is not None: + ring_id = op.attrs("ring_id") + process_group = get_process_group(ring_id) + if process_group is None: + raise ValueError( + "There not exists process group whose ring_id is {}.". + format(ring_id)) + self._group_ranks = process_group.ranks + return self._group_ranks + @classmethod def _check_comm_op_type(cls): if cls.OP_TYPE != "COMM": @@ -311,6 +501,7 @@ class CompOpCost(OpCost): def __init__(self, op=None, op_desc=None, cluster=None): super(CompOpCost, self).__init__(op=op, op_desc=op_desc) self._check_comp_op_type() + self._cost = self.calc_cost() self.cluster = cluster @classmethod @@ -325,18 +516,22 @@ def register_op_cost(cls): op_type = cls.OP_TYPE def register(op_type): - OP_COST_FACTORY[op_type] = cls + global _g_op_cost_factory + _g_op_cost_factory[op_type] = cls - return register(op_type) + register(op_type) + return cls -def calc_time_from_model(op=None, desc=None, cluster=None, comm_context=None): +def calc_time_by_modeling(op=None, desc=None, cluster=None): op_type = op.type if op is not None else desc["op"] if op_type in COMM_OP_TYPE: - op_cost = OP_COST_FACTORY[op_type](op=op, - op_desc=desc, - comm_context=comm_context) + op_cost = _g_op_cost_factory[op_type](op=op, + op_desc=desc, + comm_context=CommContext(cluster)) elif op_type not in NON_COMP_TYPE: - op_cost = OP_COST_FACTORY[op_type](op=op, op_desc=desc, cluster=cluster) + op_cost = _g_op_cost_factory[op_type](op=op, + op_desc=desc, + cluster=cluster) time = op_cost.calc_time() return time diff --git a/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py b/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py index 359f6b6e7862c..235741ba12f4f 100644 --- a/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
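The rewritten register_op_cost stores the class in the module-level _g_op_cost_factory and then returns the class itself. The previous version returned the result of the inner register call, i.e. None, which rebound the decorated class name to None and is why the subclasses below had to write super(OP_COST_FACTORY["c_allreduce_sum"], self) instead of the ordinary super(AllreduceSumCost, self). A minimal stand-alone sketch of the corrected decorator (simplified: the inner register helper is dropped):

_g_op_cost_factory = {}

def register_op_cost(cls):
    # Record the class in the factory keyed by OP_TYPE and hand the class
    # back unchanged, so the decorated name keeps referring to it.
    _g_op_cost_factory[cls.OP_TYPE] = cls
    return cls

@register_op_cost
class AllreduceSumCost:
    OP_TYPE = "c_allreduce_sum"

print(_g_op_cost_factory["c_allreduce_sum"] is AllreduceSumCost)  # True
print(AllreduceSumCost is not None)  # the old decorator left this name as None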
@@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License -from .base_cost import register_op_cost, CommOpCost, OP_COST_FACTORY +from .base_cost import register_op_cost, CommOpCost @register_op_cost @@ -20,7 +20,7 @@ class AllreduceSumCost(CommOpCost): OP_TYPE = "c_allreduce_sum" def __init__(self, op=None, op_desc=None, comm_context=None): - super(OP_COST_FACTORY["c_allreduce_sum"], self).__init__( + super(AllreduceSumCost, self).__init__( op=op, op_desc=op_desc, comm_context=comm_context) def calc_time(self): diff --git a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py index c4d88cb25dc1e..067ad48028d82 100644 --- a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License -from .base_cost import Cost, register_op_cost, CompOpCost, OP_COST_FACTORY +from .base_cost import Cost, register_op_cost, CompOpCost @register_op_cost @@ -20,7 +20,7 @@ class MatmulV2OpCost(CompOpCost): OP_TYPE = "matmul_v2" def __init__(self, op=None, op_desc=None, cluster=None): - super(OP_COST_FACTORY["matmul_v2"], self).__init__( + super(MatmulV2OpCost, self).__init__( op=op, op_desc=op_desc, cluster=cluster) # For a concrete COMP OP, the calc_time and calc_flops function needs to be overrided diff --git a/python/paddle/distributed/auto_parallel/dist_attribute.py b/python/paddle/distributed/auto_parallel/dist_attribute.py index 8ec702ffcb0b6..6fa5b756c75c3 100644 --- a/python/paddle/distributed/auto_parallel/dist_attribute.py +++ b/python/paddle/distributed/auto_parallel/dist_attribute.py @@ -52,7 +52,7 @@ def append_op_output_suffix(name): class TensorDistributedAttribute: def __init__(self): - # The process mesh of distributed operator attribute must is the same as + # The process mesh of distributed operator attribute must is the same as # the process meshes of all input and output distributed attributed self._process_mesh = None self._dims_mapping = None @@ -132,12 +132,29 @@ def init(self, dist_attr): key, dist_attr) self._is_annotated = copy.deepcopy(dist_attr._is_annotated) + # def reset(self, skip_dist_attr_field_names): + # if skip_dist_attr_field_names is not None \ + # and "process_mesh" not in skip_dist_attr_field_names: + # self._process_mesh = None + # if skip_dist_attr_field_names is not None \ + # and "dims_mapping" not in skip_dist_attr_field_names: + # for i in enumerate(self._dims_mapping): + # self._dims_mapping[i] = -1 + # self._is_annotated = {} + def is_annotated(self, dist_attr_field_name): return self._is_annotated.get(dist_attr_field_name, False) + # def mark_annotated_all(self): + # for key in get_tensor_dist_attr_field_keys(): + # self.mark_annotated(key) + def mark_annotated(self, dist_attr_field_name): self._is_annotated[dist_attr_field_name] = True + # def unmark_annotated(self, dist_attr_field_name): + # self._is_annotated[dist_attr_field_name] = False + def mark_annotated_as(self, dist_attr): if dist_attr is None: return @@ -195,7 +212,7 @@ def process_mesh(self, process_mesh): if 
isinstance(process_mesh, list): process_mesh = ProcessMesh(process_mesh) self._process_mesh = copy.deepcopy(process_mesh) - # In while op, the proess mesh is not shared by all inputs and outputs + # In while op, the proess mesh is not shared by all inputs and outputs if self._op_type == "while": return None for dist_attr in self._inputs_dist_attrs.values(): @@ -357,9 +374,25 @@ def init(self, dist_attr): "ProcessMeshes in DistributedOperator must be the same." self.process_mesh = shared_process_mesh + # def reset(self, skip_dist_attr_field_names): + # for tensor_dist_attr in self.inputs_dist_attrs.values(): + # tensor_dist_attr.reset(skip_dist_attr_field_names) + # for tensor_dist_attr in self.outputs_dist_attrs.values(): + # tensor_dist_attr.reset(skip_dist_attr_field_names) + # if skip_dist_attr_field_names is not None \ + # and "process_mesh" not in skip_dist_attr_field_names: + # self.process_mesh = None + # self.impl_type = "default" + # self.impl_idx = 0 + # self._is_annotated = {} + def is_annotated(self, attr_name): return self._is_annotated.get(attr_name, False) + # def mark_annotated_all(self): + # for key in get_op_dist_attr_field_keys(): + # self.mark_annotated(key) + def mark_annotated(self, attr_name): if attr_name == "process_mesh": # Make sure proscess_mesh be annotated consistently @@ -452,10 +485,10 @@ def __str__(self): self.process_mesh) for arg_name, tensor_dist_attr in self.inputs_dist_attrs.items(): - str += "\n\t\t{}'s: {},".format(arg_name, tensor_dist_attr) + str += "\n\t\t{}'s (input): {},".format(arg_name, tensor_dist_attr) for arg_name, tensor_dist_attr in self.outputs_dist_attrs.items(): - str += "\n\t\t{}'s: {},".format(arg_name, tensor_dist_attr) + str += "\n\t\t{}'s (output): {},".format(arg_name, tensor_dist_attr) str += "\n\t\timpl type: {}, ".format(self._impl_type) str += "impl idx: {}".format(self._impl_idx) diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index 7e245358d4bcc..f9d77a0077c56 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -14,9 +14,11 @@ import copy from collections import defaultdict +import paddle.fluid from paddle.fluid import framework from paddle.fluid.framework import get_flags, set_flags from paddle.fluid import core +from paddle.distributed.passes import PassContext from .dist_attribute import TensorDistributedAttribute from .dist_attribute import OperatorDistributedAttribute from .dist_tensor import DistributedTensor @@ -53,49 +55,105 @@ class DistributedContext: def __init__(self, serial_main_prog=None, serial_startup_prog=None, - dist_main_progs=None, - dist_startup_progs=None): - # Program related data members - self._serial_program = serial_main_prog - self._is_initialized_for_program = False + serial_optimizer=None, + serial_loss=None, + feed_vars=None, + fetch_vars=None, + strategy=None): + # Data members related to original programs (unchanged) + self._original_serial_main_program = serial_main_prog + self._original_serial_startup_program = serial_startup_prog + self._original_serial_loss = serial_loss + self._original_serial_optimizer = serial_optimizer + if self._original_serial_main_program is None: + self._original_serial_main_program = paddle.fluid.default_main_program( + ) + if self._original_serial_startup_program is None: + self._original_serial_startup_program = paddle.fluid.default_startup_program( + ) + + # Data members related to programs (changed) 
+ self._serial_main_program = None + self._serial_startup_program = None + self._serial_loss = serial_loss + self._serial_optimizer = serial_optimizer + self._serial_feed_vars = feed_vars + self._serial_fetch_vars = fetch_vars + + # Data members related to the program self._dist_tensors_for_program = {} self._dist_ops_for_program = {} self._block_state = BlockState() - # Graph related data members - self._is_initialized_for_graph = False + + # Data members related to the graph self._serial_graph = None self._dist_tensors_for_graph = {} self._dist_ops_for_graph = {} self._node_id_to_tensor_id = {} self._node_id_to_op_id = {} - # Other data members + + # Data members related to the distributed programs + # Distributed programs + self._dist_main_programs = {} + self._dist_startup_programs = {} + + # Distributed Strategy + self._strategy = strategy + + # Pass Context + self._pass_context = PassContext() + + # Distributed Operator Context self._dist_op_context = DistributedOperatorContext() + + # Other data members self._process_meshes = [] + self._serial_ordered_tensor_nodes = [] + self._serial_ordered_op_nodes = [] self._serial_ordered_nodes = [] - self._tensor_id_to_tensor_node_ids = {} + # self._tensor_id_to_tensor_node_ids = {} - # Distributed programs - self._dist_main_programs = dist_main_progs - if not self._dist_main_programs: - self._dist_main_programs = {} - self._dist_startup_programs = dist_startup_progs - if not self._dist_startup_programs: - self._dist_startup_programs = {} + self._is_initialized = False @property - def serial_program(self): - return self._serial_program + def serial_main_program(self): + return self._serial_main_program + + @serial_main_program.setter + def serial_main_program(self, program): + # if self._serial_main_program: + # print("WARNING: The program attached to this distributed context will be replaced by the new one.") + self._original_serial_main_program = program + self._serial_main_program = program + + @property + def serial_startup_program(self): + return self._serial_startup_program + + @property + def serial_loss(self): + return self._serial_loss + + @property + def serial_optimizer(self): + return self._serial_optimizer + + @property + def serial_feed_vars(self): + return self._serial_feed_vars + + @property + def serial_fetch_vars(self): + return self._serial_fetch_vars + + @property + def strategy(self): + return self._strategy @property def serial_graph(self): return self._serial_graph - @serial_program.setter - def serial_program(self, program): - # assert self._serial_program is None, \ - # "This distributed context has already been realted to a serial program" - self._serial_program = program - @property def serial_ordered_nodes(self): return self._serial_ordered_nodes @@ -104,6 +162,10 @@ def serial_ordered_nodes(self): def process_meshes(self): return self._process_meshes + @property + def pass_context(self): + return self._pass_context + @property def dist_op_context(self): return self._dist_op_context @@ -121,10 +183,64 @@ def dist_startup_programs(self): return self._dist_startup_programs @property - def is_annotation(self): + def has_annotation(self): return len(self._dist_tensors_for_program) or len( self._dist_ops_for_program) + def initialize(self): + if not self._is_initialized: + self._serial_main_program = self._original_serial_main_program.clone( + ) + self._serial_startup_program = self._original_serial_startup_program.clone( + ) + self._serial_main_program = self._original_serial_main_program + self._serial_startup_program = 
self._original_serial_startup_program + self._serial_loss = self._original_serial_loss + self._serial_optimizer = self._original_serial_optimizer + self._init_dist_attr_for_program() + self._tensors_ids = list(self._dist_tensors_for_program.keys()) + self._ops_ids = list(self._dist_ops_for_program.keys()) + set_flags({"FLAGS_convert_all_blocks": True}) + self._serial_graph = framework.IrGraph( + core.Graph(self._serial_main_program.desc)) + self._init_dist_attr_for_graph() + self._is_initialized = True + + # def reset(self, + # skip_dist_tensors=None, + # skip_dist_ops=None, + # skip_tensor_dist_attr_fields=None, + # skip_op_dist_attr_fields=None): + # self._serial_main_program = self._original_serial_main_program.clone() + # self._serial_startup_program = self._original_serial_startup_program.clone() + # new_tensors_ids = [] + # for tensor_id, dist_tensor in self._dist_tensors_for_program.items(): + # if tensor_id in self._tensors_ids: + # dist_tensor.dist_attr.reset(skip_tensor_dist_attr_fields) + # else: + # new_tensors_ids.append(tensor_id) + # for tensor_id in new_tensors_ids: + # self._dist_tensors_for_program.pop(tensor_id) + # new_ops_ids = [] + # for op_id, dist_op in self._dist_ops_for_program.items(): + # if op_id in self._ops_ids: + # dist_op.dist_attr.reset(skip_op_dist_attr_fields) + # else: + # new_ops_ids.append(op_id) + # for op_id in new_ops_ids: + # self._dist_ops_for_program.pop(op_id) + + # self.copy_dist_attr_from_program_to_graph() + + # self._dist_main_programs = {} + # self._dist_startup_programs = {} + + # self._pass_context = PassContext() + + # self._dist_op_context = DistributedOperatorContext() + + # self._process_meshes = [] + def add_process_mesh(self, process_mesh): assert isinstance(process_mesh, ProcessMesh), \ 'The type of dim_mapping must be ProcessMesh.' @@ -133,12 +249,12 @@ def add_process_mesh(self, process_mesh): def add_dist_tensor_for_program(self, dist_tensor): inner_serial_tensor = dist_tensor.serial_tensor - inner_serial_tensor_id = inner_serial_tensor.desc.id() + inner_serial_tensor_id = inner_serial_tensor.desc.original_id() self._dist_tensors_for_program[inner_serial_tensor_id] = dist_tensor def add_dist_op_for_program(self, dist_op): inner_serial_op = dist_op.serial_op - inner_serial_op_id = inner_serial_op.desc.id() + inner_serial_op_id = inner_serial_op.desc.original_id() self._dist_ops_for_program[inner_serial_op_id] = dist_op def get_dist_tensor_for_program(self, serial_tensor): @@ -215,18 +331,6 @@ def get_tensor_dist_attr_for_graph(self, serial_tensor_node): else: return None - # def set_tensor_dist_attr_for_graph(self, serial_tensor_node, dist_attr): - # assert serial_tensor_node.is_var() and \ - # serial_tensor_node.var() is not None - # serial_tensor_id = serial_tensor_node.node.original_desc_id() - # dist_tensor = self._dist_tensors_for_program.get(serial_tensor_id, None) - # assert dist_tensor is not None, \ - # "The distributed tensor of the program has not been added to this context." 
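Switching the _dist_tensors_for_program / _dist_ops_for_program keys from desc.id() to desc.original_id() matters once the context starts cloning programs: presumably a cloned or rewritten desc gets a fresh id() while original_id() still points back at the desc it came from, so attributes registered against the user's serial program remain reachable from the clone (the same fallback shows up later in copy_dist_attr_from_program_to_graph). A toy illustration of that keying discipline; ToyDesc is a stand-in, not Paddle's OpDesc:

import itertools

_ids = itertools.count(1000)

class ToyDesc:
    def __init__(self, original_id=None):
        self._id = next(_ids)
        self._original_id = self._id if original_id is None else original_id

    def id(self):
        return self._id

    def original_id(self):
        return self._original_id

    def clone(self):
        # A clone gets a fresh id but keeps pointing at the original desc.
        return ToyDesc(original_id=self._original_id)

dist_ops = {}
op = ToyDesc()
dist_ops[op.original_id()] = {"impl_type": "default"}

cloned = op.clone()
print(cloned.id() != op.id())          # True: the clone has a new id
print(dist_ops[cloned.original_id()])  # the lookup still finds the dist attr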
- # serial_tensor_node_id = serial_tensor_node.id() - # new_dist_tensor = DistributedTensor(dist_tensor.serial_tensor, - # dist_attr) - # self._dist_tensors_for_graph[serial_tensor_node_id] = new_dist_tensor - def get_op_dist_attr_for_program(self, serial_op): serial_op_id = serial_op.desc.id() dist_op = self._dist_ops_for_program.get(serial_op_id, None) @@ -259,17 +363,6 @@ def get_op_dist_attr_for_graph(self, serial_op_node): else: return None - # def set_op_dist_attr_for_graph(self, serial_op_node, dist_attr): - # assert serial_op_node.is_op() and \ - # serial_op_node.op() is not None - # serial_op_id = serial_op_node.node.original_desc_id() - # dist_op = self._dist_ops_for_program.get(serial_op_id, None) - # assert dist_op is not None, \ - # "The distributed operator of the program has not been added to this context." - # serial_op_node_id = serial_op_node.id() - # new_dist_op = DistributedOperator(dist_op.serial_op, dist_attr) - # self._dist_ops_for_graph[serial_op_node_id] = new_dist_op - def get_dist_attr_for_graph(self, serial_node): if serial_node.is_var() and serial_node.var() is not None: serial_tensor_node_id = _node_id(serial_node) @@ -288,15 +381,14 @@ def get_dist_attr_for_graph(self, serial_node): return None return None - def init_dist_attr_for_program(self): - assert self._serial_program, \ - "Please set the program of this context before initializing its distribute attributes." - if self._is_initialized_for_program: - return + def _init_dist_attr_for_program(self, no_default=False): # Copy the dist tensors and dist ops annotated by users from the default context - default_ctx = get_default_distributed_context() - self._process_meshes = copy.deepcopy(default_ctx.process_meshes) - for block in self._serial_program.blocks: + if not no_default: + default_ctx = get_default_distributed_context() + self._process_meshes = copy.deepcopy(default_ctx.process_meshes) + else: + default_ctx = self + for block in self._serial_main_program.blocks: for tensor in block.vars.values(): # Copy the distributed tensors in the default context default_dist_tensor = default_ctx.get_dist_tensor_for_program( @@ -316,9 +408,8 @@ def init_dist_attr_for_program(self): if current_dist_op is None: dist_op = DistributedOperator(op) self.add_dist_op_for_program(dist_op) - self._is_initialized_for_program = True - def order_nodes_by_program_order(self): + def _order_nodes_by_program_order(self): def _contains(nodes, target_node): for node in nodes: if _node_id(node) == _node_id(target_node): @@ -328,7 +419,6 @@ def _contains(nodes, target_node): serial_ordered_tensor_nodes = [] serial_ordered_op_nodes = [] all_nodes = [] - # for idx, graph in enumerate(self._serial_graph.all_sub_graphs()): for idx, graph in enumerate(self._serial_graph.all_sub_graphs()): for node in graph.all_nodes(): all_nodes.append(node) @@ -346,33 +436,35 @@ def _contains(nodes, target_node): new_serial_ordered_tensor_nodes = [] new_serial_ordered_op_nodes = [] + new_serial_ordered_nodes = [] for op_node in serial_ordered_op_nodes: tensor_nodes = [] for tensor_node in op_node.inputs: if tensor_node.is_var() \ and tensor_node.var() is not None \ - and not _contains(self._serial_ordered_nodes, tensor_node): + and not _contains(new_serial_ordered_nodes, tensor_node): tensor_nodes.append(tensor_node) new_serial_ordered_tensor_nodes.append(tensor_node) tensor_nodes.sort(key=lambda node: node.node.original_desc_id()) - self._serial_ordered_nodes.extend(tensor_nodes) - self._serial_ordered_nodes.append(op_node) + 
new_serial_ordered_nodes.extend(tensor_nodes) + new_serial_ordered_nodes.append(op_node) new_serial_ordered_op_nodes.append(op_node) tensor_nodes = [] for tensor_node in op_node.outputs: if tensor_node.is_var() \ and tensor_node.var() is not None \ - and not _contains(self._serial_ordered_nodes, tensor_node): + and not _contains(new_serial_ordered_nodes, tensor_node): tensor_nodes.append(tensor_node) new_serial_ordered_tensor_nodes.append(tensor_node) tensor_nodes.sort(key=lambda node: node.node.original_desc_id()) - self._serial_ordered_nodes.extend(tensor_nodes) + new_serial_ordered_nodes.extend(tensor_nodes) new_serial_ordered_tensor_nodes.sort( key=lambda node: node.node.original_desc_id()) new_serial_ordered_op_nodes.sort( key=lambda node: node.node.original_desc_id()) self._serial_ordered_tensor_nodes = new_serial_ordered_tensor_nodes self._serial_ordered_op_nodes = new_serial_ordered_op_nodes + self._serial_ordered_nodes = new_serial_ordered_nodes assert len(self._serial_ordered_nodes) == len( self._serial_ordered_tensor_nodes) + len( self._serial_ordered_op_nodes) @@ -385,16 +477,9 @@ def _contains(nodes, target_node): "WARNING: there are some orphan tensors or ops which are not used in the execution." ) - def init_dist_attr_for_graph(self): - assert self._is_initialized_for_program, \ - "The program must be initialized before initializing the distributed attributes for its graph." - if self._is_initialized_for_graph: - return - # Convert program to graph - set_flags({"FLAGS_convert_all_blocks": True}) - self._serial_graph = framework.IrGraph( - core.Graph(self._serial_program.desc)) - self.order_nodes_by_program_order() + def _init_dist_attr_for_graph(self): + # Convert program to graph and initialize the distributed attributes + self._order_nodes_by_program_order() for node in self.serial_ordered_nodes: if node.is_var() and node.var() is not None: dist_tensor = None @@ -428,7 +513,6 @@ def init_dist_attr_for_graph(self): new_dist_op = DistributedOperator(dist_op.serial_op, dist_op.dist_attr) self._dist_ops_for_graph[serial_op_node_id] = new_dist_op - self._is_initialized_for_graph = True def clear_dist_info_for_program(self): self._dist_tensors_for_program.clear() @@ -438,8 +522,40 @@ def clear_dist_info_for_graph(self): self._dist_tensors_for_graph.clear() self._dist_ops_for_graph.clear() + def copy_dist_attr_from_program_to_graph(self): + for node in self.serial_ordered_nodes: + if node.is_var() and node.var() is not None: + dist_tensor = None + tensor_id = node.node.original_desc_id() + for cur_tensor_id, cur_dist_tensor in self._dist_tensors_for_program.items( + ): + if tensor_id == cur_tensor_id \ + or tensor_id == cur_dist_tensor.serial_tensor.desc.original_id(): + dist_tensor = cur_dist_tensor + assert dist_tensor is not None, \ + "Tensor must have a distributed tensor after the initialization for program." + serial_tensor_node_id = _node_id(node) + new_dist_tensor = DistributedTensor(dist_tensor.serial_tensor, + dist_tensor.dist_attr) + self._dist_tensors_for_graph[ + serial_tensor_node_id] = new_dist_tensor + if node.is_op() and node.op() is not None: + dist_op = None + op_id = node.node.original_desc_id() + for cur_op_id, cur_dist_op in self._dist_ops_for_program.items( + ): + if op_id == cur_op_id \ + or op_id == cur_dist_op.serial_op.desc.original_id(): + dist_op = cur_dist_op + assert dist_op is not None, \ + "Operator must have a distributed operator after the initialization for program." 
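_order_nodes_by_program_order now builds the ordered node list locally (new_serial_ordered_nodes) before committing it, walking ops in program order and splicing each op's not-yet-seen input tensor nodes before it and its new output tensor nodes after it. A toy version of that interleaving, using plain names instead of IrGraph nodes and skipping the per-group sort by original_desc_id:

ops = [
    {"id": 1, "inputs": ["x", "w"], "outputs": ["y"]},
    {"id": 2, "inputs": ["y", "b"], "outputs": ["z"]},
]

ordered, seen = [], set()
for op in sorted(ops, key=lambda o: o["id"]):
    new_inputs = [t for t in op["inputs"] if t not in seen]
    ordered.extend(new_inputs)            # unseen inputs come first
    seen.update(new_inputs)
    ordered.append("op%d" % op["id"])     # then the op itself
    new_outputs = [t for t in op["outputs"] if t not in seen]
    ordered.extend(new_outputs)           # finally its newly produced outputs
    seen.update(new_outputs)

print(ordered)  # ['x', 'w', 'op1', 'y', 'b', 'op2', 'z']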
+ serial_op_node_id = _node_id(node) + new_dist_op = DistributedOperator(dist_op.serial_op, + dist_op.dist_attr) + self._dist_ops_for_graph[serial_op_node_id] = new_dist_op + def copy_dist_attr_from_graph_to_program(self): - assert self._is_initialized_for_program and self._is_initialized_for_graph, \ + assert self._is_initialized, \ "Both program and graph must be initialized." updated_tensors = {} # all_nodes = self._serial_graph.all_nodes() @@ -461,7 +577,7 @@ def copy_dist_attr_from_graph_to_program(self): op_dist_attr_for_graph = self.get_op_dist_attr_for_graph(node) dist_op_for_program = self._dist_ops_for_program[op_id] dist_op_for_program.dist_attr = op_dist_attr_for_graph - # TODO: the completion algorithm will skip orphan tensors, + # TODO: the completion algorithm will skip orphan tensors, # here we just set there process_mesh to the first one. for orphan_node in self._serial_orphan_tensor_nodes: serial_tensor_id = orphan_node.var().id() @@ -532,21 +648,27 @@ def amend_dist_attr_for_program(self): dims_mapping[i] = -1 def validate_dist_attr_for_program(self): - if not self._is_initialized_for_program: + if not self._is_initialized: assert False, \ "Program must be initialized before validating its distributed attributes" - for block in self.serial_program.blocks: + for block in self.serial_main_program.blocks: for tensor in block.vars.values(): dist_tensor = self.get_dist_tensor_for_program(tensor) + assert dist_tensor is not None, \ + "Tensor {} does not have a distributed attribute.".format( + dist_tensor.serial_tensor.name) if (dist_tensor is not None) and ( not dist_tensor.validate_dist_attr()): assert False, "Tensor {} has a wrong distributed attributes {}.".format( dist_tensor.serial_tensor.name, dist_tensor.dist_attr) for op in block.ops: dist_op = self.get_dist_op_for_program(op) + assert dist_op is not None, \ + "Operator {} does not have a distributed attribute.".format( + dist_op.serial_op.type) if (dist_op is not None) and (not dist_op.validate_dist_attr()): assert False, "Operator {} has a wrong distributed attributes {}.".format( - dist_op.serial_op.type, dist_tensor.dist_attr) + dist_op.serial_op.type, dist_op.dist_attr) return True def __deepcopy__(self, memo): @@ -554,10 +676,12 @@ def __deepcopy__(self, memo): result = cls.__new__(cls) memo[id(self)] = result for k, v in self.__dict__.items(): - if k == "_serial_program" or k == "_serial_graph" \ - or k == "_dist_main_programs" or k == "_dist_startup_programs" \ - or k == "_serial_ordered_nodes" or k == "_serial_ordered_tensor_nodes" \ - or k == "_serial_ordered_op_nodes": + if k in [ + "_original_serial_main_program", "_original_serial_startup_program", \ + "_serial_main_program", "_serial_startup_program", "_serial_graph", \ + "_dist_main_programs", "_dist_startup_programs", \ + "_serial_ordered_nodes", "_serial_ordered_tensor_nodes", \ + "_serial_ordered_op_nodes"]: setattr(result, k, v) else: setattr(result, k, copy.deepcopy(v, memo)) diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index 2cd841ef80979..b9ee6d93fd209 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -34,12 +34,9 @@ from paddle.distributed.utils import get_logger from paddle.distributed.passes import new_pass, PassContext -from .mapper import mapping from .cluster import Cluster -from .reshard import Resharder -from .planner import Planner -from .completion import Completer -from .partitioner import 
Partitioner +from .planner_v2 import Planner +from .parallelizer_v2 import Parallelizer from .dist_op import DistributedOperator from .dist_saver import DistributedSaver from .dist_loader import NonIterableGeneratorLoader @@ -79,7 +76,6 @@ def __init__(self, self._dist_main_progs = defaultdict(dict) # dist main programs self._dist_startup_progs = defaultdict(dict) # dist startup programs self._dist_contexts = {} - self._pass_contexts = {} self._feed_vars = {} self._fetch_vars = {} @@ -94,10 +90,27 @@ def prepare(self, self._loss = loss self._metrics = to_list(metrics) self._mode = mode - self._build(mode) # build forward program - self._plan(mode) # completion & planner - self._parallel(mode, all_ranks) # parallel - self._initialize(mode) # init comm and startup program + # Build forward program + self._build(mode) + # Do the planning process + planner = Planner(mode, self._dist_contexts[mode]) + planner.plan() + # Parallelize program based on the planner's results + # For now, the completer has to be passed to the planner, + # because we may use it to complete the annotation of the backwarkward and update. + parallelizer = Parallelizer(mode, planner.completer, + self._dist_contexts[mode]) + if not all_ranks: + parallelizer.parallel(self._cur_rank) + else: + parallelizer.parallel_all() + # Get the distributed main programs and startup programs + self._dist_main_progs[mode] = self._dist_contexts[ + mode].dist_main_programs + self._dist_startup_progs[mode] = self._dist_contexts[ + mode].dist_startup_programs + # Init comm and startup program + self._initialize(mode) def _build(self, mode): serial_main_prog = self._serial_main_progs.get(mode, None) @@ -118,11 +131,10 @@ def _build(self, mode): losses = to_list(self._loss(*(outputs + labels))) default_ctx = get_default_distributed_context() - if not default_ctx.is_annotation or self._default_strategy: + if not default_ctx.has_annotation or self._default_strategy: inputs = [self._set_data_parallel(var) for var in inputs] labels = [self._set_data_parallel(var) for var in labels] - # print(serial_main_prog) self._feed_vars[mode] = {"inputs": inputs, "labels": labels} self._fetch_vars[mode] = { @@ -134,34 +146,9 @@ def _build(self, mode): self._serial_main_progs[mode] = serial_main_prog self._serial_startup_progs[mode] = serial_startup_prog self._dist_contexts[mode] = DistributedContext( - serial_main_prog, serial_startup_prog, self._dist_main_progs[mode], - self._dist_startup_progs[mode]) - self._pass_contexts[mode] = PassContext() - - def _plan(self, mode): - - # NOTE: [HighOrderGrad]. There are grad ops in forward phase, and it need - # dependency of backward-forward ops in forward completition. 
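After this change prepare() only orchestrates: the Planner completes the distributed annotation, and the Parallelizer (added later in this diff as parallelizer_v2.py) partitions and reshards per rank, reusing the planner's completer so backward and update annotation stay consistent with the forward plan. A schematic, non-Paddle sketch of that orchestration with stand-in classes:

class ToyPlanner:
    def __init__(self, mode, dist_context):
        self.mode = mode
        self.completer = "completer"      # stand-in for the real Completer

    def plan(self):
        print("complete annotation for mode:", self.mode)

class ToyParallelizer:
    def __init__(self, mode, completer, dist_context):
        self._mode = mode
        self._completer = completer

    def parallel(self, rank):
        print("partition + reshard for rank", rank)

    def parallel_all(self):
        for rank in (0, 1):               # illustrative two-rank world
            self.parallel(rank)

def prepare(mode, dist_context, cur_rank, all_ranks=False):
    planner = ToyPlanner(mode, dist_context)
    planner.plan()
    parallelizer = ToyParallelizer(mode, planner.completer, dist_context)
    if all_ranks:
        parallelizer.parallel_all()
    else:
        parallelizer.parallel(cur_rank)

prepare("train", dist_context=None, cur_rank=0)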
- defualt_ctx = get_default_distributed_context() - self._dist_contexts[mode]._dist_op_context = defualt_ctx.dist_op_context - - # Complete the distributed annotation - serial_main_prog = self._serial_main_progs[mode] - self._completer = Completer(self._dist_contexts[mode]) - self._completer.complete_forward_annotation(serial_main_prog) - # TODO: add auto planner process - # parse forward sub block - self._dist_contexts[mode].block_state.parse_forward_blocks( - serial_main_prog) - - def _parallel(self, mode, all_ranks=False): - if not all_ranks: - self._parallel_program(mode, self._cur_rank) - else: - world_process_group = get_world_process_group() - all_ranks = world_process_group.ranks - for rank in all_ranks: - self._parallel_program(mode, rank) + self._serial_main_progs[mode], self._serial_startup_progs[mode], + self._optimizer, losses, self._feed_vars[mode], + self._fetch_vars[mode], self.strategy) def _initialize(self, mode): if self._nranks > 1: @@ -190,131 +177,6 @@ def _initialize(self, mode): prune_startup_prog = dist_startup_prog._prune(uninitialized) self._executor.run(prune_startup_prog) - def _parallel_program(self, mode, rank): - serial_main_program = self._serial_main_progs[mode] - serial_startup_program = self._serial_startup_progs[mode] - dist_context = self._dist_contexts[mode] - if mode == "train" and self._optimizer: - # Generate backward - serial_loss = self._fetch_vars[mode]["loss"][0] - params_grads = self._generate_backward( - serial_main_program, serial_startup_program, serial_loss) - # Apply pre optimization passes - self._apply_pre_optimization(serial_main_program, - serial_startup_program, serial_loss, - params_grads) - # Do logical partition - partitioner = Partitioner(dist_context, rank) - dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( - serial_main_program, serial_startup_program, params_grads) - # Generate optimizer - self._generate_optimizer(dist_main_prog, dist_startup_prog, - dist_params_grads) - # Do reshard process - set_grad_var_shape(dist_main_prog, dist_context) - make_data_unshard(dist_main_prog, dist_startup_prog, dist_context) - resharder = Resharder(dist_main_prog, dist_startup_prog, rank, - dist_context, dist_params_grads) - resharder.reshard() - # Apply post optimization passes - self._apply_post_optimization(dist_main_prog, dist_startup_prog, - rank, dist_params_grads) - else: - # Apply pre optimization passes - self._apply_pre_optimization(serial_main_program, - serial_startup_program, None, None) - # Do logical partition - partitioner = Partitioner(dist_context, rank) - dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( - serial_main_program, serial_startup_program, []) - # Do reshard process - make_data_unshard(dist_main_prog, dist_startup_prog, dist_context) - resharder = Resharder(dist_main_prog, dist_startup_prog, rank, - dist_context, [], 1) - resharder.reshard() - - # clone program for test - if mode != 'train': - dist_main_prog = dist_main_prog.clone(for_test=True) - dist_startup_prog = dist_startup_prog.clone(for_test=True) - - self._dist_main_progs[mode][rank] = dist_main_prog - self._dist_startup_progs[mode][rank] = dist_startup_prog - - def _generate_backward(self, main_program, startup_program, loss): - with program_guard(main_program, startup_program): - params_grads = append_backward( - loss, - distop_context=self._dist_contexts[self.mode].dist_op_context) - self._completer.complete_backward_annotation(main_program) - 
self._dist_contexts[self.mode].block_state.parse_backward_blocks( - main_program) - return params_grads - - def _generate_optimizer(self, main_program, startup_program, params_grads): - with program_guard(main_program, startup_program): - optimizer_ops = copy.deepcopy(self._optimizer).apply_gradients( - params_grads) - self._completer.complete_update_annotation(main_program) - return optimizer_ops - - def _apply_pre_optimization(self, main_program, startup_program, loss, - params_grads): - - # apply amp pass - if self.strategy.amp: - config = copy.deepcopy(self.strategy.amp_configs) - config["dist_context"] = self._dist_contexts[self.mode] - config["params_grads"] = params_grads - config["loss"] = loss - config["input_data"] = self._feed_vars[self.mode][ - "inputs"] + self._feed_vars[self.mode]["labels"] - if config["use_pure_fp16"]: - config["base_opt"] = self._optimizer - auto_parallel_fp16_pass = new_pass("auto_parallel_fp16", config) - auto_parallel_fp16_pass.apply([main_program], - [startup_program], - self._pass_contexts[self.mode]) - else: - auto_parallel_amp_pass = new_pass("auto_parallel_amp", config) - auto_parallel_amp_pass.apply([main_program], [startup_program], - self._pass_contexts[self.mode]) - - # apply recompute pass - if self.strategy.recompute: - config = copy.deepcopy(self.strategy.recompute_configs) - config["dist_context"] = self._dist_contexts[self.mode] - config["no_grad_set"] = None - config["loss"] = loss - auto_parallel_recompute_pass = new_pass("auto_parallel_recompute", - config) - auto_parallel_recompute_pass.apply([main_program], - [startup_program], - self._pass_contexts[self.mode]) - - def _apply_post_optimization(self, main_program, startup_program, rank, - params_grads): - if self.strategy.sharding: - config = copy.deepcopy(self.strategy.sharding_configs) - config["dist_context"] = self._dist_contexts[self.mode] - config["params_grads"] = params_grads - config["global_rank"] = rank - auto_parallel_sharding_pass = new_pass("auto_parallel_sharding", - config) - auto_parallel_sharding_pass.apply([main_program], - [startup_program], - self._pass_contexts[self.mode]) - - if self.strategy.gradient_merge: - config = copy.deepcopy(self.strategy.gradient_merge_configs) - config["dist_context"] = self._dist_contexts[self.mode] - config["params_grads"] = params_grads - auto_parallel_gradient_merge_pass = new_pass( - "auto_parallel_gradient_merge_pass", config) - auto_parallel_gradient_merge_pass.apply( - [main_program], [startup_program], - self._pass_contexts[self.mode]) - def fit(self, train_data, batch_size=1, diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index 47f76353e4655..5d43c56827274 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -18,16 +18,16 @@ _g_distributed_operator_impl_containers = {} _g_elementwise_ops = [ - "elementwise_add", "gelu", "dropout", "cast", "gather", "concat" + "elementwise", "gelu", "dropout", "cast", "gather", "concat" ] BACKWARD_ONLY_DIST_OPS = {'check_finite_and_unscale', 'update_loss_scaling'} def is_elementwise_op(op_type): - if op_type in _g_elementwise_ops: - return True - else: - return False + for eltwise_op in _g_elementwise_ops: + if eltwise_op in op_type: + return True + return False class DistributedOperatorImplContainer: @@ -156,7 +156,9 @@ def register_distributed_operator_impl(op_type, dist_impl): assert False, "Must register distributed 
operator registry first." -def find_best_compatible_distributed_operator_impl(dist_op, fwd=True): +def find_best_compatible_distributed_operator_impl(dist_op, + fwd=True, + partial=True): """ Here just return the first compatible implemention. This will be improved by cost model in the future. @@ -168,39 +170,55 @@ def find_best_compatible_distributed_operator_impl(dist_op, fwd=True): dist_op_default_impl_container = get_distributed_operator_impl_container( "default") compatible_impls = [] - if fwd: - # First, find impls in the corresponding container - if dist_op_impl_container: - compatible_impls.extend( - dist_op_impl_container.get_input_compatible_impls(dist_op)) - # Second, find impls in the elementwise container - if dist_op_eltwise_impl_container and is_elementwise_op(op_type): - compatible_impls.extend( - dist_op_eltwise_impl_container.get_input_compatible_impls( - dist_op)) - # Third, find impls in the default container - if dist_op_default_impl_container: - compatible_impls.extend( - dist_op_default_impl_container.get_input_compatible_impls( - dist_op)) + if partial: + if fwd: + # First, find impls in the corresponding container + if dist_op_impl_container: + compatible_impls.extend( + dist_op_impl_container.get_input_compatible_impls(dist_op)) + # Second, find impls in the elementwise container + if dist_op_eltwise_impl_container and is_elementwise_op(op_type): + compatible_impls.extend( + dist_op_eltwise_impl_container.get_input_compatible_impls( + dist_op)) + # Third, find impls in the default container + if dist_op_default_impl_container: + compatible_impls.extend( + dist_op_default_impl_container.get_input_compatible_impls( + dist_op)) + else: + # First, find impls in the corresponding container + if dist_op_impl_container: + compatible_impls.extend( + dist_op_impl_container.get_output_compatible_impls(dist_op)) + # Second, find impls in the elementwise container + if dist_op_eltwise_impl_container and is_elementwise_op(op_type): + compatible_impls.extend( + dist_op_eltwise_impl_container.get_output_compatible_impls( + dist_op)) + # Third, find impls in the default container + if dist_op_default_impl_container: + compatible_impls.extend( + dist_op_default_impl_container.get_output_compatible_impls( + dist_op)) else: # First, find impls in the corresponding container if dist_op_impl_container: compatible_impls.extend( - dist_op_impl_container.get_output_compatible_impls(dist_op)) + dist_op_impl_container.get_compatible_impls(dist_op)) # Second, find impls in the elementwise container if dist_op_eltwise_impl_container and is_elementwise_op(op_type): compatible_impls.extend( - dist_op_eltwise_impl_container.get_output_compatible_impls( - dist_op)) + dist_op_eltwise_impl_container.get_compatible_impls(dist_op)) # Third, find impls in the default container if dist_op_default_impl_container: compatible_impls.extend( - dist_op_default_impl_container.get_output_compatible_impls( - dist_op)) + dist_op_default_impl_container.get_compatible_impls(dist_op)) + if compatible_impls: # For now, just return the first compatible impl - best_compatible_impl = compatible_impls[0] + # best_compatible_impl = compatible_impls[0] + best_compatible_impl = compatible_impls else: best_compatible_impl = None return best_compatible_impl diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py index 4795050d15dcc..563d247af3bb2 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_default.py +++ 
b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -53,6 +53,7 @@ def __init__(self, name): def is_input_compatible(self, dist_op): op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr + batch_dim_mappings = [] input_names = op_desc.input_names() xshape_arg_names = [] if "XShape" in input_names: @@ -64,14 +65,14 @@ def is_input_compatible(self, dist_op): for mapping in dims_mapping: if mapping != -1: return False - # continue - # if len(dims_mapping) < 1: - # continue + continue if arg_name not in xshape_arg_names: if len(dims_mapping) > 1: for mapping in dims_mapping[1:]: if mapping != -1: return False + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) else: if dims_mapping[0] != -1: return False @@ -79,12 +80,19 @@ def is_input_compatible(self, dist_op): for mapping in dims_mapping[2:]: if mapping != -1: return False + if len(dims_mapping) >= 2: + batch_dim_mappings.append(dims_mapping[1]) + + if compute_compatible_dim_mapping(batch_dim_mappings) is None: + return False + return True def is_output_compatible(self, dist_op): op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr output_names = op_desc.output_names() + batch_dim_mappings = [] xshape_arg_names = [] if "XShape" in output_names: xshape_arg_names = op_desc.output("XShape") @@ -95,14 +103,14 @@ def is_output_compatible(self, dist_op): for mapping in dims_mapping: if mapping != -1: return False - # continue - # if len(dims_mapping) < 1: - # continue + continue if arg_name not in xshape_arg_names: if len(dims_mapping) > 1: for mapping in dims_mapping[1:]: if mapping != -1: return False + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) else: if dims_mapping[0] != -1: return False @@ -110,6 +118,12 @@ def is_output_compatible(self, dist_op): for mapping in dims_mapping[2:]: if mapping != -1: return False + if len(dims_mapping) >= 2: + batch_dim_mappings.append(dims_mapping[1]) + + if compute_compatible_dim_mapping(batch_dim_mappings) is None: + return False + return True def is_auto_compatible(self, dist_op): @@ -123,9 +137,12 @@ def is_auto_compatible(self, dist_op): xshape_arg_names = op_desc.input("XShape") for arg_name in op_desc.input_arg_names(): serial_tensor = dist_op.get_serial_input(arg_name) + dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) if serial_tensor.is_parameter: + for mapping in dims_mapping: + if mapping != -1: + return False continue - dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) if arg_name not in xshape_arg_names: if len(dims_mapping) > 1: for mapping in dims_mapping[1:]: @@ -150,9 +167,12 @@ def is_auto_compatible(self, dist_op): xshape_arg_names = op_desc.output("XShape") for arg_name in op_desc.output_arg_names(): serial_tensor = dist_op.get_serial_output(arg_name) + dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) if serial_tensor.is_parameter: + for mapping in dims_mapping: + if mapping != -1: + return False continue - dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) if arg_name not in xshape_arg_names: if len(dims_mapping) > 1: for mapping in dims_mapping[1:]: @@ -181,10 +201,8 @@ def update_dims_mapping(self, dist_op): changed = False op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr - # The following statement will be replaced by a more elegent way - if op_desc.type() == "shape" \ - or op_desc.type() == "slice" \ - or op_desc.type() == "while": + + if op_desc.type() == "while": return False input_names = op_desc.input_names() @@ -229,7 
+247,9 @@ def update_dims_mapping(self, dist_op): compatible_dim_mapping = compute_compatible_dim_mapping( batch_dim_mappings) - assert compatible_dim_mapping is not None, "There is no compatible dim mapping." + if compatible_dim_mapping is None: + return False + for arg_name in op_desc.input_arg_names(): serial_tensor = dist_op.get_serial_input(arg_name) if serial_tensor.is_parameter: @@ -251,6 +271,8 @@ def update_dims_mapping(self, dist_op): )[0]) if input_tensor.is_parameter: continue + if op_desc.type() in ["shape", "slice"]: + continue serial_tensor = dist_op.get_serial_output(arg_name) if serial_tensor.is_parameter: continue diff --git a/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py b/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py index 37d7d93a2e934..78589afc498ee 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py @@ -52,36 +52,76 @@ def __init__(self, name): def is_input_compatible(self, dist_op): op_desc = dist_op.serial_op.desc - if is_elementwise_op(op_desc.type()): - return True - else: + if not is_elementwise_op(op_desc.type()): return False + op_dist_attr = dist_op.dist_attr + dims_mapping_list = [] + input_arg_names = op_desc.input_arg_names() + max_dims_mapping_len = -1 + for arg_name in input_arg_names: + dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) + if max_dims_mapping_len < len(dims_mapping): + max_dims_mapping_len = len(dims_mapping) + dims_mapping_list.append(dims_mapping) + + for idx in range(max_dims_mapping_len): + dim_mappings = [] + for dims_mapping in dims_mapping_list: + if idx < len(dims_mapping): + dim_mappings.append(dims_mapping[-(idx + 1)]) + if compute_compatible_dim_mapping(dim_mappings) is None: + return False + return True def is_output_compatible(self, dist_op): op_desc = dist_op.serial_op.desc - op_desc = dist_op.serial_op.desc - if is_elementwise_op(op_desc.type()): - return True - else: + if not is_elementwise_op(op_desc.type()): return False + op_dist_attr = dist_op.dist_attr + dims_mapping_list = [] + output_arg_names = op_desc.output_arg_names() + max_dims_mapping_len = -1 + for arg_name in output_arg_names: + dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) + if max_dims_mapping_len < len(dims_mapping): + max_dims_mapping_len = len(dims_mapping) + dims_mapping_list.append(dims_mapping) + + for idx in range(max_dims_mapping_len): + dim_mappings = [] + for dims_mapping in dims_mapping_list: + if idx < len(dims_mapping): + dim_mappings.append(dims_mapping[-(idx + 1)]) + if compute_compatible_dim_mapping(dim_mappings) is None: + return False + return True def is_auto_compatible(self, dist_op): op_desc = dist_op.serial_op.desc + if not is_elementwise_op(op_desc.type()): + return False op_dist_attr = dist_op.dist_attr dims_mapping_list = [] + input_arg_names = op_desc.input_arg_names() - max_dims_mapping_len = -1 + input_max_dims_mapping_len = -1 for arg_name in input_arg_names: dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if max_dims_mapping_len < len(dims_mapping): - max_dims_mapping_len = len(dims_mapping) + if input_max_dims_mapping_len < len(dims_mapping): + input_max_dims_mapping_len = len(dims_mapping) dims_mapping_list.append(dims_mapping) + output_arg_names = op_desc.output_arg_names() + output_max_dims_mapping_len = -1 for arg_name in output_arg_names: dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) - assert len(dims_mapping) == 
max_dims_mapping_len + if output_max_dims_mapping_len < len(dims_mapping): + output_max_dims_mapping_len = len(dims_mapping) dims_mapping_list.append(dims_mapping) + assert input_max_dims_mapping_len == output_max_dims_mapping_len + max_dims_mapping_len = input_max_dims_mapping_len + for idx in range(max_dims_mapping_len): dim_mappings = [] for dims_mapping in dims_mapping_list: @@ -96,38 +136,62 @@ def update_dims_mapping(self, dist_op): changed = False op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr + dims_mapping_list = [] + input_arg_names = op_desc.input_arg_names() input_dims_mapping_dict = {} input_dims_mapping_lens = {} - max_dims_mapping_len = -1 + input_max_dims_mapping_len = -1 for arg_name in input_arg_names: dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if max_dims_mapping_len < len(dims_mapping): - max_dims_mapping_len = len(dims_mapping) + if input_max_dims_mapping_len < len(dims_mapping): + input_max_dims_mapping_len = len(dims_mapping) input_dims_mapping_dict[arg_name] = dims_mapping input_dims_mapping_lens[arg_name] = len(dims_mapping) - - dims_mapping_list = [] for arg_name in input_arg_names: - if input_dims_mapping_lens[arg_name] < max_dims_mapping_len: - new_dims_mapping = [-1 for _ in range(max_dims_mapping_len)] + if input_dims_mapping_lens[arg_name] < input_max_dims_mapping_len: + new_dims_mapping = [ + -1 for _ in range(input_max_dims_mapping_len) + ] for i in range(input_dims_mapping_lens[arg_name]): - new_idx = (max_dims_mapping_len - + new_idx = (input_max_dims_mapping_len - input_dims_mapping_lens[arg_name]) + i new_dims_mapping[new_idx] = input_dims_mapping_dict[ arg_name][i] dims_mapping_list.append(new_dims_mapping) else: dims_mapping_list.append(input_dims_mapping_dict[arg_name]) + output_arg_names = op_desc.output_arg_names() + output_dims_mapping_dict = {} + output_dims_mapping_lens = {} + output_max_dims_mapping_len = -1 for arg_name in output_arg_names: dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) - assert len(dims_mapping) == max_dims_mapping_len - dims_mapping_list.append(dims_mapping) + if output_max_dims_mapping_len < len(dims_mapping): + output_max_dims_mapping_len = len(dims_mapping) + output_dims_mapping_dict[arg_name] = dims_mapping + output_dims_mapping_lens[arg_name] = len(dims_mapping) + for arg_name in output_arg_names: + if output_dims_mapping_lens[arg_name] < output_max_dims_mapping_len: + new_dims_mapping = [ + -1 for _ in range(output_max_dims_mapping_len) + ] + for i in range(output_dims_mapping_lens[arg_name]): + new_idx = (output_max_dims_mapping_len - + output_dims_mapping_lens[arg_name]) + i + new_dims_mapping[new_idx] = output_dims_mapping_dict[ + arg_name][i] + dims_mapping_list.append(new_dims_mapping) + else: + dims_mapping_list.append(output_dims_mapping_dict[arg_name]) + assert input_max_dims_mapping_len == output_max_dims_mapping_len + max_dims_mapping_len = input_max_dims_mapping_len compatible_dims_mapping = compute_compatible_dims_mapping( dims_mapping_list) - assert compatible_dims_mapping is not None, "There is no compatible dim mapping." 
+ if compatible_dims_mapping is None: + return False for arg_name in input_arg_names: if input_dims_mapping_lens[arg_name] < max_dims_mapping_len: @@ -149,11 +213,24 @@ def update_dims_mapping(self, dist_op): changed = True for arg_name in output_arg_names: - dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) - if compatible_dims_mapping != dims_mapping: - op_dist_attr.set_output_dims_mapping(arg_name, - compatible_dims_mapping) - changed = True + if output_dims_mapping_lens[arg_name] < max_dims_mapping_len: + new_dims_mapping = [ + -1 for _ in range(output_dims_mapping_lens[arg_name]) + ] + for i in range(output_dims_mapping_lens[arg_name]): + new_idx = (max_dims_mapping_len - + output_dims_mapping_lens[arg_name]) + i + new_dims_mapping[i] = compatible_dims_mapping[new_idx] + if new_dims_mapping != output_dims_mapping_dict[arg_name]: + op_dist_attr.set_output_dims_mapping(arg_name, + new_dims_mapping) + changed = True + else: + if compatible_dims_mapping != output_dims_mapping_dict[ + arg_name]: + op_dist_attr.set_output_dims_mapping( + arg_name, compatible_dims_mapping) + changed = True return changed diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 68167c1c4f7e8..69e1c866de691 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -95,7 +95,8 @@ def _update_dims_mapping_for_matmul(dist_op): broadcast_x_dims_mapping, broadcast_y_dims_mapping, broadcast_out_dims_mapping ]) - assert compatible_dims_mapping is not None, "There is no compatible dim mapping." + if compatible_dims_mapping is None: + return False for i in range(x_dims_mapping_len - 2): new_idx = i + (out_dims_mapping_len - x_dims_mapping_len) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py b/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py index ce68e2060218d..89cd2c9d9e41a 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py @@ -117,7 +117,8 @@ def update_dims_mapping(self, dist_op): compatible_dim_mapping = compute_compatible_dim_mapping( batch_dim_mappings) - assert compatible_dim_mapping is not None, "There is no compatible dim mapping." + if compatible_dim_mapping is None: + return False for arg_name in op_desc.input_arg_names(): dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_slice.py b/python/paddle/distributed/auto_parallel/operators/dist_slice.py index 4bc0a471dcf1c..e3da47fd172ea 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_slice.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_slice.py @@ -1,11 +1,11 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
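The elementwise implementations above align dims mappings from the right, NumPy-broadcasting style, and accept the operator only if every aligned dimension can agree on a single mesh dimension (with -1 meaning replicated). The sketch below captures that rule; compute_compatible_dim_mapping here is a simplified stand-in for the helper in auto_parallel.utils:

def compute_compatible_dim_mapping(dim_mappings):
    # -1 (replicated) is compatible with anything; two different shard
    # mesh-dims are incompatible.
    compatible = -1
    for m in dim_mappings:
        if m == -1:
            continue
        if compatible == -1:
            compatible = m
        elif compatible != m:
            return None
    return compatible

def eltwise_dims_compatible(dims_mapping_list):
    max_len = max(len(d) for d in dims_mapping_list)
    for idx in range(max_len):
        # Align from the right so a rank-1 bias broadcasts against a rank-3 input.
        aligned = [d[-(idx + 1)] for d in dims_mapping_list if idx < len(d)]
        if compute_compatible_dim_mapping(aligned) is None:
            return False
    return True

# [batch, seq, hidden] sharded along batch on mesh dim 0, bias replicated.
print(eltwise_dims_compatible([[0, -1, -1], [-1]]))        # True
print(eltwise_dims_compatible([[0, -1, 1], [0, -1, 0]]))   # False: last dim conflicts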
@@ -17,6 +17,7 @@ from .common import register_distributed_operator_impl_container from .common import register_distributed_operator_impl from ..utils import is_dim_shard +from ..utils import compute_compatible_dim_mapping from ..utils import compute_compatible_and_update_dim_mapping from .dist_default import DistributedDefaultImpl0 @@ -47,6 +48,29 @@ def is_input_compatible(self, dist_op): return True def is_output_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + in_name = op_desc.input('Input')[0] + out_name = op_desc.output('Out')[0] + axes = op_desc.attr('axes') + decrease_axis = op_desc.attr('decrease_axis') + in_dims_mapping = op_dist_attr.get_input_dims_mapping(in_name) + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + + ref_indices = [] + for i in range(len(in_dims_mapping)): + if i not in decrease_axis: + ref_indices.append(i) + if ref_indices == []: + assert len(out_dims_mapping) == 1 + if is_dim_shard(out_dims_mapping[0]): + return False + else: + for i in range(len(out_dims_mapping)): + ref_index = ref_indices[i] + if ref_index in axes and is_dim_shard(out_dims_mapping[i]): + return False + return True def is_compatible(self, dist_op): @@ -95,17 +119,30 @@ def update_dims_mapping(self, dist_op): out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) ref_dims_mapping = [] + ref_indices = [] for i in range(len(in_dims_mapping)): if i not in decrease_axis: ref_dims_mapping.append(in_dims_mapping[i]) + ref_indices.append(i) + if ref_dims_mapping == []: ref_dims_mapping = [-1] - - assert len(ref_dims_mapping) == len(out_dims_mapping) - for i in range(len(out_dims_mapping)): - if out_dims_mapping[i] != ref_dims_mapping[i]: - out_dims_mapping[i] = ref_dims_mapping[i] - changed = True + assert len(ref_dims_mapping) == len(out_dims_mapping) + assert ref_dims_mapping[0] == out_dims_mapping[0] + changed = False + else: + assert len(ref_dims_mapping) == len(out_dims_mapping) + for i in range(len(out_dims_mapping)): + compatible_dim_mapping = compute_compatible_dim_mapping( + [out_dims_mapping[i], ref_dims_mapping[i]]) + if compatible_dim_mapping is None: + continue + if ref_dims_mapping[i] != compatible_dim_mapping: + in_dims_mapping[ref_indices[i]] = compatible_dim_mapping + changed = True + if out_dims_mapping[i] != compatible_dim_mapping: + out_dims_mapping[i] = compatible_dim_mapping + changed = True return changed diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index fc5f1686d0f8c..2ea1223c6f2f3 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -230,7 +230,7 @@ def _get_dist_program(self, rank, dist_context=None, relaunch_phase=False): g_process_group_map = copy.deepcopy(_g_process_group_map) _g_process_group_map.clear() _g_process_group_map[0] = ProcessGroup(0, []) - for process_mesh in dist_context._process_meshes: + for process_mesh in self._dist_context._process_meshes: _g_process_group_map[0].add_ranks(process_mesh.processes) return dist_optimize_ops, dist_params_grads, dist_startup_prog, dist_main_prog, g_process_group_map diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/parallelizer_v2.py new file mode 100644 index 0000000000000..401b423638cde --- /dev/null +++ b/python/paddle/distributed/auto_parallel/parallelizer_v2.py @@ -0,0 +1,172 @@ +# Copyright (c) 2022 PaddlePaddle 
Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +from collections import defaultdict + +from paddle.fluid import program_guard +from paddle.fluid.backward import append_backward +from paddle.distributed.passes import new_pass + +from .reshard import Resharder +from .partitioner import Partitioner +from .dist_op import DistributedOperator +from .dist_saver import DistributedSaver +from .dist_loader import NonIterableGeneratorLoader +from .utils import make_data_unshard, set_grad_var_shape +from .utils import print_program_with_dist_attr, to_list +from .process_group import get_all_process_groups, get_world_process_group +from .dist_context import DistributedContext, get_default_distributed_context + + +class Parallelizer: + def __init__(self, mode, completer, dist_context): + self._mode = mode + self._completer = completer + self._dist_context = dist_context + self._dist_context.initialize() + self._pass_context = self._dist_context.pass_context + self._strategy = self._dist_context.strategy + + def parallel_all(self): + world_process_group = get_world_process_group() + all_ranks = world_process_group.ranks + for rank in all_ranks: + self.parallel(rank) + + def parallel(self, rank): + serial_main_program = self._dist_context.serial_main_program + serial_startup_program = self._dist_context.serial_startup_program + serial_optimizer = self._dist_context.serial_optimizer + if self._mode == "train" and serial_optimizer: + # Generate backward + serial_loss = self._dist_context.serial_fetch_vars["loss"][0] + params_grads = self._generate_backward( + serial_main_program, serial_startup_program, serial_loss) + # Apply pre optimization passes + self._apply_pre_optimization(serial_main_program, + serial_startup_program, serial_loss, + serial_optimizer, params_grads) + # Do logical partition + partitioner = Partitioner(self._dist_context, rank) + dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( + serial_main_program, serial_startup_program, params_grads) + # Generate optimizer + self._generate_optimizer(dist_main_prog, dist_startup_prog, + serial_optimizer, dist_params_grads) + # Do reshard process + set_grad_var_shape(dist_main_prog, self._dist_context) + make_data_unshard(dist_main_prog, dist_startup_prog, + self._dist_context) + resharder = Resharder(dist_main_prog, dist_startup_prog, rank, + self._dist_context, dist_params_grads) + resharder.reshard() + # Apply post optimization passes + self._apply_post_optimization(dist_main_prog, dist_startup_prog, + rank, dist_params_grads) + else: + # Apply pre optimization passes + self._apply_pre_optimization( + serial_main_program, serial_startup_program, None, None, None) + # Do logical partition + partitioner = Partitioner(self._dist_context, rank) + dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( + serial_main_program, serial_startup_program, []) + # Do reshard process + make_data_unshard(dist_main_prog, dist_startup_prog, + self._dist_context) + 
resharder = Resharder(dist_main_prog, dist_startup_prog, rank, + self._dist_context, [], 1) + resharder.reshard() + + # Clone program for test + if self._mode != 'train': + dist_main_prog = dist_main_prog.clone(for_test=True) + dist_startup_prog = dist_startup_prog.clone(for_test=True) + + # Store the distributed programs for further usages + self._dist_context.dist_main_programs[rank] = dist_main_prog + self._dist_context.dist_startup_programs[rank] = dist_startup_prog + + def _generate_backward(self, main_program, startup_program, loss): + with program_guard(main_program, startup_program): + params_grads = append_backward( + loss, distop_context=self._dist_context.dist_op_context) + self._completer.complete_backward_annotation(main_program) + self._dist_context.block_state.parse_backward_blocks(main_program) + return params_grads + + def _generate_optimizer(self, main_program, startup_program, optimizer, + params_grads): + with program_guard(main_program, startup_program): + optimizer_ops = copy.deepcopy(optimizer).apply_gradients( + params_grads) + self._completer.complete_update_annotation(main_program) + return optimizer_ops + + def _apply_pre_optimization(self, main_program, startup_program, loss, + optimizer, params_grads): + if self._strategy is None: + return + # apply amp pass + if self._strategy.amp: + config = copy.deepcopy(self._strategy.amp_configs) + config["dist_context"] = self._dist_context + config["params_grads"] = params_grads + config["loss"] = loss + config["input_data"] = self._dist_context.serial_feed_vars["inputs"] \ + + self._dist_context.serial_feed_vars["labels"] + if config["use_pure_fp16"]: + config["base_opt"] = optimizer + auto_parallel_fp16_pass = new_pass("auto_parallel_fp16", config) + auto_parallel_fp16_pass.apply( + [main_program], [startup_program], self._pass_context) + else: + auto_parallel_amp_pass = new_pass("auto_parallel_amp", config) + auto_parallel_amp_pass.apply([main_program], [startup_program], + self._pass_context) + + # apply recompute pass + if self._strategy.recompute: + config = copy.deepcopy(self._strategy.recompute_configs) + config["dist_context"] = self._dist_context + config["no_grad_set"] = None + config["loss"] = loss + auto_parallel_recompute_pass = new_pass("auto_parallel_recompute", + config) + auto_parallel_recompute_pass.apply( + [main_program], [startup_program], self._dist_context) + + def _apply_post_optimization(self, main_program, startup_program, rank, + params_grads): + if self._strategy is None: + return + if self._strategy.sharding: + config = copy.deepcopy(self._strategy.sharding_configs) + config["dist_context"] = self._dist_context + config["params_grads"] = params_grads + config["global_rank"] = rank + auto_parallel_sharding_pass = new_pass("auto_parallel_sharding", + config) + auto_parallel_sharding_pass.apply( + [main_program], [startup_program], self._dist_context) + + if self._strategy.gradient_merge: + config = copy.deepcopy(self._strategy.gradient_merge_configs) + config["dist_context"] = self._dist_context + config["params_grads"] = params_grads + auto_parallel_gradient_merge_pass = new_pass( + "auto_parallel_gradient_merge_pass", config) + auto_parallel_gradient_merge_pass.apply( + [main_program], [startup_program], self._dist_context) diff --git a/python/paddle/distributed/auto_parallel/planner.py b/python/paddle/distributed/auto_parallel/planner.py index 73df0da10339e..b97c09bd59da8 100755 --- a/python/paddle/distributed/auto_parallel/planner.py +++ b/python/paddle/distributed/auto_parallel/planner.py 
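For orientation, the new `parallelizer_v2.py` above is meant to be driven per rank after a planner has completed the forward annotation: generate backward and optimizer ops, partition, reshard, then run the optimization passes, storing the resulting programs in the distributed context. A hypothetical driver is sketched below; the surrounding Engine wiring and the prepared `DistributedContext` (serial programs, optimizer, strategy already set) are assumptions, only the chaining of the two new modules is taken from this diff.

# Hypothetical driver, not verbatim Engine code.
from paddle.distributed.auto_parallel.planner_v2 import Planner
from paddle.distributed.auto_parallel.parallelizer_v2 import Parallelizer

def build_dist_programs(mode, dist_context, rank):
    planner = Planner(mode, dist_context)
    planner.plan()                                  # forward completion + block parsing
    parallelizer = Parallelizer(mode, planner.completer, dist_context)
    parallelizer.parallel(rank)                     # backward/opt, partition, reshard, passes
    return (dist_context.dist_main_programs[rank],
            dist_context.dist_startup_programs[rank])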
@@ -35,7 +35,6 @@ from .dist_context import DistributedContext, DistributedOperatorContext from .dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute -paddle.enable_static() paddle.seed(123) random.seed(123) np.random.seed(123) diff --git a/python/paddle/distributed/auto_parallel/planner_v2.py b/python/paddle/distributed/auto_parallel/planner_v2.py new file mode 100755 index 0000000000000..7db17e98d07ee --- /dev/null +++ b/python/paddle/distributed/auto_parallel/planner_v2.py @@ -0,0 +1,42 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .completion import Completer +from .dist_context import get_default_distributed_context +from .utils import print_program_with_dist_attr + + +class Planner: + def __init__(self, mode, dist_context): + self._mode = mode + self._dist_context = dist_context + + # NOTE: [HighOrderGrad]. There are grad ops in forward phase, and it need + # dependency of backward-forward ops in forward completion. + default_ctx = get_default_distributed_context() + self._dist_context._dist_op_context = default_ctx.dist_op_context + self._dist_context.initialize() + + self._completer = Completer(self._dist_context) + + @property + def completer(self): + return self._completer + + def plan(self): + self._completer.complete_forward_annotation() + # parse forward sub block + self._dist_context.block_state.parse_forward_blocks( + self._dist_context.serial_main_program) + # TODO: add the auto searcher diff --git a/python/paddle/distributed/auto_parallel/tuner/recorder.py b/python/paddle/distributed/auto_parallel/tuner/recorder.py index d0f181a635413..ba61843831a25 100644 --- a/python/paddle/distributed/auto_parallel/tuner/recorder.py +++ b/python/paddle/distributed/auto_parallel/tuner/recorder.py @@ -138,7 +138,6 @@ def get_state(self): def from_state(cls, state): records = cls(state["direction"]) records.records = [MetricRecord.from_state(r) for r in state["records"]] - print("here 1", records.records) return records diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index 9c40034498dbc..ac07b49f45c3b 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -159,11 +159,11 @@ def print_program_with_dist_attr(program, dist_context=None): from .dist_context import set_default_distributed_context if dist_context is None: dist_context = get_default_distributed_context() - print(program) + print(program, flush=True) else: original_default_context = get_default_distributed_context() set_default_distributed_context(dist_context) - print(program) + print(program, flush=True) set_default_distributed_context(original_default_context) lock.release() diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index e33a3dba669ab..a781f314d3f20 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ 
-226,9 +226,15 @@ def _new_process_group_impl(backend, world_size, group_name, pg_options, - group_id=0): + group_id=0, + src_rank=None, + dst_rank=None): pg = None genv = _get_global_env() + if backend != 'heter': + assert src_rank is None and dst_rank is None, ( + "src_rank and dst_rank " + "can only be set for heter backend.") assert backend in _valid_backend_list, "Unsupported backend: %s." % backend if backend == "gloo": place = core.CPUPlace() @@ -269,7 +275,9 @@ def _new_process_group_impl(backend, gloo_rank=cluster_id, gloo_size=len(cluster_size), with_switch=True, - switch_endpoint=switch_ep) + switch_endpoint=switch_ep, + src_rank=src_rank, + dst_rank=dst_rank) return pg @@ -322,6 +330,16 @@ def barrier(group=None): attrs={'ring_id': ring_id}) +# _custom_gid provides a way for users to +# set the group id, which is usually useful +# to be compatible with the static mode. +_custom_gid = None + + +def _set_custom_gid(gid): + _custom_gid = gid + + def new_group(ranks=None, backend=None): """ @@ -348,9 +366,9 @@ def new_group(ranks=None, backend=None): global _group_map if in_dygraph_mode(): global _default_group_name - gid = _new_ring_id() + gid = _custom_gid if _custom_gid else _new_ring_id() group_name = _default_group_name + str(gid) - if ranks is None or len(ranks) > 1: + if backend != 'heter' and (ranks is None or len(ranks) > 1): global_group = _get_default_group() global_rank = global_group.rank global_ranks = global_group.ranks @@ -362,8 +380,10 @@ def new_group(ranks=None, backend=None): "equal to that of the default global group.") size = len(ranks) ranks = sorted(ranks) - if size > 1 and global_rank in ranks: - rank = ranks.index(global_rank) + if backend == 'heter' or (size > 1 and global_rank in ranks): + rank = 0 if backend == 'heter' else ranks.index(global_rank) + src_rank = ranks[0] if backend == 'heter' else None + dst_rank = ranks[1] if backend == 'heter' else None pg = _new_process_group_impl( backend, _default_store, @@ -371,7 +391,9 @@ def new_group(ranks=None, backend=None): size, group_name, pg_options=None, - group_id=gid) + group_id=gid, + src_rank=src_rank, + dst_rank=dst_rank) else: rank = -1 pg = None diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index c4d42f90615fc..90440ff9d0ea9 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -138,9 +138,16 @@ def _get_hybrid_degree(self): if pp_degree > 1: assert strategy.pipeline is True - assert global_world_size == mp_degree * sharding_degree * pp_degree * dp_degree, \ - "global work size [{}], mp_degree [{}], sharding_degree [{}], pp_degree [{}], dp_degree [{}].".format( - global_world_size, mp_degree, sharding_degree, pp_degree, dp_degree) + if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None): + assert pp_degree == 2, ("For manually set pipeline, only " + "pp_degree = 2 is supported.") + assert global_world_size == mp_degree * sharding_degree * dp_degree, \ + "global work size [{}], mp_degree [{}], sharding_degree [{}], dp_degree [{}].".format( + global_world_size, mp_degree, sharding_degree, dp_degree) + else: + assert global_world_size == mp_degree * sharding_degree * pp_degree * dp_degree, \ + "global work size [{}], mp_degree [{}], sharding_degree [{}], pp_degree [{}], dp_degree [{}].".format( + global_world_size, mp_degree, sharding_degree, pp_degree, dp_degree) # FIXME (JZ-LIANG) 
deprecated hybrid_dp if sharding_configs["hybrid_dp"]: @@ -268,7 +275,11 @@ def _inner_opt_minimize(self, loss, startup_program, parameter_list, if self.pp_degree > 1: startup_program = startup_program._pipeline_opt['startup_program'] print("pp_rank:", self.pp_rank) - main_program = program_list[self.pp_rank] + if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None): + main_program = program_list[int( + os.getenv("PADDLE_MANUAL_PIPELINE_STAGE"))] + else: + main_program = program_list[self.pp_rank] with open("main_%d" % self.role_maker._worker_index(), 'w') as f: f.writelines(str(main_program)) main_block = main_program.global_block() @@ -633,14 +644,15 @@ def _init_pair_comm(self, pair, ring_id): self.pp_group_endpoints[pair[1]], ] pp_rank = 0 if self.pp_rank == pair[0] else 1 - self._collective_helper._init_communicator( - self._startup_program, - self.current_endpoint, - pp_group_endpoints, - pp_rank, - ring_id, - False, - sync=False) + if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None) is None: + self._collective_helper._init_communicator( + self._startup_program, + self.current_endpoint, + pp_group_endpoints, + pp_rank, + ring_id, + False, + sync=False) def _init_npu_pipeline_comm(self, startup_block): # NOTE(wangxi): some bug with hccl, must set pp_degree be even number @@ -714,14 +726,15 @@ def _init_npu_pipeline_comm(self, startup_block): def _init_pipeline_comm(self, startup_block): # TODO (JZ-LIANG) to unify pp_rank_ and pp_rank - self._collective_helper._init_communicator( - self._startup_program, - self.current_endpoint, - self.pp_group_endpoints, - self.pp_rank, - self.pp_ring_id, - False, - sync=False) + if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None) is None: + self._collective_helper._init_communicator( + self._startup_program, + self.current_endpoint, + self.pp_group_endpoints, + self.pp_rank, + self.pp_ring_id, + False, + sync=False) if core.is_compiled_with_npu(): self._init_npu_pipeline_comm(startup_block) @@ -1387,17 +1400,27 @@ def _build_groups(self): # NOTE (JZ-LIANG) support outter-pure-dp to scale the throughput in 3D parallelism # e.g. 
mp-sharding-pp-dp # sharding-hybrid-dp as one senario of outter-pure-dp - assert self.global_word_size == self.mp_degree * self.sharding_degree * self.pp_degree * self.dp_degree, "mp_degree: [{}], sharding_degree: [{}], pp_degree: [{}], dp_degree: [{}]; BUT global nrank: [{}]".format( - self.mp_degree, self.sharding_degree, self.pp_degree, - self.dp_degree, self.global_word_size) + local_pp_degree = self.pp_degree + if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None): + assert self.pp_degree == 2, ("For manually set pipeline, only " + "pp_degree = 2 is supported.") + assert self.global_word_size == self.mp_degree * self.sharding_degree * self.dp_degree, \ + "global work size [{}], mp_degree [{}], sharding_degree [{}], dp_degree [{}].".format( + self.global_word_size, self.mp_degree, self.sharding_degree, self.dp_degree) + local_pp_degree = 1 + else: + assert self.global_word_size == self.mp_degree * self.sharding_degree * self.pp_degree * self.dp_degree, "mp_degree: [{}], sharding_degree: [{}], pp_degree: [{}], dp_degree: [{}]; BUT global nrank: [{}]".format( + self.mp_degree, self.sharding_degree, self.pp_degree, + self.dp_degree, self.global_word_size) if self.dp_degree > 1: self.dp_ring_id = 2 - self.dp_rank = self.global_rank // (self.sharding_degree * - self.mp_degree * self.pp_degree) + self.dp_rank = self.global_rank // ( + self.sharding_degree * self.mp_degree * local_pp_degree) dp_first_rank_idx = self.global_rank % ( - self.sharding_degree * self.mp_degree * self.pp_degree) - dp_offset = (self.sharding_degree * self.mp_degree * self.pp_degree) + self.sharding_degree * self.mp_degree * local_pp_degree) + dp_offset = (self.sharding_degree * self.mp_degree * + local_pp_degree) self.dp_group_endpoints = [] for i in range(self.dp_degree): self.dp_group_endpoints.append(self.global_endpoints[ diff --git a/python/paddle/distributed/passes/auto_parallel_recompute.py b/python/paddle/distributed/passes/auto_parallel_recompute.py index 185fb453412ea..258f46304d189 100644 --- a/python/paddle/distributed/passes/auto_parallel_recompute.py +++ b/python/paddle/distributed/passes/auto_parallel_recompute.py @@ -350,11 +350,12 @@ def _apply_single_impl(self, main_programs, startup_programs, context): for _, op_desc in reversed(list(enumerate(segment_descs))): rc_desc = main_block.desc._insert_op(idx) rc_desc.copy_from(op_desc) + rc_desc.set_original_id(rc_desc.id()) rc_op = Operator(main_block, rc_desc) main_block.ops.insert(idx, rc_op) # set recomputed ops' dist attr fwd_op_dist_attr = self._dist_context.get_op_dist_attr_for_program_with_id( - rc_desc.original_id()) + op_desc.original_id()) assert fwd_op_dist_attr is not None self.set_op_dist_attr(rc_op, fwd_op_dist_attr, var_name_dict) diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index 9d9fbd39a5767..e8a9300635e2c 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -426,6 +426,7 @@ def _optimize_fp32_graph(self, graph): graph = self._apply_pass(graph, 'depthwise_conv_mkldnn_pass') graph = self._apply_pass(graph, 'conv_bn_fuse_pass') graph = self._apply_pass(graph, 'conv_eltwiseadd_bn_fuse_pass') + graph = self._apply_pass(graph, 'conv_affine_channel_mkldnn_fuse_pass') graph = self._apply_pass(graph, 'conv_transpose_bn_fuse_pass') graph = self._apply_pass(graph, 'conv_transpose_eltwiseadd_bn_fuse_pass') diff --git 
a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 7c7f101286e24..8b25c93d7ce08 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -1846,8 +1846,7 @@ def get_tensor(var): @static_only def save(program, model_path, protocol=4, **configs): """ - :api_attr: Static Graph - + This function save parameters, optimizer information and network description to model_path. The parameters contains all the trainable Tensor, will save to a file with suffix ".pdparams". diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index bb14fb9a86f15..49fb5399d8aec 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -6005,7 +6005,14 @@ def device_cmp(device1, device2): for p in program_list: self._create_vars(p.global_block(), main_block) - self.local_rank %= len(device_list) + if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None): + self.local_rank = int(os.getenv("PADDLE_MANUAL_PIPELINE_STAGE")) + assert self.local_rank < len(device_list), ( + "Manually specified " + "pipeline stage must be less than total number of pipeline " + "stages.") + else: + self.local_rank %= len(device_list) # Step3.5: optimize forward send sync_comm to overlap send and recv self._optimize_forward_send_sync(program_list[self.local_rank]) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 08e24f86a29a4..0b53046d056ee 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1148,7 +1148,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120) - set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 120) + set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 200) set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 200) set_tests_properties(test_dygraph_group_sharded_api PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 4d052f7e90cd3..7c747338593a3 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -3,18 +3,23 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_auto_parallel_relaunch MODULES test_auto_parallel_relaunch ENVS ${dist_ENVS}) set_tests_properties(test_auto_parallel_relaunch PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120) + py_test_modules(test_relaunch_with_planner MODULES test_relaunch_with_planner ENVS ${dist_ENVS}) set_tests_properties(test_relaunch_with_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120) + py_test_modules(test_relaunch_with_gpt_planner MODULES test_relaunch_with_gpt_planner ENVS ${dist_ENVS}) set_tests_properties(test_relaunch_with_gpt_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 240) + py_test_modules(test_engine_api MODULES test_engine_api ENVS ${dist_ENVS}) set_tests_properties(test_engine_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80) - py_test_modules(test_while_op_completion MODULES test_while_op_completion ENVS ${dist_ENVS}) + 
py_test_modules(test_converter MODULES test_converter ENVS ${dist_ENVS}) set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) py_test_modules(test_high_order_grad MODULES test_high_order_grad ENVS ${dist_ENVS}) set_tests_properties(test_high_order_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) + py_test_modules(test_while_op_completion MODULES test_while_op_completion ENVS ${dist_ENVS}) + py_test_modules(test_while_op_partition MODULES test_while_op_partition ENVS ${dist_ENVS}) py_test_modules(test_tunable_variable MODULES test_tunable_variable ENVS ${dist_ENVS}) py_test_modules(test_tunable_space MODULES test_tunable_space ENVS ${dist_ENVS}) py_test_modules(test_recorder MODULES test_recorder ENVS ${dist_ENVS}) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py b/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py index 9a9efe7ab2dd0..3f8283866768e 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py @@ -23,6 +23,9 @@ from paddle.incubate.autograd import Hessian from paddle.distributed.auto_parallel.engine import Engine +np.random.seed(1234) +paddle.seed(1234) + class FCNet: def __init__(self, num_ins, num_outs, num_layers, hidden_size): @@ -136,10 +139,8 @@ def main(): inputs_spec=inputs_spec, labels_spec=labels_spec, strategy=dist_strategy) - paddle.seed(1234 + engine._cur_rank) engine.prepare(optimizer=optimizer, loss=loss_func) res = engine.fit(train_dataset, sample_generator=False) - assert np.allclose(res[-1], 2.840593) dist_context = engine.dist_context block = engine.main_program.global_block() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_reshape.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_reshape.py index f170dbc9095f2..8777bf3ff1f2e 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_reshape.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_reshape.py @@ -66,7 +66,6 @@ def test_dist_reshape_mp2(self): for rank in range(2): dist_main_prog, dist_context = parallelizer(make_program_dp2, rank) ops = dist_main_prog.global_block().ops - print_program_with_dist_attr(dist_main_prog, dist_context) for idx, op in enumerate(ops): op_dist_attr = dist_context.get_op_dist_attr_for_program(op) assert op_dist_attr.impl_type == "reshape2" diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py index 6cf4621dbb0ce..aa0bf719fab29 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py @@ -15,6 +15,7 @@ import unittest import paddle import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr paddle.enable_static() @@ -78,21 +79,15 @@ def parallelizer(program_func, rank): class TestDistSlice(unittest.TestCase): def test_dist_slice_dp2(self): - for rank in range(2): dist_main_prog, dist_context = parallelizer(make_program_dp2, rank) ops = dist_main_prog.global_block().ops for op in ops: axes = op.desc.attr('axes') op_dist_attr = dist_context.get_op_dist_attr_for_program(op) - if axes[0] == 0: - assert op_dist_attr.impl_type == "default" - else: - assert op_dist_attr.impl_type == "slice" - for out in op.output_arg_names: - var_dims_mapping = 
op_dist_attr.get_output_dims_mapping( - out) - assert var_dims_mapping[0] == 0 + assert op_dist_attr.impl_type == "slice" + for out in op.output_arg_names: + var_dims_mapping = op_dist_attr.get_output_dims_mapping(out) def test_dist_slice_serial(self): dist_main_prog, dist_context = parallelizer(make_program_serial, 0) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py index 0cd3041ea4d25..6d6fbfe78e9e6 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py @@ -18,7 +18,7 @@ import paddle.distributed.auto_parallel.cost as cost_model from paddle.distributed.auto_parallel.cost.base_cost import parse_to_desc from paddle.distributed.auto_parallel.cost.base_cost import parse_desc_to_str -from paddle.distributed.auto_parallel.cost.base_cost import calc_time_from_model +from paddle.distributed.auto_parallel.cost.base_cost import calc_time_by_modeling paddle.enable_static() @@ -45,13 +45,13 @@ def test_comp_cost(self): if op.type == "matmul_v2": matmul_v2_op = op break - matmul_v2_cost = cost_model.OP_COST_FACTORY["matmul_v2"]( + matmul_v2_cost = cost_model._g_op_cost_factory["matmul_v2"]( op=matmul_v2_op) desc = parse_to_desc(op=matmul_v2_op) desc_str = parse_desc_to_str(desc) self.assertIsNotNone(desc_str) self.assertTrue(check_cost(matmul_v2_cost.cost)) - time = calc_time_from_model(op=matmul_v2_op) + time = calc_time_by_modeling(op=matmul_v2_op) self.assertEqual(time, matmul_v2_cost.cost.time) tensor_cost = cost_model.TensorCost(tensor=x) # check memory @@ -61,7 +61,8 @@ def test_comm_cost(self): desc = {} desc["op"] = "c_allreduce_sum" desc["inputs"] = {"X": [([100, 200], paddle.float32)]} - allreduce_cost = cost_model.OP_COST_FACTORY["c_allreduce_sum"]( + desc["group_ranks"] = [0, 1] + allreduce_cost = cost_model._g_op_cost_factory["c_allreduce_sum"]( op_desc=desc) self.assertTrue(check_cost(allreduce_cost.cost)) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py similarity index 60% rename from python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py rename to python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py index 07e6a2c4346da..894bed7108a1d 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py @@ -23,12 +23,13 @@ import paddle.distributed.auto_parallel as auto from paddle.distributed import fleet - +from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.partitioner import Partitioner from paddle.distributed.auto_parallel.utils import make_data_unshard from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute from paddle.distributed.auto_parallel.dist_context import DistributedContext, get_default_distributed_context from paddle.distributed.auto_parallel.operators import find_best_compatible_distributed_operator_impl +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr paddle.enable_static() @@ -283,139 +284,143 @@ def get_program(): def completion(train_program, start_program, dist_context): - blocks = train_program.blocks - # completion tensors - 
for block in blocks: - for op in block.ops: - if op.type == "layer_norm": - for out_name in op.output_arg_names: - out_var = block.vars[out_name] - tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program( - out_var) - if tensor_dist_attr: - continue - tensor_dist_attr = TensorDistributedAttribute() - tensor_dist_attr.process_mesh = _g_process_mesh - tensor_dist_attr.dims_mapping = [-1] - dist_context.set_tensor_dist_attr_for_program( - out_var, tensor_dist_attr) - - elif op.type == "elementwise_sub": - for out_name in op.output_arg_names: - out_var = block.vars[out_name] - tensor_dist_attr = TensorDistributedAttribute() - tensor_dist_attr.process_mesh = _g_process_mesh - tensor_dist_attr.dims_mapping = [-1, -1, -1] - dist_context.set_tensor_dist_attr_for_program( - out_var, tensor_dist_attr) - - elif op.type == "matmul_v2": - col = False - for in_name in op.input_arg_names: - if ".w_" not in in_name: - continue - if in_name not in block.vars: - in_var = blocks[0].vars[in_name] - else: - in_var = block.vars[in_name] - tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program( - in_var) - assert tensor_dist_attr is not None - if tensor_dist_attr.dims_mapping == [-1, 0]: - col = True - for out_name in op.output_arg_names: - out_var = block.vars[out_name] - tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program( - out_var) - if tensor_dist_attr: - continue - tensor_dist_attr = TensorDistributedAttribute() - tensor_dist_attr.process_mesh = _g_process_mesh - if col: - tensor_dist_attr.dims_mapping = [-1, -1, 0] - else: - tensor_dist_attr.dims_mapping = [-1, -1, -1] - dist_context.set_tensor_dist_attr_for_program( - out_var, tensor_dist_attr) - elif op.type == "while": - out_name = op.desc.output("StepScopes")[0] - out_var = block.vars[out_name] - tensor_dist_attr = TensorDistributedAttribute() - tensor_dist_attr.process_mesh = _g_process_mesh - tensor_dist_attr.dims_mapping = [-1] - dist_context.set_tensor_dist_attr_for_program(out_var, - tensor_dist_attr) - - # completion ops - for block in blocks: - for op in block.ops: - op_dist_attr = OperatorDistributedAttribute() - op_dist_attr.process_mesh = _g_process_mesh - if op.type == "create_by_read" or op.type == "create_double_buffer_reader": - for in_name in op.input_arg_names: - op_dist_attr.set_input_dims_mapping(in_name, []) - for out_name in op.output_arg_names: - op_dist_attr.set_output_dims_mapping(out_name, []) - elif op.type == "read": - for in_name in op.input_arg_names: - op_dist_attr.set_output_dims_mapping(in_name, []) - for out_name in op.output_arg_names: - out_var = block.vars[out_name] - out_dist_attr = dist_context.get_tensor_dist_attr_for_program( - out_var) - op_dist_attr.set_output_dist_attr(out_name, out_dist_attr) - elif op.type == "while": - for in_name in op.input_arg_names: - in_var = block.vars[in_name] - in_dist_attr = dist_context.get_tensor_dist_attr_for_program( - in_var) - op_dist_attr.set_input_dist_attr(in_name, in_dist_attr) - for out_name in op.output_arg_names: - if out_name == op.desc.output("StepScopes")[0]: - op_dist_attr.set_output_dims_mapping(out_name, []) - else: - out_var = block.vars[out_name] - out_dist_attr = dist_context.get_tensor_dist_attr_for_program( - out_var) - op_dist_attr.set_output_dist_attr(out_name, - out_dist_attr) - else: - for in_name in op.input_arg_names: - if in_name == "lod_tensor_blocking_queue_0": - continue - if in_name not in block.vars: - in_var = blocks[0].vars[in_name] - else: - in_var = block.vars[in_name] - in_dist_attr = 
dist_context.get_tensor_dist_attr_for_program( - in_var) - op_dist_attr.set_input_dist_attr(in_name, in_dist_attr) - for out_name in op.output_arg_names: - if out_name not in block.vars: - out_var = blocks[0].vars[out_name] - else: - out_var = block.vars[out_name] - out_dist_attr = dist_context.get_tensor_dist_attr_for_program( - out_var) - op_dist_attr.set_output_dist_attr(out_name, out_dist_attr) - - if op.type == "matmul_v2": - op_dist_attr.impl_type = "matmul_v2" - for in_name in op_dist_attr.inputs_dist_attrs.keys(): - in_dist_attr = op_dist_attr.inputs_dist_attrs[in_name] - if ".w_" in in_name and in_dist_attr.dims_mapping[-1] == 0: - op_dist_attr.impl_idx = 0 - else: - op_dist_attr.impl_idx = 1 - elif op.type == "fill_constant_batch_size_like": - op_dist_attr.impl_type = "fill_constant_batch_size_like" - op_dist_attr.impl_idx = 0 - else: - op_dist_attr.impl_type = "default" - op_dist_attr.impl_idx = 0 - - dist_context.set_op_dist_attr_for_program(op, op_dist_attr) - make_data_unshard(train_program, start_program, dist_context) + # blocks = train_program.blocks + # # completion tensors + # for block in blocks: + # for op in block.ops: + # if op.type == "layer_norm": + # for out_name in op.output_arg_names: + # out_var = block.vars[out_name] + # tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program( + # out_var) + # if tensor_dist_attr: + # continue + # tensor_dist_attr = TensorDistributedAttribute() + # tensor_dist_attr.process_mesh = _g_process_mesh + # tensor_dist_attr.dims_mapping = [-1] + # dist_context.set_tensor_dist_attr_for_program( + # out_var, tensor_dist_attr) + + # elif op.type == "elementwise_sub": + # for out_name in op.output_arg_names: + # out_var = block.vars[out_name] + # tensor_dist_attr = TensorDistributedAttribute() + # tensor_dist_attr.process_mesh = _g_process_mesh + # tensor_dist_attr.dims_mapping = [-1, -1, -1] + # dist_context.set_tensor_dist_attr_for_program( + # out_var, tensor_dist_attr) + + # elif op.type == "matmul_v2": + # col = False + # for in_name in op.input_arg_names: + # if ".w_" not in in_name: + # continue + # if in_name not in block.vars: + # in_var = blocks[0].vars[in_name] + # else: + # in_var = block.vars[in_name] + # tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program( + # in_var) + # assert tensor_dist_attr is not None + # if tensor_dist_attr.dims_mapping == [-1, 0]: + # col = True + # for out_name in op.output_arg_names: + # out_var = block.vars[out_name] + # tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program( + # out_var) + # if tensor_dist_attr: + # continue + # tensor_dist_attr = TensorDistributedAttribute() + # tensor_dist_attr.process_mesh = _g_process_mesh + # if col: + # tensor_dist_attr.dims_mapping = [-1, -1, 0] + # else: + # tensor_dist_attr.dims_mapping = [-1, -1, -1] + # dist_context.set_tensor_dist_attr_for_program( + # out_var, tensor_dist_attr) + # elif op.type == "while": + # out_name = op.desc.output("StepScopes")[0] + # out_var = block.vars[out_name] + # tensor_dist_attr = TensorDistributedAttribute() + # tensor_dist_attr.process_mesh = _g_process_mesh + # tensor_dist_attr.dims_mapping = [-1] + # dist_context.set_tensor_dist_attr_for_program(out_var, + # tensor_dist_attr) + + # # completion ops + # for block in blocks: + # for op in block.ops: + # op_dist_attr = OperatorDistributedAttribute() + # op_dist_attr.process_mesh = _g_process_mesh + # if op.type == "create_by_read" or op.type == "create_double_buffer_reader": + # for in_name in op.input_arg_names: + # 
op_dist_attr.set_input_dims_mapping(in_name, []) + # for out_name in op.output_arg_names: + # op_dist_attr.set_output_dims_mapping(out_name, []) + # elif op.type == "read": + # for in_name in op.input_arg_names: + # op_dist_attr.set_output_dims_mapping(in_name, []) + # for out_name in op.output_arg_names: + # out_var = block.vars[out_name] + # out_dist_attr = dist_context.get_tensor_dist_attr_for_program( + # out_var) + # op_dist_attr.set_output_dist_attr(out_name, out_dist_attr) + # elif op.type == "while": + # for in_name in op.input_arg_names: + # in_var = block.vars[in_name] + # in_dist_attr = dist_context.get_tensor_dist_attr_for_program( + # in_var) + # op_dist_attr.set_input_dist_attr(in_name, in_dist_attr) + # for out_name in op.output_arg_names: + # if out_name == op.desc.output("StepScopes")[0]: + # op_dist_attr.set_output_dims_mapping(out_name, []) + # else: + # out_var = block.vars[out_name] + # out_dist_attr = dist_context.get_tensor_dist_attr_for_program( + # out_var) + # op_dist_attr.set_output_dist_attr(out_name, + # out_dist_attr) + # else: + # for in_name in op.input_arg_names: + # if in_name == "lod_tensor_blocking_queue_0": + # continue + # if in_name not in block.vars: + # in_var = blocks[0].vars[in_name] + # else: + # in_var = block.vars[in_name] + # in_dist_attr = dist_context.get_tensor_dist_attr_for_program( + # in_var) + # op_dist_attr.set_input_dist_attr(in_name, in_dist_attr) + # for out_name in op.output_arg_names: + # if out_name not in block.vars: + # out_var = blocks[0].vars[out_name] + # else: + # out_var = block.vars[out_name] + # out_dist_attr = dist_context.get_tensor_dist_attr_for_program( + # out_var) + # op_dist_attr.set_output_dist_attr(out_name, out_dist_attr) + + # if op.type == "matmul_v2": + # op_dist_attr.impl_type = "matmul_v2" + # for in_name in op_dist_attr.inputs_dist_attrs.keys(): + # in_dist_attr = op_dist_attr.inputs_dist_attrs[in_name] + # if ".w_" in in_name and in_dist_attr.dims_mapping[-1] == 0: + # op_dist_attr.impl_idx = 0 + # else: + # op_dist_attr.impl_idx = 1 + # elif op.type == "fill_constant_batch_size_like": + # op_dist_attr.impl_type = "fill_constant_batch_size_like" + # op_dist_attr.impl_idx = 0 + # else: + # op_dist_attr.impl_type = "default" + # op_dist_attr.impl_idx = 0 + + # dist_context.set_op_dist_attr_for_program(op, op_dist_attr) + # make_data_unshard(train_program, start_program, dist_context) + + completer = Completer(dist_context) + train_program = completer.complete_forward_annotation(train_program) + make_data_unshard(train_program, start_program, dist_context) return train_program, start_program diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py b/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py index 036b46470a762..3ddd41158a69e 100755 --- a/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py @@ -134,7 +134,6 @@ def test_mlp_serial(self): for op in block.ops: for attr_name in op.attr_names: self.assertTrue(suffix not in attr_name) - # print_program_with_dist_attr(distributed_main_program) self.assertIsNotNone(distributed_startup_program) self.assertIsNotNone(distributed_main_program) diff --git a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py index 574a222ba18c9..a1a853f006c0d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py +++ 
b/python/paddle/fluid/tests/unittests/dygraph_group_sharded_api.py @@ -32,7 +32,6 @@ momentum_rate = 0.9 l2_decay = 1e-4 batch_size = 100 -fleet.init(is_collective=True) class MLP(fluid.Layer): @@ -147,4 +146,5 @@ def test_sharding_api(): if __name__ == '__main__': with _test_eager_guard(): pass + fleet.init(is_collective=True) test_sharding_api() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py index 82edd1c17a541..58432540d1b82 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py @@ -42,7 +42,6 @@ "pp_degree": 1, "sharding_degree": 1 } -fleet.init(is_collective=True, strategy=strategy) np.random.seed(seed) paddle.seed(seed) @@ -225,4 +224,5 @@ def test_dp_stage2(): if __name__ == '__main__': with _test_eager_guard(): pass + fleet.init(is_collective=True, strategy=strategy) test_dp_stage2() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py index a7b16bbb75977..cd2d7b3f12765 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py @@ -36,6 +36,14 @@ batch_size = 32 linear_size = 1000 +strategy = fleet.DistributedStrategy() +strategy.hybrid_configs = { + "dp_degree": 2, + "mp_degree": 1, + "pp_degree": 1, + "sharding_degree": 1 +} + np.random.seed(seed) paddle.seed(seed) @@ -109,4 +117,5 @@ def test_sharding_stage2_offload(): if __name__ == '__main__': with _test_eager_guard(): pass + fleet.init(is_collective=True, strategy=strategy) test_sharding_stage2_offload() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py index cdb1de020f56e..fc4002ef405bd 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py @@ -39,7 +39,6 @@ base_lr = 0.1 momentum_rate = 0.9 l2_decay = 1e-4 -fleet.init(is_collective=True) class MLP(fluid.Layer): @@ -277,4 +276,5 @@ def test_stage2_stage3(): if __name__ == '__main__': with _test_eager_guard(): pass + fleet.init(is_collective=True) test_stage2_stage3() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py index 2cb327a29a3da..763a7a8b97fdd 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py @@ -34,7 +34,6 @@ base_lr = 0.1 momentum_rate = 0.9 l2_decay = 1e-4 -fleet.init(is_collective=True) class MLP(fluid.Layer): @@ -199,4 +198,5 @@ def test_stage3_offload(): if __name__ == '__main__': with _test_eager_guard(): pass + fleet.init(is_collective=True) test_stage3_offload() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_spec_names.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_spec_names.py index bafc4707c4ad9..361fcbf9c73f5 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_spec_names.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_spec_names.py @@ -16,8 +16,6 @@ from paddle.nn import Layer import numpy as np import unittest -from paddle.fluid.framework import _enable_legacy_dygraph -_enable_legacy_dygraph() class 
Net(Layer): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py index 4d412f2a79977..d7b15a442957d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py @@ -85,23 +85,6 @@ def set_op_attrs(self): self.attrs['dtype'] = 'float32' -class TestDisableTransferCast(TestEnableFp16): - def set_data_feed(self): - data = np.random.uniform(size=[1, 3, 3, 3]) - self.feed_fp32 = {'x': data.astype(np.float32)} - self.feed_fp16 = {'x': data.astype(np.float16)} - - def set_op_attrs(self): - self.attrs = {} - self.attrs['dtype'] = 'float32' - - def run_model(self, exec_mode): - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(is_training=self.is_training) - ipu_strategy.set_options({"transfer_cast_op": False}) - self.run_op_test(exec_mode) - - class TestCase2(TestBase): def set_data_feed(self): self.feed_fp32 = { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index 661fbbc7759c6..4717dfa1eab52 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -143,5 +143,6 @@ if (WITH_MKLDNN) set_tests_properties(test_mkldnn_conv_mish_fuse_pass PROPERTIES TIMEOUT 300) set_tests_properties(test_mkldnn_fc_mish_fuse_pass PROPERTIES TIMEOUT 300) set_tests_properties(test_mkldnn_fc_elementwise_add_fuse_pass PROPERTIES TIMEOUT 120) + set_tests_properties(test_mkldnn_conv_affine_channel_fuse_pass PROPERTIES TIMEOUT 60) endif() endif() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py new file mode 100644 index 0000000000000..a35b75e69f812 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_affine_channel_fuse_pass.py @@ -0,0 +1,158 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from auto_scan_test import PassAutoScanTest, IgnoreReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume, reproduce_failure +import hypothesis.strategies as st + + +class TestConvAffineChannelFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_config(self, draw): + padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) + groups = draw(st.integers(min_value=1, max_value=3)) + data_format = draw(st.sampled_from(["NCHW", "NHWC"])) + axis = draw(st.sampled_from([1])) + filter_channel = draw(st.integers(min_value=1, max_value=16)) * 4 + filter_size = draw(st.integers(min_value=1, max_value=4)) + in_channel = groups * filter_channel + out_channel_factor = draw(st.integers(min_value=1, max_value=16)) * 4 + out_channel = groups * out_channel_factor + batch_size = draw(st.integers(min_value=1, max_value=4)) + dilations = draw( + st.lists( + st.integers( + min_value=1, max_value=2), min_size=2, max_size=2)) + paddings = draw( + st.lists( + st.integers( + min_value=0, max_value=2), min_size=2, max_size=2)) + strides = draw( + st.lists( + st.integers( + min_value=1, max_value=2), min_size=2, max_size=2)) + has_bias = draw(st.booleans()) + + x_shape = [ + batch_size, in_channel, 64, 64 + ] if data_format == "NCHW" else [batch_size, 64, 64, in_channel] + w_shape = [out_channel, filter_channel, filter_size, filter_size] + scale_shape = [out_channel] + bias_shape = [out_channel] + + def generate_input(): + return np.random.random(x_shape).astype(np.float32) + + def generate_weight(): + return np.random.random(w_shape).astype(np.float32) + + def generate_bias(): + return np.random.random(bias_shape).astype(np.float32) + + def generate_scale_bias(): + return np.random.random(bias_shape).astype(np.float32) + + conv2d_op = OpConfig( + "conv2d", + inputs={ + "Input": ["input_data"], + "Filter": ["conv2d_weight"], + }, + outputs={"Output": ["conv_output"]}, + data_format=data_format, + dilations=dilations, + padding_algorithm=padding_algorithm, + groups=groups, + paddings=paddings, + strides=strides, + has_bias=has_bias, + is_test=True) + ac_op = OpConfig( + "affine_channel", + inputs={ + "X": ["conv_output"], + "Scale": ["affine_channel_scale"], + "Bias": ["affine_channel_bias"] + }, + outputs={"Out": ["affine_channel_ouput"]}, + data_layout=data_format) + if has_bias == True: + conv2d_op.inputs["Bias"] = ["conv2d_bias"] + ops = [conv2d_op, ac_op] + + program_config = ProgramConfig( + ops=ops, + inputs={ + "input_data": TensorConfig(data_gen=partial(generate_input)), + }, + weights={ + "conv2d_weight": + TensorConfig(data_gen=partial(generate_weight)), + "conv2d_bias": TensorConfig(data_gen=partial(generate_bias)), + "affine_channel_scale": + TensorConfig(data_gen=partial(generate_scale_bias)), + "affine_channel_bias": + TensorConfig(data_gen=partial(generate_scale_bias)), + }, + outputs=["affine_channel_ouput"]) + if has_bias == True: + program_config.weights["conv2d_bias"] = TensorConfig( + data_gen=partial(generate_bias)) + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) + + def 
add_ignore_pass_case(self): + # If the problem has been fixed, the judgment + # in is_program_valid needs to be deleted!!! + def teller1(program_config, predictor_config): + if program_config.ops[0].attrs['data_format'] == "NHWC": + return True + return False + + # mkldnn Output has diff with bias! + def teller2(program_config, predictor_config): + return predictor_config.mkldnn_enabled() and program_config.ops[ + 0].attrs['has_bias'] == True + + self.add_ignore_check_case( + teller1, IgnoreReasons.PASS_ACCURACY_ERROR, + "The output format of conv2d is wrong when data_format attribute is NHWC, \ + because currently its fused op (Conv2DFusion) only supports data format of channel first (NCHW)." + ) + + self.add_ignore_check_case( + teller2, IgnoreReasons.PASS_ACCURACY_ERROR, + "Currently mkldnn Output has diff with bias!") + + def test(self): + self.run_and_statis( + quant=False, + passes=["conv_affine_channel_mkldnn_fuse_pass"], ) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fill_constant_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fill_constant_mkldnn_op.py new file mode 100644 index 0000000000000..d729efbb0fb60 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fill_constant_mkldnn_op.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
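The expected op list `['conv2d', 'elementwise_add']` in the new fuse-pass test above follows from a simple identity: a per-output-channel affine transform applied after a convolution can be folded into the filter, leaving only a bias addition behind. A small NumPy check of that algebra follows; shapes and names are illustrative and this is not the pass implementation.

import numpy as np

# affine_channel(conv(x, W), scale, bias) == conv(x, W * scale[:, None, None, None]) + bias
oc, ic, k = 8, 4, 3
x = np.random.rand(1, ic, 16, 16).astype(np.float32)
W = np.random.rand(oc, ic, k, k).astype(np.float32)
scale = np.random.rand(oc).astype(np.float32)
bias = np.random.rand(oc).astype(np.float32)

def conv2d_nchw(x, W):
    """Naive valid-padding, stride-1 NCHW convolution for the check."""
    n, c, h, w = x.shape
    oc, _, kh, kw = W.shape
    out = np.zeros((n, oc, h - kh + 1, w - kw + 1), dtype=x.dtype)
    for i in range(out.shape[2]):
        for j in range(out.shape[3]):
            patch = x[:, :, i:i + kh, j:j + kw]              # (n, ic, kh, kw)
            out[:, :, i, j] = np.tensordot(patch, W, axes=([1, 2, 3], [1, 2, 3]))
    return out

fused_W = W * scale[:, None, None, None]
lhs = conv2d_nchw(x, W) * scale[None, :, None, None] + bias[None, :, None, None]
rhs = conv2d_nchw(x, fused_W) + bias[None, :, None, None]
assert np.allclose(lhs, rhs, atol=1e-5)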
+ +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool +import paddle + + +@OpTestTool.skip_if_not_cpu_bf16() +class TestFillConstant2DOneDNNOp(OpTest): + def setUp(self): + self.op_type = "fill_constant" + self.dtype = np.float32 + + self.shape_tensor_list = None + self.shape_tensor = None + self.str_value = "" + real_shape = [] + self.value = 0.1 + + self.set_inputs() + self.set_attrs() + + if 'value' in self.attrs: + self.value = self.attrs['value'] + if self.str_value != "": + self.value = float(self.str_value) + if 'ValueTensor' in self.inputs: + self.value = self.inputs['ValueTensor'] + + if 'shape' in self.attrs: + real_shape = self.attrs['shape'] + if 'ShapeTensor' in self.inputs: + real_shape = list(self.inputs['ShapeTensor']) + if 'ShapeTensorList' in self.inputs: + real_shape = [] + for shape_tensor in self.inputs['ShapeTensorList']: + real_shape.append(shape_tensor[1].item()) + + self.outputs = {'Out': np.full(real_shape, self.value)} + + def set_inputs(self): + self.inputs = {} + + def set_attrs(self): + self.attrs = {'shape': (3, 5), 'use_mkldnn': True, 'value': self.value} + + def test_check_output(self): + self.check_output() + + +class TestFillZerosLike4DShapeTensorPriorityOneDNNOp( + TestFillConstant2DOneDNNOp): + def set_inputs(self): + self.inputs = {'ShapeTensor': np.array([5, 6, 7, 8]).astype("int32")} + + +class TestFillZerosLike4DShapeTensorListPriorityOneDNNOp( + TestFillConstant2DOneDNNOp): + def set_inputs(self): + shape = (4, 5, 6, 7) + self.shape_tensor_list = [] + for index, elem in enumerate(shape): + self.shape_tensor_list.append(("x" + str(index), np.ones( + (1)).astype('int32') * elem)) + + self.inputs = {'ShapeTensorList': self.shape_tensor_list} + + +class TestFillZerosLike2DStringValueInfOneDNNOp(TestFillConstant2DOneDNNOp): + def set_attrs(self): + self.str_value = "inf" + self.attrs = {'shape': (10, 13), 'use_mkldnn': True, 'str_value': "inf"} + + +class TestFillZerosLike2DStringValueMinusInfOneDNNOp( + TestFillConstant2DOneDNNOp): + def set_attrs(self): + self.str_value = "-inf" + self.attrs = { + 'shape': (10, 13), + 'use_mkldnn': True, + 'str_value': "-inf" + } + + +class TestFillZerosLike2DStringValueFloatOneDNNOp(TestFillConstant2DOneDNNOp): + def set_attrs(self): + self.str_value = "0.123" + self.attrs = { + 'shape': (10, 13), + 'use_mkldnn': True, + 'str_value': "0.123" + } + + +class TestFillZerosLike2DValueTensorPriorityOneDNNOp( + TestFillZerosLike2DStringValueFloatOneDNNOp): + def set_inputs(self): + self.inputs = {'ValueTensor': np.atleast_1d(2.25).astype("float32")} + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py index 25701b797ec4a..4e59e41b60851 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py @@ -67,6 +67,8 @@ def config(self): self.y_shape = (100, ) self.trans_x = False self.trans_y = False + self._cpu_only = True + self.use_mkldnn = True def set_inputs(self, x, y): self.inputs = {'X': x, 'Y': y} diff --git a/python/paddle/fluid/tests/unittests/mlu/test_adam_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_adam_op_mlu.py new file mode 100644 index 0000000000000..f30a391f65385 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/mlu/test_adam_op_mlu.py @@ -0,0 +1,303 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from test_adam_op import adam_step + +paddle.enable_static() +SEED = 2022 + + +class TestAdam(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32") + } + + self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} + + param_out, moment1_out, \ + moment2_out = adam_step(self.inputs, self.attrs) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestAdamWithEpsilonTensor(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + 'Beta1Tensor': np.array([beta1]).astype("float32"), + 'Beta2Tensor': np.array([beta2]).astype("float32"), + 'EpsilonTensor': np.array([epsilon]).astype("float32"), + } + + self.attrs = {'epsilon': epsilon} + + param_out, moment1_out, \ + moment2_out = adam_step(self.inputs, self.attrs) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': 
np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestAdamOpWithSkipUpdate(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + 'Beta1Tensor': np.array([beta1]).astype("float32"), + 'Beta2Tensor': np.array([beta2]).astype("float32"), + 'EpsilonTensor': np.array([epsilon]).astype("float32"), + "SkipUpdate": np.array([True]).astype("bool"), + } + + self.attrs = {'epsilon': epsilon} + + self.outputs = { + 'Moment1Out': moment1, + 'Moment2Out': moment2, + 'ParamOut': param, + 'Beta1PowOut': self.inputs['Beta1Pow'], + 'Beta2PowOut': self.inputs['Beta2Pow'], + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestAdamOpWithGlobalBetaPow(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + 'Beta1Tensor': np.array([beta1]).astype("float32"), + 'Beta2Tensor': np.array([beta2]).astype("float32"), + 'EpsilonTensor': np.array([epsilon]).astype("float32"), + } + + attributes = {'epsilon': epsilon} + + param_out, moment1_out, \ + moment2_out = adam_step(self.inputs, attributes) + + self.attrs = {'use_global_beta_pow': True} + + # use_global_beta_pow=True, Beta1PowOut and Beta2PowOut are empty. 
+ self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([]), + 'Beta2PowOut': np.array([]) + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestNet(unittest.TestCase): + def _test(self, run_mlu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.pow(sum, 2.0) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + adam = fluid.optimizer.Adam(learning_rate=0.01) + adam.minimize(loss) + + if run_mlu: + place = paddle.device.MLUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_mlu(self): + mlu_pred, mlu_loss = self._test(True) + cpu_pred, cpu_loss = self._test(False) + self.assertTrue(np.allclose(mlu_pred, cpu_pred, rtol=1e-3)) + self.assertTrue(np.allclose(mlu_loss, cpu_loss, rtol=1e-3)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_adamw_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_adamw_op_mlu.py new file mode 100644 index 0000000000000..d2827725a2058 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_adamw_op_mlu.py @@ -0,0 +1,250 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
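The Adam and AdamW MLU tests in this patch derive their expected outputs from the `adam_step` / `adamw_step` helpers imported from the existing CPU tests. For orientation only, a minimal NumPy sketch of one plain Adam step under the standard update rule is shown below; the helpers in `test_adam_op.py` / `test_adamw_op.py` remain the authoritative reference and may differ in detail.

# Hedged sketch of a single Adam step using the same field names the tests
# pass to adam_step. Assumes the standard Adam update rule; the repository
# helpers are authoritative.
import numpy as np


def adam_step_sketch(inputs, attrs):
    param, grad = inputs['Param'], inputs['Grad']
    moment1, moment2 = inputs['Moment1'], inputs['Moment2']
    lr = inputs['LearningRate']
    beta1_pow, beta2_pow = inputs['Beta1Pow'], inputs['Beta2Pow']
    # beta1/beta2 may instead arrive through Beta1Tensor/Beta2Tensor inputs.
    beta1 = attrs.get('beta1', 0.9)
    beta2 = attrs.get('beta2', 0.999)
    epsilon = attrs['epsilon']

    moment1_out = beta1 * moment1 + (1 - beta1) * grad
    moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
    param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon))
    # adamw_step additionally applies decoupled weight decay, roughly
    # param *= (1 - lr * coeff), before this update when with_decay is True.
    return param_out, moment1_out, moment2_out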
+ +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from test_adam_op import adamw_step + +paddle.enable_static() +SEED = 2022 + + +class TestAdamW(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adamw" + param = np.random.uniform(-1, 1, (105, 102)).astype("float32") + grad = np.random.uniform(-1, 1, (105, 102)).astype("float32") + moment1 = np.random.uniform(-1, 1, (105, 102)).astype("float32") + # The second moment is positive + moment2 = np.random.random((105, 102)).astype("float32") + + learning_rate = 0.5 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32") + } + + self.attrs = { + 'epsilon': epsilon, + 'beta1': beta1, + 'beta2': beta2, + "coeff": 0.9, + "with_decay": True + } + + param_out, moment1_out, \ + moment2_out = adamw_step(self.inputs, self.attrs) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestAdamOpWithSkipUpdate(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adamw" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + 'Beta1Tensor': np.array([beta1]).astype("float32"), + 'Beta2Tensor': np.array([beta2]).astype("float32"), + 'EpsilonTensor': np.array([epsilon]).astype("float32"), + "SkipUpdate": np.array([True]).astype("bool"), + } + + self.attrs = {'epsilon': epsilon, "coeff": 0.02, "with_decay": True} + + self.outputs = { + 'Moment1Out': moment1, + 'Moment2Out': moment2, + 'ParamOut': param, + 'Beta1PowOut': self.inputs['Beta1Pow'], + 'Beta2PowOut': self.inputs['Beta2Pow'], + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestAdamOpWithoutDecay(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "adamw" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = 
np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), + 'Beta1Tensor': np.array([beta1]).astype("float32"), + 'Beta2Tensor': np.array([beta2]).astype("float32"), + 'EpsilonTensor': np.array([epsilon]).astype("float32"), + "SkipUpdate": np.array([True]).astype("bool"), + } + + self.attrs = {'epsilon': epsilon, "coeff": 0.02, "with_decay": False} + + self.outputs = { + 'Moment1Out': moment1, + 'Moment2Out': moment2, + 'ParamOut': param, + 'Beta1PowOut': self.inputs['Beta1Pow'], + 'Beta2PowOut': self.inputs['Beta2Pow'], + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestNet(unittest.TestCase): + def _test(self, run_mlu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.pow(sum, 2.0) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + adam = paddle.optimizer.AdamW(learning_rate=0.01, weight_decay=0.02) + adam.minimize(loss) + + if run_mlu: + place = paddle.device.MLUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_mlu(self): + mlu_pred, mlu_loss = self._test(True) + cpu_pred, cpu_loss = self._test(False) + self.assertTrue(np.allclose(mlu_pred, cpu_pred, rtol=1e-3)) + self.assertTrue(np.allclose(mlu_loss, cpu_loss, rtol=1e-3)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_assign_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_assign_op_mlu.py new file mode 100644 index 0000000000000..85302ad76da8b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_assign_op_mlu.py @@ -0,0 +1,52 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle + +paddle.enable_static() +SEED = 2022 + + +class TestAssign(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "assign" + self.init_dtype() + + x = np.random.random([3, 3]).astype(self.dtype) + self.inputs = {'X': x} + + self.attrs = {} + self.outputs = {'Out': x} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_assign_value_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_assign_value_op_mlu.py new file mode 100644 index 0000000000000..5ee9d369e0fd9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_assign_value_op_mlu.py @@ -0,0 +1,77 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy +import sys +sys.path.append("..") + +import op_test +import paddle +import paddle.fluid as fluid +import paddle.fluid.framework as framework +import paddle.fluid.layers as layers + +paddle.enable_static() +numpy.random.seed(2022) + + +class TestAssignValueMLUOp(op_test.OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "assign_value" + self.inputs = {} + self.attrs = {} + self.init_data() + + self.attrs["shape"] = self.value.shape + self.attrs["dtype"] = framework.convert_np_dtype_to_dtype_( + self.value.dtype) + self.outputs = {"Out": self.value} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_data(self): + self.value = numpy.random.random(size=(2, 5)).astype(numpy.float32) + self.attrs["fp32_values"] = [float(v) for v in self.value.flat] + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestAssignValueMLUOp2(TestAssignValueMLUOp): + def init_data(self): + self.value = numpy.random.random(size=(2, 5)).astype(numpy.int32) + self.attrs["int32_values"] = [int(v) for v in self.value.flat] + + +class TestAssignValueMLUOp3(TestAssignValueMLUOp): + def init_data(self): + self.value = numpy.random.random(size=(2, 5)).astype(numpy.int64) + self.attrs["int64_values"] = [int(v) for v in self.value.flat] + + +class TestAssignValueMLUOp4(TestAssignValueMLUOp): + def init_data(self): + self.value = numpy.random.choice( + a=[False, True], size=(2, 5)).astype(numpy.bool) + self.attrs["bool_values"] = [int(v) for v in self.value.flat] + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_layer_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_layer_norm_op_mlu.py new file mode 100644 index 0000000000000..8b32692020cbf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_layer_norm_op_mlu.py @@ -0,0 +1,309 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
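The new MLU layer_norm test below checks forward and backward results against the NumPy references `_reference_layer_norm_naive` and `_reference_layer_norm_grad` imported from `test_layer_norm_op`. A rough sketch of such a naive forward reference, assuming the standard layer normalization formula (normalize over the axes from `begin_norm_axis` onward, then apply scale and bias), is given below; the imported helper remains the authoritative implementation.

# Hedged NumPy sketch of a naive layer_norm forward reference; illustrative
# only, the repository's _reference_layer_norm_naive is authoritative.
import numpy as np


def layer_norm_naive_sketch(x, scale, bias, epsilon, begin_norm_axis=1):
    x_shape = x.shape
    N = int(np.prod(x_shape[:begin_norm_axis]))
    D = int(np.prod(x_shape[begin_norm_axis:]))
    x2d = x.reshape(N, D)

    mean = np.mean(x2d, axis=1)           # per-row mean
    var = np.var(x2d, axis=1) + epsilon   # per-row variance plus epsilon
    y = (x2d - mean.reshape(N, 1)) / np.sqrt(var).reshape(N, 1)
    if scale is not None:
        y = y * scale.reshape(1, D)
    if bias is not None:
        y = y + bias.reshape(1, D)
    return y.reshape(x_shape), mean, var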
+ +from __future__ import print_function +import unittest +import numpy as np +import paddle + +from operator import mul +import paddle.fluid.core as core +import paddle.fluid as fluid +import paddle.nn.functional as F +from functools import reduce +import sys +sys.path.append('..') +from op_test import _set_use_system_allocator +from paddle.fluid import Program, program_guard +from paddle.fluid.contrib.mixed_precision.fp16_utils import _keep_layer_norm_scale_bias_to_fp32 +from test_layer_norm_op import _reference_layer_norm_naive, _reference_layer_norm_grad + +paddle.enable_static() + +np.random.random(123) + +_set_use_system_allocator(True) + + +class TestLayerNormOp(unittest.TestCase): + def setUp(self): + self.use_cudnn = True + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def __assert_close(self, tensor, np_array, msg, atol=1e-4): + self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg) + + def check_forward_backward(self, + shape, + begin_norm_axis, + has_scale=True, + has_bias=True, + y_grad_scale=1.0, + use_mkldnn=False): + def test_with_place(place, + shape, + begin_norm_axis, + use_mkldnn=use_mkldnn): + # attr + epsilon = 0.00001 + x_shape = shape + D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1) + scale_shape = [D] + + np.random.seed(123) + x = np.random.random_sample(x_shape).astype(np.float32) + scale = np.random.random_sample(scale_shape).astype( + np.float32) if has_scale else None + bias = np.random.random_sample(scale_shape).astype( + np.float32) if has_bias else None + y_grad = (np.random.random_sample(x_shape) * + y_grad_scale).astype(np.float32) + + # reference forward & backward + y, mean, variance = _reference_layer_norm_naive( + x, scale, bias, epsilon, begin_norm_axis) + x_grad, scale_grad, bias_grad = _reference_layer_norm_grad( + x, y_grad, scale, bias, mean, variance, begin_norm_axis) + + var_dict = locals() + var_dict['y@GRAD'] = y_grad + var_names = ['x', 'mean', 'variance', 'y', 'y@GRAD'] + if has_scale: + var_names += ['scale'] + if has_bias: + var_names += ['bias'] + ground_truth = {name: var_dict[name] for name in var_names} + + program = fluid.Program() + with fluid.program_guard(program): + block = program.global_block() + for name in ground_truth: + block.create_var( + name=name, + dtype='float32', + shape=ground_truth[name].shape) + inputs = {"X": block.var('x')} + fetch_list = [ + 'y', + 'mean', + 'variance', + 'x@GRAD', + ] + if has_scale: + inputs["Scale"] = block.var('scale') + fetch_list += ['scale@GRAD'] + if has_bias: + inputs["Bias"] = block.var('bias') + fetch_list += ['bias@GRAD'] + layer_norm_op = block.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": block.var('y'), + "Mean": block.var('mean'), # share the same memory + "Variance": + block.var('variance'), # share the same memory + }, + attrs={ + "epsilon": epsilon, + "begin_norm_axis": begin_norm_axis, + "use_mkldnn": use_mkldnn + }) + # generate backward op_desc + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( + layer_norm_op.desc, set(), []) + grad_op_desc = grad_op_desc_list[0] + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(grad_op_desc) + for var_name in grad_op_desc.output_arg_names(): + block.desc.var(var_name.encode("ascii")) + grad_op_desc.infer_var_type(block.desc) + grad_op_desc.infer_shape(block.desc) + for arg in grad_op_desc.output_arg_names(): + grad_var = block.desc.find_var(arg.encode("ascii")) + grad_var.set_dtype(core.VarDesc.VarType.FP32) + + program._sync_with_cpp() + 
exe = fluid.Executor(place) + out = exe.run(program, + feed={ + name: var_dict[name] + for name in ['x', 'scale', 'bias', 'y@GRAD'] + }, + fetch_list=fetch_list) + + self.__assert_close(y, out[0], "y") + self.__assert_close(mean, out[1], "mean") + self.__assert_close(1 / np.sqrt(variance), out[2], "variance", + 1e-3) + self.__assert_close(x_grad, out[3], "x_grad") + if has_scale: + self.__assert_close(scale_grad, + out[fetch_list.index('scale@GRAD')], + "scale_grad", 1e-3) + if has_bias: + self.__assert_close(bias_grad, + out[fetch_list.index('bias@GRAD')], + "bias_grad") + + test_with_place(self.place, shape, begin_norm_axis) + + def test_check_forward_backward_with_scale_and_bias(self): + self.check_forward_backward(shape=[1, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward( + shape=[2, 3, 4, 5], + begin_norm_axis=1, + has_scale=False, + has_bias=True) + self.check_forward_backward( + shape=[2, 3, 4, 5], + begin_norm_axis=1, + has_scale=True, + has_bias=False) + self.check_forward_backward( + shape=[2, 3, 4, 5], + begin_norm_axis=1, + has_scale=False, + has_bias=False) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3) + self.check_forward_backward( + shape=[92, 513, 129], begin_norm_axis=2, y_grad_scale=0.1) + self.check_forward_backward(shape=[3, 34, 1134], begin_norm_axis=2) + self.check_forward_backward( + shape=[92, 513, 1134], begin_norm_axis=2, y_grad_scale=0.1) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=False, + has_bias=True, + y_grad_scale=0.1) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=True, + has_bias=False, + y_grad_scale=0.1) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=False, + has_bias=False, + y_grad_scale=0.1) + self.check_forward_backward( + shape=[512, 1024], begin_norm_axis=1, has_scale=True, has_bias=True) + + +class TestLayerNormAPI(unittest.TestCase): + def test_case(self): + x = fluid.layers.data( + name='x', + shape=[64, 32, 256], + dtype='float32', + append_batch_size=False) + x = fluid.layers.layer_norm( + x, + scale=True, + shift=True, + begin_norm_axis=1, + epsilon=1e-05, + param_attr=None, + bias_attr=None) + x = fluid.layers.layer_norm( + x, + scale=False, + shift=False, + begin_norm_axis=1, + epsilon=1e-05, + param_attr=None, + bias_attr=None) + x = fluid.layers.layer_norm( + x, + scale=False, + shift=False, + begin_norm_axis=1, + epsilon=1e-05, + param_attr="scale", + bias_attr="shift") + + +class TestDygraphLayerNormAPIError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + paddle.enable_static() + + layer_norm = fluid.LayerNorm([32, 32]) + # the input of LayerNorm must be Variable. 
+ x1 = np.random.random((3, 32, 32)).astype('float32') + self.assertRaises(TypeError, layer_norm, x1) + + # the input dtype of LayerNorm must be float32 or float16 + x2 = fluid.layers.data(name='x2', shape=[3, 32, 32], dtype="int32") + self.assertRaises(TypeError, layer_norm, x2) + + +class TestFP16ScaleBiasLayerNorm(unittest.TestCase): + def check_main(self, x_np, weight_np, bias_np, dtype): + paddle.disable_static() + + weight_np = weight_np.astype(dtype) + bias_np = bias_np.astype(dtype) + + x = paddle.to_tensor(x_np) + weight = paddle.to_tensor(weight_np) + bias = paddle.to_tensor(bias_np) + x.stop_gradient = False + weight.stop_gradient = False + bias.stop_gradient = False + y = F.layer_norm(x, x.shape[1:], weight, bias) + x_g, w_g, b_g = paddle.grad(y, [x, weight, bias]) + y_np = y.numpy().astype('float32') + x_g_np = x_g.numpy().astype('float32') + w_g_np = w_g.numpy().astype('float16') + b_g_np = b_g.numpy().astype('float32') + + paddle.enable_static() + return y_np, x_g_np, w_g_np, b_g_np + + def test_main(self): + x_np = np.random.random([10, 20]).astype('float16') + weight_np = np.random.random([20]).astype('float16') + bias_np = np.random.random([20]).astype('float16') + + y_np_1, x_g_np_1, w_g_np_1, b_g_np_1 = self.check_main( + x_np, weight_np, bias_np, 'float16') + y_np_2, x_g_np_2, w_g_np_2, b_g_np_2 = self.check_main( + x_np, weight_np, bias_np, 'float32') + + def assert_equal(x, y): + self.assertTrue(np.array_equal(x, y)) + + assert_equal(y_np_1, y_np_2) + assert_equal(x_g_np_1, x_g_np_2) + assert_equal(w_g_np_1, w_g_np_2) + assert_equal(b_g_np_1, b_g_np_2) + + +class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase): + def test_main(self): + self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) + _keep_layer_norm_scale_bias_to_fp32(False) + self.assertFalse(_keep_layer_norm_scale_bias_to_fp32()) + _keep_layer_norm_scale_bias_to_fp32(True) + self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index 570551e82646f..955f2117778f0 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -135,6 +135,34 @@ def test_grad(self): self.func(p) +class TestAbsDoubleGradCheck(unittest.TestCase): + def abs_wrapper(self, x): + return paddle.abs(x[0]) + + @prog_scope() + def func(self, place): + shape = [2, 3, 7, 9] + eps = 0.0005 + dtype = np.float64 + x = layers.data('x', shape, False, dtype=dtype) + x.persistable = True + y = paddle.abs(x) + x_arr = np.random.uniform(-1, 1, shape).astype(dtype) + x_arr[np.abs(x_arr) < 0.005] = 0.002 + gradient_checker.double_grad_check( + [x], y, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.abs_wrapper, [x], y, x_init=x_arr, place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + class TestReluDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index d2eef785f6e07..3e2f112e964bb 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -54,8 +54,8 @@ def 
adamw_step(inputs, attributes): moment1_out = beta1 * moment1 + (1 - beta1) * grad moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad) - lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) - param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon)) + denom = (np.sqrt(moment2_out) / np.sqrt(1.0 - beta2_pow)) + epsilon + param_out = param + ((moment1_out / denom) * (-(lr / (1.0 - beta1_pow)))) return param_out, moment1_out, moment2_out @@ -314,16 +314,16 @@ def simple_lr_setting(param, decay_rate, n_layers): "core is not compiled with CUDA") class TestAdamWOpLayerwiseLR(TestAdamWOp): def setUp(self): - random.seed(2021) - np.random.seed(2021) - paddle.seed(2021) + random.seed(2022) + np.random.seed(2022) + paddle.seed(2022) def test_adamw_op_dygraph(self): paddle.disable_static() - value = np.arange(26).reshape(2, 13).astype("float32") - a = paddle.to_tensor(value) - linear1 = paddle.nn.Linear(13, 8) - linear2 = paddle.nn.Linear(8, 5) + linear1 = paddle.nn.Linear( + 13, 8, bias_attr=paddle.nn.initializer.Constant(value=1.0)) + linear2 = paddle.nn.Linear( + 8, 5, bias_attr=paddle.nn.initializer.Constant(value=1.0)) # fix the linear name, simple_lr_setting function will use the name linear1.weight.name = "linear_1.w_0" @@ -331,33 +331,103 @@ def test_adamw_op_dygraph(self): linear2.weight.name = "linear_2.w_0" linear2.bias.name = "linear_2.b_0" + fc1_w = np.array(linear1.weight) + fc1_w_mon1 = np.zeros_like(fc1_w) + fc1_w_mon2 = np.zeros_like(fc1_w) + fc1_b = np.array(linear1.bias) + fc1_b_mon1 = np.zeros_like(fc1_b) + fc1_b_mon2 = np.zeros_like(fc1_b) + + fc2_w = np.array(linear2.weight) + fc2_w_mon1 = np.zeros_like(fc2_w) + fc2_w_mon2 = np.zeros_like(fc2_w) + fc2_b = np.array(linear2.bias) + fc2_b_mon1 = np.zeros_like(fc2_b) + fc2_b_mon2 = np.zeros_like(fc2_b) + simple_lr_fun = partial(simple_lr_setting, decay_rate=0.8, n_layers=2) + learning_rate = 0.001 + weight_decay = 0.01 + beta1 = 0.9 + beta2 = 0.999 - adam = paddle.optimizer.AdamW( - learning_rate=0.01, + opt = paddle.optimizer.AdamW( + learning_rate=learning_rate, parameters=[{ 'params': linear1.parameters() }, { 'params': linear2.parameters(), }], apply_decay_param_fun=lambda name: True, - weight_decay=0.01, + weight_decay=weight_decay, lr_ratio=simple_lr_fun) - loss_ref = np.array( - [-1.7267396, -2.81524, -3.9250019, -5.05954, -6.2272625]) + def get_numpy_output(param, grad, moment1, moment2, lr_ratio, t): + np_inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1**t]).astype("float32"), + 'Beta2Pow': np.array([beta2**t]).astype("float32") + } + + np_attrs = { + 'epsilon': 1e-8, + 'beta1': beta1, + 'beta2': beta2, + "lr_ratio": lr_ratio, + "coeff": weight_decay, + "with_decay": True + } + param_out, moment1_out, moment2_out = adamw_step(np_inputs, + np_attrs) + return param_out, moment1_out, moment2_out + for i in range(5): + a = paddle.to_tensor( + np.random.uniform(-1, 1, (2, 13)).astype("float32")) a1 = linear1(a) out = linear2(a1) out = paddle.mean(out) out.backward() - adam.step() - adam.clear_gradients() - np.testing.assert_allclose(out[0].numpy(), loss_ref[i], rtol=1e-6) + + fc1_w, fc1_w_mon1, fc1_w_mon2 = get_numpy_output( + fc1_w, + np.array(linear1.weight.grad), fc1_w_mon1, fc1_w_mon2, + simple_lr_fun(linear1.weight), i + 1) + fc1_b, fc1_b_mon1, fc1_b_mon2 = get_numpy_output( + fc1_b, + np.array(linear1.bias.grad), fc1_b_mon1, fc1_b_mon2, + 
simple_lr_fun(linear1.bias), i + 1) + fc2_w, fc2_w_mon1, fc2_w_mon2 = get_numpy_output( + fc2_w, + np.array(linear2.weight.grad), fc2_w_mon1, fc2_w_mon2, + simple_lr_fun(linear2.weight), i + 1) + fc2_b, fc2_b_mon1, fc2_b_mon2 = get_numpy_output( + fc2_b, + np.array(linear2.bias.grad), fc2_b_mon1, fc2_b_mon2, + simple_lr_fun(linear2.bias), i + 1) + + opt.step() + opt.clear_gradients() + + np.testing.assert_allclose(linear1.weight.numpy(), fc1_w, rtol=1e-6) + np.testing.assert_allclose(linear1.bias.numpy(), fc1_b, rtol=1e-6) + np.testing.assert_allclose(linear2.weight.numpy(), fc2_w, rtol=1e-6) + np.testing.assert_allclose(linear2.bias.numpy(), fc2_b, rtol=1e-6) def test_adamw_op(self): paddle.enable_static() place = fluid.CUDAPlace(0) + + learning_rate = 0.0001 + beta1 = 0.85 + beta2 = 0.95 + weight_decay = 0.01 + epsilon = 1e-8 + train_prog = fluid.Program() startup = fluid.Program() with fluid.program_guard(train_prog, startup): @@ -365,42 +435,121 @@ def test_adamw_op(self): x = fluid.data(name='x', shape=[None, 10], dtype='float32') y = fluid.data(name='y', shape=[None, 1], dtype='float32') - fc1 = fluid.layers.fc(input=x, size=32, act=None) - prediction = fluid.layers.fc(input=fc1, size=1, act=None) - cost = fluid.layers.square_error_cost(input=prediction, label=y) + weight_attr1 = paddle.framework.ParamAttr(name="linear_0.w_0") + bias_attr1 = paddle.framework.ParamAttr( + name="linear_0.b_0", + initializer=paddle.nn.initializer.Constant(value=1.0)) + weight_attr2 = paddle.framework.ParamAttr(name="linear_1.w_0") + bias_attr2 = paddle.framework.ParamAttr( + name="linear_1.b_0", + initializer=paddle.nn.initializer.Constant(value=1.0)) + linear1 = paddle.nn.Linear( + 10, 32, weight_attr=weight_attr1, bias_attr=bias_attr1) + linear2 = paddle.nn.Linear( + 32, 1, weight_attr=weight_attr2, bias_attr=bias_attr2) + + out = linear1(x) + out = linear2(out) + + fc1_w_mon1 = np.zeros((linear1.weight.shape)).astype("float32") + fc1_w_mon2 = np.zeros((linear1.weight.shape)).astype("float32") + fc1_b_mon1 = np.zeros((linear1.bias.shape)).astype("float32") + fc1_b_mon2 = np.zeros((linear1.bias.shape)).astype("float32") + fc2_w_mon1 = np.zeros((linear2.weight.shape)).astype("float32") + fc2_w_mon2 = np.zeros((linear2.weight.shape)).astype("float32") + fc2_b_mon1 = np.zeros((linear2.bias.shape)).astype("float32") + fc2_b_mon2 = np.zeros((linear2.bias.shape)).astype("float32") + + cost = fluid.layers.square_error_cost(input=out, label=y) avg_cost = fluid.layers.mean(cost) simple_lr_fun = partial( simple_lr_setting, decay_rate=0.8, n_layers=2) - beta1 = fluid.layers.create_global_var( - shape=[1], value=0.85, dtype='float32', persistable=True) - beta2 = fluid.layers.create_global_var( - shape=[1], value=0.95, dtype='float32', persistable=True) - betas = [beta1, beta2] opt = paddle.optimizer.AdamW( - learning_rate=1e-5, + learning_rate=learning_rate, beta1=beta1, beta2=beta2, - weight_decay=0.01, - epsilon=1e-8, + weight_decay=weight_decay, + epsilon=epsilon, lr_ratio=simple_lr_fun) opt.minimize(avg_cost) + def get_numpy_output(param, grad, moment1, moment2, lr_ratio, t): + np_inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1**t]).astype("float32"), + 'Beta2Pow': np.array([beta2**t]).astype("float32") + } + + np_attrs = { + 'epsilon': epsilon, + 'beta1': beta1, + 'beta2': beta2, + "lr_ratio": lr_ratio, + "coeff": weight_decay, + "with_decay": True + } + param_out, moment1_out, 
moment2_out = adamw_step(np_inputs, + np_attrs) + return param_out, moment1_out, moment2_out + + fetch_list1 = [ + "linear_0.w_0", "linear_0.b_0", "linear_1.w_0", "linear_1.b_0" + ] + fetch_list2 = [ + "linear_0.w_0", "linear_0.w_0@GRAD", "linear_0.b_0", + "linear_0.b_0@GRAD", "linear_1.w_0", "linear_1.w_0@GRAD", + "linear_1.b_0", "linear_1.b_0@GRAD" + ] + exe = fluid.Executor(place) exe.run(startup) + test_prog = train_prog.clone(for_test=True) - loss_ref = np.array( - [0.33895183, 0.3159437, 0.19472016, 0.17764759, 0.1520702]) for i in range(5): inputs = np.random.random(size=[8, 10]).astype('float32') outputs = np.random.random(size=[8, 1]).astype('float32') - rets = exe.run(train_prog, - feed={"x": inputs, - "y": outputs}, - fetch_list=[avg_cost]) - assert rets[0] is not None - np.testing.assert_allclose(rets[0], loss_ref[i], rtol=1e-6) + + param = exe.run(test_prog, + feed={"x": inputs, + "y": outputs}, + fetch_list=fetch_list1) + params_and_gras = exe.run(train_prog, + feed={"x": inputs, + "y": outputs}, + fetch_list=fetch_list2) + + fc1_w = param[0] + fc1_w_grad = params_and_gras[1] + fc1_b = param[1] + fc1_b_grad = params_and_gras[3] + fc2_w = param[2] + fc2_w_grad = params_and_gras[5] + fc2_b = param[3] + fc2_b_grad = params_and_gras[7] + + fc1_w, fc1_w_mon1, fc1_w_mon2 = get_numpy_output( + fc1_w, fc1_w_grad, fc1_w_mon1, fc1_w_mon2, + simple_lr_fun(linear1.weight), i + 1) + fc1_b, fc1_b_mon1, fc1_b_mon2 = get_numpy_output( + fc1_b, fc1_b_grad, fc1_b_mon1, fc1_b_mon2, + simple_lr_fun(linear1.bias), i + 1) + fc2_w, fc2_w_mon1, fc2_w_mon2 = get_numpy_output( + fc2_w, fc2_w_grad, fc2_w_mon1, fc2_w_mon2, + simple_lr_fun(linear2.weight), i + 1) + fc2_b, fc2_b_mon1, fc2_b_mon2 = get_numpy_output( + fc2_b, fc2_b_grad, fc2_b_mon1, fc2_b_mon2, + simple_lr_fun(linear2.bias), i + 1) + + np.testing.assert_allclose(params_and_gras[0], fc1_w, rtol=1e-6) + np.testing.assert_allclose(params_and_gras[2], fc1_b, rtol=1e-6) + np.testing.assert_allclose(params_and_gras[4], fc2_w, rtol=1e-6) + np.testing.assert_allclose(params_and_gras[6], fc2_b, rtol=1e-6) paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py index a33874a330a21..9888d2c68f195 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py @@ -332,7 +332,6 @@ def test_mlp_pp_diff_process_mesh(self): resharder = Resharder(dist_main_prog, dist_startup_prog, rank_id, dist_context, dist_params_grads) resharder.reshard() - print_program_with_dist_attr(dist_main_prog, dist_context) # check send and recv result self.assertTrue(check_send_recv_result(dist_main_prog, rank_id)) diff --git a/python/paddle/fluid/tests/unittests/test_corr.py b/python/paddle/fluid/tests/unittests/test_corr.py new file mode 100644 index 0000000000000..99fd21c047b07 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_corr.py @@ -0,0 +1,122 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +import unittest +import numpy as np +import six +import paddle +import warnings + + +def numpy_corr(np_arr, rowvar=True, dtype='float64'): + return np.corrcoef(np_arr, rowvar=rowvar, dtype=dtype) + + +class Corr_Test(unittest.TestCase): + def setUp(self): + self.shape = [4, 5] + + def test_tensor_corr_default(self): + typelist = ['float64', 'float32'] + places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for idx, p in enumerate(places): + if idx == 0: + paddle.set_device('cpu') + else: + paddle.set_device('gpu') + + for dtype in typelist: + np_arr = np.random.rand(*self.shape).astype(dtype) + tensor = paddle.to_tensor(np_arr, place=p) + corr = paddle.linalg.corrcoef(tensor) + np_corr = numpy_corr(np_arr, rowvar=True, dtype=dtype) + if dtype == 'float32': + self.assertTrue( + np.allclose( + np_corr, corr.numpy(), atol=1.e-5)) + else: + self.assertTrue(np.allclose(np_corr, corr.numpy())) + + def test_tensor_corr_rowvar(self): + typelist = ['float64', 'float32'] + places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + + for idx, p in enumerate(places): + if idx == 0: + paddle.set_device('cpu') + else: + paddle.set_device('gpu') + + for dtype in typelist: + np_arr = np.random.rand(*self.shape).astype(dtype) + tensor = paddle.to_tensor(np_arr, place=p) + corr = paddle.linalg.corrcoef(tensor, rowvar=False) + np_corr = numpy_corr(np_arr, rowvar=False, dtype=dtype) + if dtype == 'float32': + self.assertTrue( + np.allclose( + np_corr, corr.numpy(), atol=1.e-5)) + else: + self.assertTrue(np.allclose(np_corr, corr.numpy())) + + +# Input(x) only support N-D (1<=N<=2) tensor +class Corr_Test2(Corr_Test): + def setUp(self): + self.shape = [10] + + +class Corr_Test3(Corr_Test): + def setUp(self): + self.shape = [4, 5] + + +# Input(x) only support N-D (1<=N<=2) tensor +class Corr_Test4(unittest.TestCase): + def setUp(self): + self.shape = [2, 5, 2] + + def test_errors(self): + def test_err(): + np_arr = np.random.rand(*self.shape).astype('float64') + tensor = paddle.to_tensor(np_arr) + covrr = paddle.linalg.corrcoef(tensor) + + self.assertRaises(ValueError, test_err) + + +# test unsupported complex input +class Corr_Comeplex_Test(unittest.TestCase): + def setUp(self): + self.dtype = 'complex128' + + def test_errors(self): + paddle.enable_static() + x1 = fluid.data(name=self.dtype, shape=[2], dtype=self.dtype) + self.assertRaises(TypeError, paddle.linalg.corrcoef, x=x1) + paddle.disable_static() + + +class Corr_Test5(Corr_Comeplex_Test): + def setUp(self): + self.dtype = 'complex64' + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py index e664face0483a..0a51045dee5e1 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_group_sharded_api.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import paddle.fluid as fluid @@ -24,9 +25,10 @@ class TestDygraphGroupSharded(TestMultipleGpus): # check group sharded logic as well as the accuracy with single mode def test_dygraph_group_sharded(self): - self.run_mnist_2gpu('dygraph_group_sharded_api.py') + self.run_mnist_2gpu('dygraph_group_sharded_api.py', 
eager_mode=False) self.run_mnist_2gpu('dygraph_group_sharded_api_eager.py') if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py index deb180a2fe179..50e1985138610 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_optimizer_stage2.py @@ -24,7 +24,8 @@ class TestDygraphShardingOptimizerStage2(TestMultipleGpus): # check sharding logic as well as the accuracy with single mode def test_dygraph_sharding_optimizer_stage2(self): - self.run_mnist_2gpu('dygraph_sharding_optimizer_stage2.py') + self.run_mnist_2gpu( + 'dygraph_sharding_optimizer_stage2.py', eager_mode=False) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py index b7a5f9c9701c1..866577ea7aa8c 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import paddle.fluid as fluid @@ -25,12 +26,14 @@ class TestDygraphShardingStage2(TestMultipleGpus): # check sharding logic as well as the accuracy with single mode def test_dygraph_sharding_stage2(self): self.run_mnist_2gpu('dygraph_group_sharded_stage2.py') - self.run_mnist_2gpu('dygraph_sharding_stage2.py') + self.run_mnist_2gpu('dygraph_sharding_stage2.py', eager_mode=False) def test_dygraph_sharding_stage2_offload(self): self.run_mnist_2gpu('dygraph_group_sharded_stage2_offload.py') - self.run_mnist_2gpu('dygraph_sharding_stage2_offload.py') + self.run_mnist_2gpu( + 'dygraph_sharding_stage2_offload.py', eager_mode=False) if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py index f69b52cae528a..c1f5e06f42b53 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import paddle.fluid as fluid @@ -25,12 +26,14 @@ class TestDygraphShardingStage3(TestMultipleGpus): # check sharding logic as well as the accuracy with single mode def test_dygraph_sharding_stage3(self): self.run_mnist_2gpu('dygraph_group_sharded_stage3.py') - self.run_mnist_2gpu('dygraph_sharding_stage3.py') + self.run_mnist_2gpu('dygraph_sharding_stage3.py', eager_mode=False) def test_dygraph_sharding_stage3_offload(self): self.run_mnist_2gpu('dygraph_group_sharded_stage3_offload.py') - self.run_mnist_2gpu('dygraph_sharding_stage3_offload.py') + self.run_mnist_2gpu( + 'dygraph_sharding_stage3_offload.py', eager_mode=False) if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_einsum_v2.py b/python/paddle/fluid/tests/unittests/test_einsum_v2.py index 63acaf6396913..c58d46edde753 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum_v2.py +++ b/python/paddle/fluid/tests/unittests/test_einsum_v2.py @@ -464,5 +464,19 @@ def test_static_graph(self): 
self.check_output_equal(a, e) +class TestStaticGraphShape(unittest.TestCase): + def setUp(self): + paddle.enable_static() + + def tearDown(self): + paddle.disable_static() + + def test_shape(self): + A = paddle.static.data(name='x', shape=[-1]) + B = paddle.static.data(name='y', shape=[384]) + C = paddle.einsum('i,d->id', A, B) + self.assertEqual(C.shape, (-1, 384)) + + if __name__ == "__main__": - u + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_heaviside_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_heaviside_op.py new file mode 100644 index 0000000000000..8a8e74e28ec72 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_elementwise_heaviside_op.py @@ -0,0 +1,169 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest +import paddle + + +class TestElementwiseOp(OpTest): + def setUp(self): + self.op_type = "elementwise_heaviside" + x = np.random.random((13, 17)).astype("float64") + y = np.random.random((13, 17)).astype("float64") + self.inputs = {'X': x, 'Y': y} + self.outputs = {'Out': np.heaviside(self.inputs['X'], self.inputs['Y'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad(['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad(['X'], 'Out', no_grad_set=set('Y')) + + +class TestHeavisideBroadcast(unittest.TestCase): + def setUp(self): + self.input_1 = np.random.rand(2, 100, 13, 17).astype("float32") + self.input_2 = np.random.rand(100, 13, 17).astype("float32") + self.input_3 = np.random.rand(100, 13, 1).astype("float32") + self.input_4 = np.random.rand(13, 17).astype("float32") + self.input_5 = np.random.rand(1).astype("float32") + + self.np_expected1 = np.heaviside(self.input_1, self.input_2) + self.np_expected2 = np.heaviside(self.input_2, self.input_3) + self.np_expected3 = np.heaviside(self.input_2, self.input_4) + self.np_expected4 = np.heaviside(self.input_4, self.input_5) + + def test_broadcast(self): + paddle.disable_static() + self.tensor_1 = paddle.to_tensor(self.input_1) + self.tensor_2 = paddle.to_tensor(self.input_2) + self.tensor_3 = paddle.to_tensor(self.input_3) + self.tensor_4 = paddle.to_tensor(self.input_4) + self.tensor_5 = paddle.to_tensor(self.input_5) + + res = paddle.heaviside(self.tensor_1, self.tensor_2) + res = res.numpy() + self.assertTrue(np.allclose(res, self.np_expected1)) + + res = paddle.heaviside(self.tensor_2, self.tensor_3) + res = res.numpy() + self.assertTrue(np.allclose(res, self.np_expected2)) + + res = paddle.heaviside(self.tensor_2, self.tensor_4) + res = res.numpy() + self.assertTrue(np.allclose(res, self.np_expected3)) + + res = paddle.heaviside(self.tensor_4, self.tensor_5) + res = res.numpy() + self.assertTrue(np.allclose(res, self.np_expected4)) + + +class 
TestHeavisideAPI_float64(unittest.TestCase): + def setUp(self): + self.x_np = np.random.random((13, 17)).astype("float64") + self.y_np = np.random.random((13, 17)).astype("float64") + self.out_np = np.heaviside(self.x_np, self.y_np) + self.dtype = "float64" + + def test_static(self): + for use_cuda in ([False, True] + if paddle.device.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + + paddle.enable_static() + prog = paddle.static.Program() + with paddle.static.program_guard(prog): + x = paddle.static.data( + name=f"x_{self.dtype}", shape=[13, 17], dtype=self.dtype) + y = paddle.static.data( + name=f"y_{self.dtype}", shape=[13, 17], dtype=self.dtype) + out = paddle.heaviside(x, y) + + exe = paddle.static.Executor(place=place) + res = exe.run(prog, + feed={ + f"x_{self.dtype}": self.x_np, + f"y_{self.dtype}": self.y_np + }, + fetch_list=out, + use_prune=True) + + self.assertTrue(np.allclose(res, self.out_np)) + + def test_dygraph(self): + for use_cuda in ([False, True] + if paddle.device.is_compiled_with_cuda() else [False]): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + paddle.disable_static(place=place) + result = paddle.heaviside( + paddle.to_tensor(self.x_np), paddle.to_tensor(self.y_np)) + + self.assertTrue(np.allclose(result.numpy(), self.out_np)) + + +class TestHeavisideAPI_float32(TestHeavisideAPI_float64): + def setUp(self): + self.x_np = np.random.random((13, 17)).astype("float32") + self.y_np = np.random.random((13, 17)).astype("float32") + self.out_np = np.heaviside(self.x_np, self.y_np) + self.dtype = "float32" + + +class TestHeavisideAPI_int64(TestHeavisideAPI_float64): + def setUp(self): + self.x_np = np.random.random((13, 17)).astype("int64") + self.y_np = np.random.random((13, 17)).astype("int64") + self.out_np = np.heaviside(self.x_np, self.y_np) + self.dtype = "int64" + + +class TestHeavisideAPI_int32(TestHeavisideAPI_float64): + def setUp(self): + self.x_np = np.random.random((13, 17)).astype("int32") + self.y_np = np.random.random((13, 17)).astype("int32") + self.out_np = np.heaviside(self.x_np, self.y_np) + self.dtype = "int32" + + +class TestHeavisideError(unittest.TestCase): + def test_input(self): + paddle.disable_static() + + def test_input_x(): + paddle.heaviside(1, paddle.randn([100])) + + self.assertRaises(ValueError, test_input_x) + + def test_input_y(): + paddle.heaviside(paddle.randn([100]), 1) + + self.assertRaises(ValueError, test_input_y) + + def test_input_xy(): + paddle.heaviside( + paddle.randn([100], 'float32'), paddle.randn([100], 'float64')) + + self.assertRaises(ValueError, test_input_xy) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py index 55f87540c1b8a..d89465c5aecab 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py @@ -407,6 +407,10 @@ def test_grad(self): class TestAvgPool2DDoubleGradCheckCase2(unittest.TestCase): + def pool2d_wrapper(self, x): + return paddle.nn.functional.avg_pool2d( + x[0], kernel_size=2, data_format="NHWC") + @prog_scope() def func(self, place): input_NHWC = fluid.layers.data( @@ -416,13 +420,16 @@ def func(self, place): dtype="float32") input_NHWC.persistable = True - y = layers.pool2d( - input_NHWC, pool_size=2, pool_type="avg", data_format="NHWC") + y = paddle.nn.functional.avg_pool2d( + input_NHWC, kernel_size=2, data_format="NHWC") x_arr = 
np.random.uniform(-1, 1, [2, 5, 5, 3]).astype(np.float32) gradient_checker.double_grad_check( [input_NHWC], y, x_init=x_arr, place=place, eps=0.05) + gradient_checker.double_grad_check_for_dygraph( + self.pool2d_wrapper, [input_NHWC], y, x_init=x_arr, place=place) + def test_grad(self): places = [fluid.CPUPlace()] if core.is_compiled_with_cuda(): @@ -432,6 +439,10 @@ def test_grad(self): class TestAvgPool2DDoubleGradCheckCase3(unittest.TestCase): + def pool2d_wrapper(self, x): + return paddle.nn.functional.avg_pool2d( + x[0], kernel_size=2, padding=[1, 1]) + @prog_scope() def func(self, place): input_NCHW = fluid.layers.data( @@ -441,12 +452,14 @@ def func(self, place): dtype="float32") input_NCHW.persistable = True - y = layers.pool2d( - input_NCHW, pool_size=2, pool_type="avg", pool_padding=[1, 1]) + y = paddle.nn.functional.avg_pool2d( + input_NCHW, kernel_size=2, padding=[1, 1]) x_arr = np.random.uniform(-1, 1, [2, 3, 5, 5]).astype(np.float32) gradient_checker.double_grad_check( [input_NCHW], y, x_init=x_arr, place=place, eps=0.05) + gradient_checker.double_grad_check_for_dygraph( + self.pool2d_wrapper, [input_NCHW], y, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] @@ -457,6 +470,9 @@ def test_grad(self): class TestAvgPool2DDoubleGradCheckCase4(unittest.TestCase): + def pool2d_wrapper(self, x): + return paddle.nn.functional.avg_pool2d(x[0], kernel_size=[4, 4]) + @prog_scope() def func(self, place): input_NCHW = fluid.layers.data( @@ -467,10 +483,13 @@ def func(self, place): input_NCHW.persistable = True y = layers.pool2d(input_NCHW, pool_size=[4, 4], pool_type="avg") + y = paddle.nn.functional.avg_pool2d(input_NCHW, kernel_size=[4, 4]) x_arr = np.random.uniform(-1, 1, [2, 3, 5, 5]).astype(np.float32) gradient_checker.double_grad_check( [input_NCHW], y, x_init=x_arr, place=place, eps=0.05) + gradient_checker.double_grad_check_for_dygraph( + self.pool2d_wrapper, [input_NCHW], y, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py index b7e8e06029d93..e12d1826f286c 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sharding_parallel.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import paddle.fluid as fluid @@ -24,8 +25,12 @@ class TestHybridParallel(TestMultipleGpus): # check sharding logic as well as the accuracy with single mode def test_hybrid_parallel_sharding_logic(self): - self.run_mnist_2gpu('hybrid_parallel_sharding_model.py') + # self.run_mnist_2gpu( + # 'hybrid_parallel_sharding_model.py') + self.run_mnist_2gpu( + 'hybrid_parallel_sharding_model.py', eager_mode=False) if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pipeline.py b/python/paddle/fluid/tests/unittests/test_pipeline.py index 8f46119d551c6..04772a2da2871 100644 --- a/python/paddle/fluid/tests/unittests/test_pipeline.py +++ b/python/paddle/fluid/tests/unittests/test_pipeline.py @@ -63,7 +63,7 @@ def test_dist_train_one_device(self): "pipeline_mnist_one_device.py", check_error_log=True, log_name=flag_name, - need_envs=self.need_envs()) + need_envs={"PADDLE_MANUAL_PIPELINE_STAGE": "0"}) if __name__ == '__main__': diff --git 
a/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py index d5f4cef5b8759..fb1cd35c45380 100644 --- a/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py @@ -37,6 +37,7 @@ 'dot', 'elementwise_add', 'elementwise_div', + 'elementwise_heaviside', 'elementwise_max', 'elementwise_min', 'elementwise_mul', diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mod_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mod_op_xpu.py new file mode 100644 index 0000000000000..9ef2c093604b0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mod_op_xpu.py @@ -0,0 +1,133 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +sys.path.append("..") +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard + +import paddle +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +class XPUTestElementwiseModOp(XPUOpTestWrapper): + def __init__(self) -> None: + self.op_name = 'elementwise_mod' + self.use_dynamic_create_class = False + + class ElementwiseModOp(XPUOpTest): + def init_kernel_type(self): + self.use_mkldnn = False + + def init_input_output(self): + self.x = np.random.uniform(0, 10000, [10, 10]).astype(self.dtype) + self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype) + self.out = np.mod(self.x, self.y) + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.outputs = {'Out': self.out} + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + + def init_dtype(self): + pass + + def init_axis(self): + pass + + def setUp(self): + self.op_type = 'elementwise_mod' + self.use_xpu = True + self.dtype = self.in_type + self.axis = -1 + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + class TestElementwiseModOp_broadcast_1(ElementwiseModOp): + def init_input_output(self): + self.inputs = { + 'X': np.random.rand(2, 100, 3).astype(self.dtype), + 'Y': np.random.rand(2, 100, 3).astype(self.dtype) + } + + self.attrs = {'axis': 1} + self.outputs = {'Out': self.inputs['X'] % self.inputs['Y']} + + class TestElementwiseModOp_broadcast_2(ElementwiseModOp): + def init_input_output(self): + self.inputs = { + 'X': np.random.rand(22, 128, 3).astype(self.dtype), + 'Y': np.random.rand(22, 128, 3).astype(self.dtype) + } + + self.attrs = {'axis': 1} + self.outputs = {'Out': self.inputs['X'] % self.inputs['Y']} + + class 
TestRemainderOp(unittest.TestCase): + def test_dygraph(self): + with fluid.dygraph.guard(): + np_x = np.random.rand(22, 128, 3).astype('int64') + np_y = np.random.rand(22, 128, 3).astype('int64') + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = paddle.remainder(x, y) + np_z = z.numpy() + z_expected = np.mod(np_x, np_y) + self.assertEqual((np_z == z_expected).all(), True) + + np_x = np.array([-3.3, 11.5, -2, 3.5]) + np_y = np.array([-1.2, 2., 3.3, -2.3]) + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = x % y + z_expected = np.array([-0.9, 1.5, 1.3, -1.1]) + self.assertEqual(np.allclose(z_expected, z.numpy()), True) + + np_x = np.random.rand(22, 128, 3).astype('int32') + np_y = np.random.rand(22, 128, 3).astype('int32') + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = paddle.remainder(x, y) + np_z = z.numpy() + z_expected = np.mod(np_x, np_y) + self.assertEqual((np_z == z_expected).all(), True) + + np_x = np.array([-3, 11, -2, 3]) + np_y = np.array([-1, 2, 3, -2]) + x = paddle.to_tensor(np_x, dtype="float16") + y = paddle.to_tensor(np_y, dtype="float16") + z = x % y + z_expected = np.array([0, 1, 1, -1]) + self.assertEqual(np.allclose(z_expected, z.numpy()), True) + + +support_types = get_xpu_op_support_types('elementwise_mod') +for stype in support_types: + create_test_class(globals(), XPUTestElementwiseModOp, stype) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/variable_index.py b/python/paddle/fluid/variable_index.py index 83a569aacc911..7d1bbf8162c2e 100644 --- a/python/paddle/fluid/variable_index.py +++ b/python/paddle/fluid/variable_index.py @@ -306,8 +306,9 @@ def idx_empty(var): return paddle.empty(var_shape, dtype=var.dtype) from .layers.control_flow import cond - return cond(item.any(), lambda: idx_not_empty(var, item), - lambda: idx_empty(var)) + return cond( + paddle.logical_not(item.any()), lambda: idx_empty(var), + lambda: idx_not_empty(var, item)) def _getitem_impl_(var, item): diff --git a/python/paddle/incubate/optimizer/functional/bfgs.py b/python/paddle/incubate/optimizer/functional/bfgs.py index 23fd8dc0825f0..2065b3c1c94c0 100644 --- a/python/paddle/incubate/optimizer/functional/bfgs.py +++ b/python/paddle/incubate/optimizer/functional/bfgs.py @@ -49,16 +49,16 @@ def minimize_bfgs(objective_func, Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp140: Algorithm 6.1 (BFGS Method). Args: - objective_func: the objective function to minimize. ``objective_func`` accepts a multivariate input and returns a scalar. - initial_position (Tensor): the starting point of the iterates. + objective_func: the objective function to minimize. ``objective_func`` accepts a 1D Tensor and returns a scalar. + initial_position (Tensor): the starting point of the iterates, has the same shape with the input of ``objective_func`` . max_iters (int, optional): the maximum number of minimization iterations. Default value: 50. tolerance_grad (float, optional): terminates if the gradient norm is smaller than this. Currently gradient norm uses inf norm. Default value: 1e-7. tolerance_change (float, optional): terminates if the change of function value/position/parameter between two iterations is smaller than this value. Default value: 1e-9. - initial_inverse_hessian_estimate (Tensor, optional): the initial inverse hessian approximation at initial_position. It must be symmetric and positive definite. Default value: None. 
+ initial_inverse_hessian_estimate (Tensor, optional): the initial inverse hessian approximation at initial_position. It must be symmetric and positive definite. If not given, will use an identity matrix of order N, which is the size of ``initial_position`` . Default value: None. line_search_fn (str, optional): indicate which line search method to use, only support 'strong wolfe' right now. May support 'Hager Zhang' in the future. Default value: 'strong wolfe'. max_line_search_iters (int, optional): the maximum number of line search iterations. Default value: 50. initial_step_length (float, optional): step length used in first iteration of line search. different initial_step_length may cause different optimal result. For methods like Newton and quasi-Newton the initial trial step length should always be 1.0. Default value: 1.0. - dtype ('float32' | 'float64', optional): data type used in the algorithm. Default value: 'float32'. + dtype ('float32' | 'float64', optional): data type used in the algorithm; the data type of the input parameters must be consistent with this dtype. Default value: 'float32'. name (str, optional): Name for the operation. For more information, please refer to :ref:`api_guide_Name`. Default value: None. Returns: diff --git a/python/paddle/incubate/optimizer/functional/lbfgs.py b/python/paddle/incubate/optimizer/functional/lbfgs.py index f283381597733..e15ad56dc2d11 100644 --- a/python/paddle/incubate/optimizer/functional/lbfgs.py +++ b/python/paddle/incubate/optimizer/functional/lbfgs.py @@ -50,17 +50,17 @@ def minimize_lbfgs(objective_func, Jorge Nocedal, Stephen J. Wright, Numerical Optimization, Second Edition, 2006. pp179: Algorithm 7.5 (L-BFGS). Args: - objective_func: the objective function to minimize. ``objective_func`` accepts a multivariate input and returns a scalar. - initial_position (Tensor): the starting point of the iterates. + objective_func: the objective function to minimize. ``objective_func`` accepts a 1D Tensor and returns a scalar. + initial_position (Tensor): the starting point of the iterates, which has the same shape as the input of ``objective_func`` . history_size (Scalar): the number of stored vector pairs {si,yi}. Default value: 100. max_iters (int, optional): the maximum number of minimization iterations. Default value: 50. tolerance_grad (float, optional): terminates if the gradient norm is smaller than this. Currently gradient norm uses inf norm. Default value: 1e-7. tolerance_change (float, optional): terminates if the change of function value/position/parameter between two iterations is smaller than this value. Default value: 1e-9. - initial_inverse_hessian_estimate (Tensor, optional): the initial inverse hessian approximation at initial_position. It must be symmetric and positive definite. Default value: None. + initial_inverse_hessian_estimate (Tensor, optional): the initial inverse hessian approximation at initial_position. It must be symmetric and positive definite. If not given, will use an identity matrix of order N, which is the size of ``initial_position`` . Default value: None. line_search_fn (str, optional): indicate which line search method to use, only support 'strong wolfe' right now. May support 'Hager Zhang' in the future. Default value: 'strong wolfe'. max_line_search_iters (int, optional): the maximum number of line search iterations. Default value: 50. initial_step_length (float, optional): step length used in first iteration of line search. different initial_step_length may cause different optimal result. 
For methods like Newton and quasi-Newton the initial trial step length should always be 1.0. Default value: 1.0. - dtype ('float32' | 'float64', optional): data type used in the algorithm. Default value: 'float32'. + dtype ('float32' | 'float64', optional): data type used in the algorithm; the data type of the input parameters must be consistent with this dtype. Default value: 'float32'. name (str, optional): Name for the operation. For more information, please refer to :ref:`api_guide_Name`. Default value: None. Returns: diff --git a/python/paddle/linalg.py b/python/paddle/linalg.py index d6b8d6363690a..834b631e5c519 100644 --- a/python/paddle/linalg.py +++ b/python/paddle/linalg.py @@ -16,6 +16,7 @@ from .tensor.linalg import norm # noqa: F401 from .tensor.linalg import eig # noqa: F401 from .tensor.linalg import cov # noqa: F401 +from .tensor.linalg import corrcoef # noqa: F401 from .tensor.linalg import cond # noqa: F401 from .tensor.linalg import matrix_power # noqa: F401 from .tensor.linalg import solve # noqa: F401 @@ -41,6 +42,7 @@ 'norm', 'cond', 'cov', + 'corrcoef', 'inv', 'eig', 'eigvals', diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index fba1aeabf28bd..fd75ab9550d52 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -25,9 +25,9 @@ _AllowedEventTypeList = [ TracerEventType.Dataloader, TracerEventType.ProfileStep, - TracerEventType.UserDefined, TracerEventType.Forward, - TracerEventType.Backward, TracerEventType.Optimization, - TracerEventType.PythonOp, TracerEventType.PythonUserDefined + TracerEventType.Forward, TracerEventType.Backward, + TracerEventType.Optimization, TracerEventType.PythonOp, + TracerEventType.PythonUserDefined ] @@ -37,7 +37,7 @@ class RecordEvent(ContextDecorator): Args: name(str): Name of the record event - event_type(TracerEventType, optional): Optional, default value is TracerEventType.UserDefined. It is reserved for internal purpose, and it is better not to specify this parameter. + event_type(TracerEventType, optional): Optional, default value is TracerEventType.PythonUserDefined. It is reserved for internal purposes, and it is better not to specify this parameter. Examples: .. 
code-block:: python @@ -64,7 +64,7 @@ class RecordEvent(ContextDecorator): def __init__(self, name: str, - event_type: TracerEventType=TracerEventType.UserDefined): + event_type: TracerEventType=TracerEventType.PythonUserDefined): self.name = name self.event_type = event_type self.event = None @@ -101,8 +101,6 @@ def begin(self): can be recorded.".format(*_AllowedEventTypeList)) self.event = None else: - if self.event_type == TracerEventType.UserDefined: - self.event_type == TracerEventType.PythonUserDefined self.event = _RecordEvent(self.name, self.event_type) def end(self): diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 5f0fb4336e014..283bce1cc817f 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -40,6 +40,7 @@ from .linalg import matmul # noqa: F401 from .linalg import dot # noqa: F401 from .linalg import cov # noqa: F401 +from .linalg import corrcoef # noqa: F401 from .linalg import norm # noqa: F401 from .linalg import cond # noqa: F401 from .linalg import transpose # noqa: F401 @@ -228,6 +229,7 @@ from .math import fmin # noqa: F401 from .math import inner # noqa: F401 from .math import outer # noqa: F401 +from .math import heaviside # noqa: F401 from .math import frac # noqa: F401 from .random import multinomial # noqa: F401 @@ -278,6 +280,7 @@ 'matmul', 'dot', 'cov', + 'corrcoef', 'norm', 'cond', 'transpose', @@ -493,6 +496,7 @@ 'put_along_axis', 'put_along_axis_', 'exponential_', + 'heaviside', ] #this list used in math_op_patch.py for magic_method bind diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 2c1732ad62848..2a77dbd115733 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -24,6 +24,7 @@ from .creation import full import paddle +import warnings from paddle.common_ops_import import core from paddle.common_ops_import import VarDesc from paddle import _C_ops @@ -3181,3 +3182,72 @@ def lstsq(x, y, rcond=None, driver=None, name=None): singular_values = paddle.static.data(name='singular_values', shape=[0]) return solution, residuals, rank, singular_values + + +def corrcoef(x, rowvar=True, name=None): + """ + + A correlation coefficient matrix indicates the correlation of each pair of variables in the input matrix. + For example, for N-dimensional samples X=[x1,x2,…xN]T, the correlation coefficient matrix + element Rij is the correlation of xi and xj, and the element Rii is the correlation of xi with itself, which is always 1. + + The relationship between the correlation coefficient matrix `R` and the + covariance matrix `C` is + + .. math:: R_{ij} = \\frac{ C_{ij} } { \\sqrt{ C_{ii} * C_{jj} } } + + The values of `R` are between -1 and 1. + + Parameters: + + x(Tensor): An N-D (N<=2) Tensor containing multiple variables and observations. By default, each row of x represents a variable. Also see rowvar below. + rowvar(Bool, optional): If rowvar is True (default), then each row represents a variable, with observations in the columns. Default: True. + name(str, optional): Name of the output. Default is None. It's used to print debug info for developers. Details: :ref:`api_guide_Name`. + + Returns: + + The correlation coefficient matrix of the variables. + + Examples: + .. code-block:: python + :name: code-example1 + + import paddle + + xt = paddle.rand((3,4)) + print(paddle.linalg.corrcoef(xt)) + + # Tensor(shape=[3, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[ 1. , -0.73702252, 0.66228950], + # [-0.73702258, 1. 
, -0.77104872], + # [ 0.66228974, -0.77104825, 1. ]]) + + """ + if len(x.shape) > 2 or len(x.shape) < 1: + raise ValueError( + "Input(x) only supports N-D (1<=N<=2) tensor in corrcoef, but received " + "length of Input(x) is %s." % len(x.shape)) + check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'corrcoef') + + c = cov(x, rowvar) + if (c.ndim == 0): + # scalar covariance + # nan if incorrect value (nan, inf, 0), 1 otherwise + return c / c + + d = paddle.diag(c) + + if paddle.is_complex(d): + d = d.real() + stddev = paddle.sqrt(d) + c /= stddev[:, None] + c /= stddev[None, :] + + # Clip to [-1, 1]. This does not guarantee abs(c[i, j]) <= 1 for complex inputs. + if paddle.is_complex(c): + return paddle.complex( + paddle.clip(c.real(), -1, 1), paddle.clip(c.imag(), -1, 1)) + else: + c = paddle.clip(c, -1, 1) + + return c diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 83501b0399492..5ee372f7b956a 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -4381,6 +4381,54 @@ def angle(x, name=None): helper.append_op(type=op_type, inputs=inputs, outputs=outputs) return out +def heaviside(x, y, name=None): + """ + Computes the Heaviside step function determined by the corresponding element in y for each element in x. The equation is + + .. math:: + heaviside(x, y)= + \left\{ + \\begin{array}{lcl} + 0,& &\\text{if} \ x < 0, \\\\ + y,& &\\text{if} \ x = 0, \\\\ + 1,& &\\text{if} \ x > 0. + \end{array} + \\right. + + Notes: + ``paddle.heaviside`` supports broadcasting. If you want to know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. + + Args: + x (Tensor): The input tensor of the Heaviside step function, its data type should be float32, float64, int32 or int64. + y (Tensor): The tensor that determines a Heaviside step function, its data type should be float32, float64, int32 or int64. + name (str, optional): Name for the operation (optional, default is None). Normally there is no need for the user to set this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + N-D Tensor, the output of the Heaviside step function. If x and y have different shapes and are broadcastable, the resulting tensor shape is the shape of x and y after broadcasting. If x and y have the same shape, the output has that same shape. + + Examples: + .. code-block:: python + :name: heaviside-example + + import paddle + x = paddle.to_tensor([-0.5, 0, 0.5]) + y = paddle.to_tensor([0.1]) + paddle.heaviside(x, y) + # [0. , 0.10000000, 1. ] + x = paddle.to_tensor([[-0.5, 0, 0.5], [-0.5, 0.5, 0]]) + y = paddle.to_tensor([0.1, 0.2, 0.3]) + paddle.heaviside(x, y) + # [[0. , 0.20000000, 1. ], + # [0. , 1. , 0.30000001]] + """ + op_type = 'elementwise_heaviside' + axis = -1 + act = None + if _non_static_mode(): + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name=op_type) + return _elementwise_op(LayerHelper(op_type, **locals())) + def frac(x, name=None): """ This API is used to return the fractional portion of each element in input. 
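As a quick usage sketch for the two Python APIs introduced in the hunk above (assuming a Paddle build that already contains the new ``paddle.heaviside`` and ``paddle.linalg.corrcoef`` entry points from this patch; the NumPy cross-checks are illustrative only and not part of the patch):

    import numpy as np
    import paddle

    # heaviside(x, y): 0 where x < 0, y where x == 0, 1 where x > 0; y broadcasts against x.
    x = paddle.to_tensor([-0.5, 0.0, 0.5])
    y = paddle.to_tensor([0.1])
    out = paddle.heaviside(x, y)
    np.testing.assert_allclose(out.numpy(), np.heaviside(x.numpy(), y.numpy()))

    # corrcoef: rows are variables by default (rowvar=True); every entry lies in [-1, 1].
    samples = paddle.rand((3, 4))
    r = paddle.linalg.corrcoef(samples)
    np.testing.assert_allclose(r.numpy(), np.corrcoef(samples.numpy()), rtol=1e-5, atol=1e-6)
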
diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index e044447f87c22..3de9e323c2ed9 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -1,3 +1,15 @@ +- backward_api : abs_double_grad + forward : abs_grad (Tensor x, Tensor grad_out) -> Tensor(grad_x) + args : (Tensor x, Tensor grad_x_grad) + output : Tensor(grad_out_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : abs_double_grad + data_transform: + skip_transform : grad_x_grad + - backward_api : abs_grad forward : abs (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) @@ -9,6 +21,7 @@ func : abs_grad data_transform: skip_transform : out_grad + backward : abs_double_grad - backward_api : acos_grad forward : acos (Tensor x) -> Tensor(out) @@ -567,6 +580,7 @@ param : [x] kernel : func : expand_grad + no_need_buffer : x - backward_api : expm1_grad forward : expm1 (Tensor x) -> Tensor(out) @@ -820,6 +834,7 @@ kernel : func : layer_norm_grad data_type : out_grad + no_need_buffer : bias optional : scale, bias - backward_api : leaky_relu_double_grad @@ -1260,6 +1275,7 @@ param: [x] kernel : func : pad3d_grad + no_need_buffer : x - backward_api : pixel_shuffle_grad forward : pixel_shuffle (Tensor x, int upscale_factor, str data_format) -> Tensor(out) @@ -1280,6 +1296,16 @@ kernel : func : poisson_grad +- backward_api : pool2d_double_grad + forward : pool2d_grad(Tensor x, Tensor out, Tensor grad_out, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) -> Tensor(grad_x) + args : (Tensor grad_x_grad, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) + output : Tensor(grad_out_grad) + infer_meta : + func : PoolInferMeta + kernel : + func : pool2d_double_grad + use_gpudnn : true + - backward_api : pool2d_grad forward : pool2d(Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) @@ -1289,6 +1315,7 @@ kernel : func : pool2d_grad use_gpudnn : true + backward : pool2d_double_grad - backward_api : pool2d_grad_gpudnn_unused forward : pool2d_gpudnn_unused(Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) -> Tensor(out) @@ -1409,6 +1436,7 @@ param : [grad_out] kernel : func : reshape_double_grad + no_need_buffer : grad_out - backward_api : reshape_grad forward : reshape_with_xshape (Tensor x, IntArray shape) -> Tensor(out), Tensor(xshape) @@ -1435,6 +1463,7 @@ kernel : func : roi_align_grad data_type : boxes + no_need_buffer : x optional : boxes_num - backward_api : roi_pool_grad @@ -1749,6 +1778,7 @@ param : [x] kernel : func : sum_grad + no_need_buffer : x backward : sum_double_grad - backward_api : sum_triple_grad diff --git a/python/paddle/vision/transforms/functional.py b/python/paddle/vision/transforms/functional.py index 5a8c2cc09f884..29a857ba570f6 100644 --- 
a/python/paddle/vision/transforms/functional.py +++ b/python/paddle/vision/transforms/functional.py @@ -714,9 +714,33 @@ def erase(img, i, j, h, w, v, inplace=False): import paddle - fake_img = paddle.randn((3, 10, 10)).astype(paddle.float32) + fake_img = paddle.randn((3, 2, 4)).astype(paddle.float32) + print(fake_img) + + #Tensor(shape=[3, 2, 4], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[[ 0.02169025, -0.97859967, -1.39175487, -1.07478464], + # [ 0.20654772, 1.74624777, 0.32268861, -0.13857445]], + # + # [[-0.14993843, 1.10793507, -0.40056887, -1.94395220], + # [ 0.41686651, 0.44551995, -0.09356714, -0.60898107]], + # + # [[-0.24998808, -1.47699273, -0.88838995, 0.42629015], + # [ 0.56948012, -0.96200180, 0.53355658, 3.20450878]]]) + values = paddle.zeros((1,1,1), dtype=paddle.float32) - result = paddle.vision.transforms.erase(fake_img, 4, 4, 3, 3, values) + result = paddle.vision.transforms.erase(fake_img, 0, 1, 1, 2, values) + + print(result) + + #Tensor(shape=[3, 2, 4], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[[ 0.02169025, 0. , 0. , -1.07478464], + # [ 0.20654772, 1.74624777, 0.32268861, -0.13857445]], + # + # [[-0.14993843, 0. , 0. , -1.94395220], + # [ 0.41686651, 0.44551995, -0.09356714, -0.60898107]], + # + # [[-0.24998808, 0. , 0. , 0.42629015], + # [ 0.56948012, -0.96200180, 0.53355658, 3.20450878]]]) """ if _is_tensor_image(img): diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index 828a0d9b0936d..ce356449c594e 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -1377,7 +1377,9 @@ class RandomErasing(BaseTransform): fake_img = paddle.randn((3, 10, 10)).astype(paddle.float32) transform = paddle.vision.transforms.RandomErasing() - result = transform(fake_img) + result = transform(fake_img) + + print(result) """ def __init__(self, diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt index 9165764adcaf4..ffc4fde7c27d1 100644 --- a/python/unittest_py/requirements.txt +++ b/python/unittest_py/requirements.txt @@ -1,5 +1,5 @@ PyGithub -coverage +coverage==5.5 pycrypto ; platform_system != "Windows" mock gym @@ -11,4 +11,4 @@ paddle2onnx>=0.8.2 scipy>=1.6 prettytable distro -numpy>=1.20 +numpy>=1.20,<1.22 diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index 630005bccbaf7..45d4731ba1dba 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -20,7 +20,7 @@ if [ -z ${BRANCH} ]; then fi PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../" && pwd )" -approval_line=`curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` +approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` failed_num=0 echo_list=() diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index b2d2e792c995b..b0800a9cd845e 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -71,7 +71,7 @@ API_FILES=("CMakeLists.txt" "paddle/fluid/eager/backward.h" ) -approval_line=`curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` +approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` git_files=`git diff --numstat upstream/$BRANCH| wc -l` 
git_count=`git diff --numstat upstream/$BRANCH| awk '{sum+=$1}END{print sum}'` failed_num=0 diff --git a/tools/check_ut.py b/tools/check_ut.py index fa50f5cc81f13..f5fe4c687dd78 100644 --- a/tools/check_ut.py +++ b/tools/check_ut.py @@ -24,7 +24,7 @@ class PRChecker(object): """ PR Checker. """ def __init__(self): - self.github = Github(timeout=60) + self.github = Github(os.getenv('GITHUB_API_TOKEN'), timeout=60) self.repo = None def check(self, filename, msg): diff --git a/tools/ci_op_benchmark.sh b/tools/ci_op_benchmark.sh index 878660cefaf21..8e84eccc083f2 100644 --- a/tools/ci_op_benchmark.sh +++ b/tools/ci_op_benchmark.sh @@ -307,7 +307,7 @@ function gpu_op_benchmark { # The PR will pass quickly when get approval from specific person. # Xreki 12538138, luotao1 6836917, ZzSean 32410583 set +x -approval_line=$(curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000) +approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000) if [ -n "${approval_line}" ]; then APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32410583 12538138 6836917) LOG "[INFO] current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" diff --git a/tools/coverage/cuda_clean.py b/tools/coverage/cuda_clean.py index 28142c869d04c..8c03edd078549 100644 --- a/tools/coverage/cuda_clean.py +++ b/tools/coverage/cuda_clean.py @@ -30,7 +30,8 @@ def get_pull(pull_id): Returns: github.PullRequest.PullRequest: The pull request. """ - github = Github(timeout=60) + token = os.getenv('GITHUB_API_TOKEN') + github = Github(token, timeout=60) repo = github.get_repo('PaddlePaddle/Paddle') pull = repo.get_pull(pull_id) diff --git a/tools/coverage/gcda_clean.py b/tools/coverage/gcda_clean.py index 33d9a8f6c78a3..12bd04a6907ea 100644 --- a/tools/coverage/gcda_clean.py +++ b/tools/coverage/gcda_clean.py @@ -32,7 +32,8 @@ def get_pull(pull_id): Returns: github.PullRequest.PullRequest """ - github = Github(timeout=60) + token = os.getenv('GITHUB_API_TOKEN') + github = Github(token, timeout=60) idx = 1 while idx < 4: try: diff --git a/tools/coverage/pull_request.py b/tools/coverage/pull_request.py index 20399f1c2e630..f3e88286ca965 100644 --- a/tools/coverage/pull_request.py +++ b/tools/coverage/pull_request.py @@ -24,6 +24,8 @@ from github import Github +token = os.getenv('GITHUB_API_TOKEN') + def get_pull(pull_id): """ @@ -33,7 +35,7 @@ def get_pull(pull_id): Returns: github.PullRequest.PullRequest """ - github = Github(timeout=60) + github = Github(token, timeout=60) repo = github.get_repo('PaddlePaddle/Paddle') pull = repo.get_pull(pull_id) diff --git a/tools/dockerfile/Dockerfile.centos b/tools/dockerfile/Dockerfile.centos index 900ca9b7a9701..e5b1137090f0e 100644 --- a/tools/dockerfile/Dockerfile.centos +++ b/tools/dockerfile/Dockerfile.centos @@ -65,7 +65,7 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_i LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.9.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.9.0/bin/pip3 install pre-commit 'ipython==5.3.0' -RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \ +RUN wget -O /opt/swig-2.0.12.tar.gz --no-check-certificate 
https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \ cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz # ccache 3.7.9 diff --git a/tools/dockerfile/Dockerfile.release16 b/tools/dockerfile/Dockerfile.release16 index 7effa2e4ed5e8..66974f46d91e4 100644 --- a/tools/dockerfile/Dockerfile.release16 +++ b/tools/dockerfile/Dockerfile.release16 @@ -46,13 +46,14 @@ ENV PATH=/home/cmake-3.16.0-Linux-x86_64/bin:$PATH # Install Python3.7 -RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \ +RUN mkdir -p /root/python_build/ && wget -q https://paddle-ci.gz.bcebos.com/sqlite-autoconf-3250300.tar.gz && \ tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \ ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \ wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \ tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \ CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.7.0 --enable-shared > /dev/null && \ - make -j8 > /dev/null && make altinstall > /dev/null && ldconfig + make -j8 > /dev/null && make altinstall > /dev/null && ldconfig && \ + cd ../ && rm -rf Python-3.7.0 Python-3.7.0.tgz ENV PATH=/usr/local/python3.7.0/include:${PATH} ENV PATH=/usr/local/python3.7.0/bin:${PATH} @@ -79,7 +80,7 @@ RUN rm setuptools-50.3.2.zip pip-20.0.1.tar.gz && \ # Install Go and glide WORKDIR /home -RUN wget -qO- https://paddle-ci.cdn.bcebos.com/go1.8.1.linux-amd64.tar.gz | \ +RUN wget -qO- https://paddle-ci.gz.bcebos.com/go1.17.2.linux-amd64.tar.gz | \ tar -xz -C /usr/local && \ mkdir /root/gopath && \ mkdir /root/gopath/bin && \ @@ -146,7 +147,8 @@ RUN apt-get install libprotobuf-dev -y # https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa # So install a newer version here. RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \ - dpkg -i patchelf_0.10-2_amd64.deb + dpkg -i patchelf_0.10-2_amd64.deb && \ + rm -rf patchelf_0.10-2_amd64.deb # Configure OpenSSH server. c.f. 
https://docs.docker.com/engine/examples/running_ssh_service RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config @@ -157,7 +159,8 @@ RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \ ./configure -prefix=/usr/local/ccache-3.7.9 && \ make -j8 && make install && \ - ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache + ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache && \ + cd ../ && rm -rf ccache-3.7.9 ccache-3.7.9.tar.gz EXPOSE 22 diff --git a/tools/dockerfile/Dockerfile.release18 b/tools/dockerfile/Dockerfile.release18 index b7e13bb2a3e86..d646f41b00d0b 100644 --- a/tools/dockerfile/Dockerfile.release18 +++ b/tools/dockerfile/Dockerfile.release18 @@ -1,7 +1,7 @@ # A image for building paddle binaries # Use cuda devel base image for both cpu and gpu environment # When you modify it, please be aware of cudnn-runtime version -FROM nvidia/cuda: +FROM MAINTAINER PaddlePaddle Authors # ENV variables @@ -11,7 +11,7 @@ ARG WITH_AVX ENV WITH_GPU=${WITH_GPU:-ON} ENV WITH_AVX=${WITH_AVX:-ON} ENV DEBIAN_FRONTEND=noninteractive -ENV LD_LIBRARY_PATH=/usr/local/cuda-11.2/targets/x86_64-linux/lib:$LD_LIBRARY_PATH + ENV HOME /root # Add bash enhancements @@ -23,6 +23,7 @@ RUN apt-get update && \ apt-get install -y curl wget vim git unzip unrar tar xz-utils libssl-dev bzip2 gzip \ coreutils ntp language-pack-zh-hans python-qt4 libsm6 libxext6 libxrender-dev libgl1-mesa-glx \ bison graphviz libjpeg-dev zlib1g-dev automake locales swig net-tools libtool module-init-tools + # Downgrade gcc&&g++ WORKDIR /usr/bin @@ -72,7 +73,7 @@ RUN wget -q https://ftp.gnu.org/gnu/binutils/binutils-2.33.1.tar.gz && \ # Install Go and glide -RUN wget -qO- https://paddle-ci.cdn.bcebos.com/go1.8.1.linux-amd64.tar.gz | \ +RUN wget -qO- https://paddle-ci.gz.bcebos.com/go1.17.2.linux-amd64.tar.gz | \ tar -xz -C /usr/local && \ mkdir /root/gopath && \ mkdir /root/gopath/bin && \ @@ -103,7 +104,8 @@ RUN pip3.7 --no-cache-dir install -r /root/requirements.txt # https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa # So install a newer version here. RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \ - dpkg -i patchelf_0.10-2_amd64.deb + dpkg -i patchelf_0.10-2_amd64.deb && \ + rm -rf patchelf_0.10-2_amd64.deb # Configure OpenSSH server. c.f. 
https://docs.docker.com/engine/examples/running_ssh_service #RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config @@ -114,7 +116,8 @@ RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \ ./configure -prefix=/usr/local/ccache-3.7.9 && \ make -j8 && make install && \ - ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache + ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache && \ + cd ../ && rm -rf ccache-3.7.9 ccache-3.7.9.tar.gz # clang-form 3.8.0 RUN wget https://paddle-ci.cdn.bcebos.com/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-16.04.tar.xz && \ diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu index df863cd893c19..000e7098570bb 100644 --- a/tools/dockerfile/Dockerfile.ubuntu +++ b/tools/dockerfile/Dockerfile.ubuntu @@ -15,8 +15,9 @@ ENV HOME /root # Add bash enhancements COPY paddle/scripts/docker/root/ /root/ +RUN chmod 777 /tmp # Prepare packages for Python -RUN apt-get update && \ +RUN apt-get update --allow-unauthenticated && \ apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \ libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \ xz-utils tk-dev libffi-dev liblzma-dev @@ -45,7 +46,7 @@ RUN wget -q https://cmake.org/files/v3.16/cmake-3.16.0-Linux-x86_64.tar.gz && ta ENV PATH=/home/cmake-3.16.0-Linux-x86_64/bin:$PATH # Install Python3.6 -RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \ +RUN mkdir -p /root/python_build/ && wget -q https://paddle-ci.gz.bcebos.com/sqlite-autoconf-3250300.tar.gz && \ tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \ ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \ wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \ @@ -123,7 +124,7 @@ RUN rm Python-$version.tgz setuptools-40.6.2.zip setuptools-50.3.2.zip pip-20.0. # Install Go and glide WORKDIR /home -RUN wget -qO- https://paddle-ci.gz.bcebos.com/go1.15.12.linux-amd64.tar.gz | \ +RUN wget --no-check-certificate -qO- https://paddle-ci.gz.bcebos.com/go1.17.2.linux-amd64.tar.gz | \ tar -xz -C /usr/local && \ mkdir /root/gopath && \ mkdir /root/gopath/bin && \ @@ -199,8 +200,7 @@ COPY ./python/requirements.txt /root/ RUN pip3.6 --no-cache-dir install -r /root/requirements.txt && \ pip3.7 --no-cache-dir install -r /root/requirements.txt && \ pip3.8 --no-cache-dir install -r /root/requirements.txt && \ - pip3.9 --no-cache-dir install -r /root/requirements.txt && \ - pip --no-cache-dir install -r /root/requirements.txt + pip3.9 --no-cache-dir install -r /root/requirements.txt # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 @@ -229,7 +229,8 @@ RUN apt-get install libprotobuf-dev -y # https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa # So install a newer version here. RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \ - dpkg -i patchelf_0.10-2_amd64.deb + dpkg -i patchelf_0.10-2_amd64.deb && \ + rm -rf patchelf_0.10-2_amd64.deb # Configure OpenSSH server. c.f. 
https://docs.docker.com/engine/examples/running_ssh_service RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config @@ -240,7 +241,8 @@ RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \ ./configure -prefix=/usr/local/ccache-3.7.9 && \ make -j8 && make install && \ - ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache + ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache && \ + cd ../ && rm -rf ccache-3.7.9 ccache-3.7.9.tar.gz EXPOSE 22 diff --git a/tools/dockerfile/Dockerfile.ubuntu18 b/tools/dockerfile/Dockerfile.ubuntu18 index a4a445e6db214..57c042f127a54 100644 --- a/tools/dockerfile/Dockerfile.ubuntu18 +++ b/tools/dockerfile/Dockerfile.ubuntu18 @@ -1,7 +1,7 @@ # A image for building paddle binaries # Use cuda devel base image for both cpu and gpu environment # When you modify it, please be aware of cudnn-runtime version -FROM nvidia/cuda: +FROM MAINTAINER PaddlePaddle Authors # ENV variables @@ -11,18 +11,20 @@ ARG WITH_AVX ENV WITH_GPU=${WITH_GPU:-ON} ENV WITH_AVX=${WITH_AVX:-ON} ENV DEBIAN_FRONTEND=noninteractive -ENV LD_LIBRARY_PATH=/usr/local/cuda-11.2/targets/x86_64-linux/lib:$LD_LIBRARY_PATH + ENV HOME /root # Add bash enhancements COPY paddle/scripts/docker/root/ /root/ -RUN apt-get update && \ +RUN chmod 777 /tmp +RUN apt-get update --allow-unauthenticated && \ apt-get install -y software-properties-common && add-apt-repository ppa:deadsnakes/ppa && \ apt-get update && \ apt-get install -y curl wget vim git unzip unrar tar xz-utils libssl-dev bzip2 gzip \ coreutils ntp language-pack-zh-hans python-qt4 libsm6 libxext6 libxrender-dev libgl1-mesa-glx \ bison graphviz libjpeg-dev zlib1g-dev automake locales swig net-tools libtool module-init-tools + # Downgrade gcc&&g++ WORKDIR /usr/bin @@ -49,7 +51,7 @@ RUN apt-get update && \ python3.7 python3.7-dev \ python3.8 python3.8-dev python3.8-distutils \ python3.9 python3.9-dev python3.9-distutils && \ - rm /usr/bin/python && ln -s /usr/bin/python2.7 /usr/bin/python && \ + rm /usr/bin/python && ln -s /usr/bin/python3.7 /usr/bin/python && \ rm /usr/bin/python3 && ln -s /usr/bin/python3.7 /usr/bin/python3 @@ -63,11 +65,11 @@ RUN python3.9 setup.py build && python3.9 setup.py install && \ WORKDIR /home RUN wget https://files.pythonhosted.org/packages/b0/d1/8acb42f391cba52e35b131e442e80deffbb8d0676b93261d761b1f0ef8fb/setuptools-40.6.2.zip && apt-get -y install unzip && unzip setuptools-40.6.2.zip WORKDIR /home/setuptools-40.6.2 -RUN python setup.py build && python setup.py install +RUN python2.7 setup.py build && python2.7 setup.py install WORKDIR /home RUN wget https://files.pythonhosted.org/packages/28/af/2c76c8aa46ccdf7578b83d97a11a2d1858794d4be4a1610ade0d30182e8b/pip-20.0.1.tar.gz && tar -zxvf pip-20.0.1.tar.gz WORKDIR pip-20.0.1 -RUN python setup.py install && \ +RUN python2.7 setup.py install && \ python3.9 setup.py install && \ python3.8 setup.py install && \ python3.7 setup.py install && \ @@ -76,7 +78,7 @@ RUN python setup.py install && \ WORKDIR /home RUN rm setuptools-40.6.2.zip setuptools-50.3.2.zip pip-20.0.1.tar.gz && \ rm -r setuptools-40.6.2 setuptools-50.3.2 pip-20.0.1 -RUN rm /usr/local/bin/pip && ln -s /usr/local/bin/pip2.7 /usr/local/bin/pip && \ +RUN rm /usr/local/bin/pip && ln -s /usr/local/bin/pip3.7 /usr/local/bin/pip && \ rm /usr/local/bin/pip3 && ln -s 
/usr/local/bin/pip3.7 /usr/local/bin/pip3 @@ -88,7 +90,7 @@ RUN wget -q https://ftp.gnu.org/gnu/binutils/binutils-2.33.1.tar.gz && \ # Install Go and glide -RUN wget -qO- https://paddle-ci.cdn.bcebos.com/go1.8.1.linux-amd64.tar.gz | \ +RUN wget --no-check-certificate -qO- https://paddle-ci.gz.bcebos.com/go1.17.2.linux-amd64.tar.gz | \ tar -xz -C /usr/local && \ mkdir /root/gopath && \ mkdir /root/gopath/bin && \ @@ -113,29 +115,29 @@ RUN pip3.6 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ pip3.8 --no-cache-dir install ipykernel==4.6.0 wheel && \ pip3.9 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ pip3.9 --no-cache-dir install ipykernel==4.6.0 wheel && \ - pip --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ - pip --no-cache-dir install ipykernel==4.6.0 wheel + pip2.7 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ + pip2.7 --no-cache-dir install ipykernel==4.6.0 wheel #For docstring checker RUN pip3.6 --no-cache-dir install pylint pytest astroid isort && \ pip3.7 --no-cache-dir install pylint pytest astroid isort && \ pip3.8 --no-cache-dir install pylint pytest astroid isort && \ pip3.9 --no-cache-dir install pylint pytest astroid isort && \ - pip --no-cache-dir install pylint pytest astroid isort + pip2.7 --no-cache-dir install pylint pytest astroid isort COPY ./python/requirements.txt /root/ RUN pip3.6 --no-cache-dir install -r /root/requirements.txt && \ pip3.7 --no-cache-dir install -r /root/requirements.txt && \ pip3.8 --no-cache-dir install -r /root/requirements.txt && \ - pip3.9 --no-cache-dir install -r /root/requirements.txt && \ - pip --no-cache-dir install -r /root/requirements.txt + pip3.9 --no-cache-dir install -r /root/requirements.txt # Older versions of patchelf limited the size of the files being processed and were fixed in this pr. # https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa # So install a newer version here. RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \ - dpkg -i patchelf_0.10-2_amd64.deb + dpkg -i patchelf_0.10-2_amd64.deb && \ + rm -rf patchelf_0.10-2_amd64.deb # Configure OpenSSH server. c.f. 
https://docs.docker.com/engine/examples/running_ssh_service #RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config @@ -146,7 +148,8 @@ RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \ ./configure -prefix=/usr/local/ccache-3.7.9 && \ make -j8 && make install && \ - ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache + ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache && \ + cd ../ && rm -rf ccache-3.7.9 ccache-3.7.9.tar.gz # clang-form 3.8.0 RUN wget https://paddle-ci.cdn.bcebos.com/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-16.04.tar.xz && \ diff --git a/tools/dockerfile/build_scripts/build.sh b/tools/dockerfile/build_scripts/build.sh index 393bd045fb7f8..92d1c12d2bc41 100644 --- a/tools/dockerfile/build_scripts/build.sh +++ b/tools/dockerfile/build_scripts/build.sh @@ -149,7 +149,7 @@ LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}" # According to ar issues: https://lists.gnu.org/archive/html/bug-binutils/2016-05/msg00211.html # we should install new version ar with 64-bit supported here -wget https://ftp.gnu.org/gnu/binutils/binutils-2.27.tar.gz +wget --no-check-certificate https://ftp.gnu.org/gnu/binutils/binutils-2.27.tar.gz tar xzf binutils-2.27.tar.gz && cd binutils-2.27 ./configure --prefix=/opt/rh/devtoolset-2/root/usr/ --enable-64-bit-archive && make -j `nproc` && make install cd .. && rm binutils-2.27.tar.gz && rm -rf binutils-2.27 diff --git a/tools/dockerfile/build_scripts/build_utils.sh b/tools/dockerfile/build_scripts/build_utils.sh index 18dda5be460d9..a714fd11ad439 100755 --- a/tools/dockerfile/build_scripts/build_utils.sh +++ b/tools/dockerfile/build_scripts/build_utils.sh @@ -66,7 +66,7 @@ function do_cpython_build { # -Wformat added for https://bugs.python.org/issue17547 on Python 2.6 if [ $(lex_pyver $py_ver) -ge $(lex_pyver 3.6) ]; then - wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz + wget -q --no-check-certificate https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz tar -zxf sqlite-autoconf-3250300.tar.gz cd sqlite-autoconf-3250300 ./configure --prefix=/usr/local diff --git a/tools/dockerfile/build_scripts/install_gcc.sh b/tools/dockerfile/build_scripts/install_gcc.sh index 0edd09a99ecb4..d18dd7a301dc1 100644 --- a/tools/dockerfile/build_scripts/install_gcc.sh +++ b/tools/dockerfile/build_scripts/install_gcc.sh @@ -38,7 +38,7 @@ if [ "$1" == "gcc82" ]; then cd .. && mkdir temp_gcc82 && cd temp_gcc82 && \ ../gcc-8.2.0/configure --prefix=/usr/local/gcc-8.2 --enable-threads=posix --disable-checking --disable-multilib && \ make -j8 && make install - cd .. && rm -rf temp_gcc82 + cd .. && rm -rf temp_gcc82 gcc-8.2.0 gcc-8.2.0.tar.xz cp ${lib_so_6} ${lib_so_6}.bak && rm -f ${lib_so_6} && ln -s /usr/local/gcc-8.2/lib64/libgfortran.so.5 ${lib_so_5} && \ ln -s /usr/local/gcc-8.2/lib64/libstdc++.so.6 ${lib_so_6} && \ @@ -52,7 +52,7 @@ elif [ "$1" == "gcc54" ]; then cd .. && mkdir temp_gcc54 && cd temp_gcc54 && \ ../gcc-5.4.0/configure --prefix=/usr/local/gcc-5.4 --enable-checking=release --enable-languages=c,c++ --disable-multilib && \ make -j8 && make install - cd .. && rm -rf temp_gcc54 + cd .. 
&& rm -rf temp_gcc54 gcc-5.4.0 gcc-5.4.0.tar.gz cp ${lib_so_6} ${lib_so_6}.bak && rm -f ${lib_so_6} && ln -s /usr/local/gcc-5.4/lib64/libgfortran.so.5 ${lib_so_5} && \ ln -s /usr/local/gcc-5.4/lib64/libstdc++.so.6 ${lib_so_6} && \ diff --git a/tools/dockerfile/build_scripts/install_nccl2.sh b/tools/dockerfile/build_scripts/install_nccl2.sh index 07f186f3d4e8c..6d44dbb90542f 100644 --- a/tools/dockerfile/build_scripts/install_nccl2.sh +++ b/tools/dockerfile/build_scripts/install_nccl2.sh @@ -17,7 +17,7 @@ VERSION=$(nvcc --version | grep release | grep -oEi "release ([0-9]+)\.([0-9])"| sed "s/release //") if [ "$VERSION" == "10.0" ]; then DEB="nccl-repo-ubuntu1604-2.4.7-ga-cuda10.0_1-1_amd64.deb" -elif [ "$VERSION" == "10.2" ] || [ "$VERSION" == "10.1" ] || [ "$VERSION" == "11.0" ] || [ "$VERSION" == "11.2" ]; then +elif [ "$VERSION" == "10.2" ] || [ "$VERSION" == "10.1" ] || [ "$VERSION" == "11.0" ] || [ "$VERSION" == "11.2" ] || [ "$VERSION" == "11.3" ] || [ "$VERSION" == "11.4" ] || [ "$VERSION" == "11.5" ] || [ "$VERSION" == "11.6" ]; then if [ -f "/etc/redhat-release" ];then rm -f /usr/local/lib/libnccl.so wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-2.7.8-1+cuda10.2.x86_64.rpm diff --git a/tools/dockerfile/centos7_manylinux.sh b/tools/dockerfile/centos7_manylinux.sh index 6038e464097cd..47f64ec340b6d 100755 --- a/tools/dockerfile/centos7_manylinux.sh +++ b/tools/dockerfile/centos7_manylinux.sh @@ -100,6 +100,26 @@ function make_cuda112cudnn821trt8034gcc54() { sed -i '/CMD/iRUN ldconfig' Dockerfile.tmp } +function make_cuda113cudnn8() { + sed 's//11.3.1-cudnn8-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp +} + +function make_cuda114cudnn8() { + sed 's//11.4.3-cudnn8-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp +} + +function make_cuda115cudnn8() { + sed 's//11.5.1-cudnn8-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp +} + +function make_cuda116cudnn8() { + sed 's//11.6.2-cudnn8-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp +} + function main() { local CMD=$1 case $CMD in @@ -145,6 +165,18 @@ function main() { cuda112cudnn821trt8034gcc54) make_cuda112cudnn821trt8034gcc54 ;; + cuda113cudnn8) + make_cuda113cudnn8 + ;; + cuda114cudnn8) + make_cuda114cudnn8 + ;; + cuda115cudnn8) + make_cuda115cudnn8 + ;; + cuda116cudnn8) + make_cuda116cudnn8 + ;; *) echo "Make dockerfile error, Without this paramet." 
exit 1 diff --git a/tools/dockerfile/ci_dockerfile.sh b/tools/dockerfile/ci_dockerfile.sh index fd814b990161d..ed13ca8762500 100644 --- a/tools/dockerfile/ci_dockerfile.sh +++ b/tools/dockerfile/ci_dockerfile.sh @@ -21,7 +21,7 @@ function make_ubuntu_dockerfile(){ dockerfile_line=$(wc -l ${dockerfile_name}|awk '{print $1}') sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz \&\& \ tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} - sed -i "${dockerfile_line}i RUN apt remove git -y \&\& apt install -y libcurl4-openssl-dev gettext \&\& wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz \&\& \ + sed -i "${dockerfile_line}i RUN apt remove git -y \&\& apt install -y libcurl4-openssl-dev gettext zstd \&\& wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz \&\& \ tar -xvf git-2.17.1.tar.gz \&\& \ cd git-2.17.1 \&\& \ ./configure --with-openssl --with-curl --prefix=/usr/local \&\& \ @@ -61,7 +61,9 @@ function make_centos_dockerfile(){ function make_cinn_dockerfile(){ dockerfile_name="Dockerfile.cuda11_cudnn8_gcc82_ubuntu18_cinn" - sed "s//11.2.0-cudnn8-devel-ubuntu18.04/g" ./Dockerfile.ubuntu18 >${dockerfile_name} + sed "s##nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04#g" ./Dockerfile.ubuntu18 >${dockerfile_name} + sed -i "s##ENV LD_LIBRARY_PATH=/usr/local/cuda-11.2/targets/x86_64-linux/lib:\$LD_LIBRARY_PATH #g" ${dockerfile_name} + sed -i 's###g' ${dockerfile_name} sed -i "7i ENV TZ=Asia/Beijing" ${dockerfile_name} sed -i "8i RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone" ${dockerfile_name} sed -i "9i RUN apt-get update && apt-get install -y liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev" ${dockerfile_name} diff --git a/tools/dockerfile/ubuntu16_release.sh b/tools/dockerfile/ubuntu16_release.sh index 7e93bb34f9e31..27e69b68ab8eb 100755 --- a/tools/dockerfile/ubuntu16_release.sh +++ b/tools/dockerfile/ubuntu16_release.sh @@ -63,9 +63,9 @@ function ref_whl(){ elif [[ ${PADDLE_VERSION} == "develop" && ${WITH_GPU} != "ON" ]]; then ref_paddle37_whl=paddlepaddle${install_gpu}-${ref_dev}-cp37-cp37m-linux_x86_64.whl elif [[ ${PADDLE_VERSION} != "develop" && ${WITH_GPU} == "ON" ]]; then - ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}${ref_version}-cp37-cp37m-linux_x86_64.whl + ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION/-/}${ref_version}-cp37-cp37m-linux_x86_64.whl else - ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp37-cp37m-linux_x86_64.whl + ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION/-/}-cp37-cp37m-linux_x86_64.whl fi } diff --git a/tools/dockerfile/ubuntu18_dev.sh b/tools/dockerfile/ubuntu18_dev.sh index c72243ef0521e..cbe09243d7803 100755 --- a/tools/dockerfile/ubuntu18_dev.sh +++ b/tools/dockerfile/ubuntu18_dev.sh @@ -42,6 +42,8 @@ function ref_whl(){ ref_version=.post110 elif [[ ${ref_CUDA_MAJOR} == "11.2" ]];then ref_version=.post112 + elif [[ ${ref_CUDA_MAJOR} == "11.4" ]];then + ref_version=.post114 elif [[ ${ref_CUDA_MAJOR} == "10" ]];then ref_version=.post100 elif [[ ${ref_CUDA_MAJOR} == "10.1" ]];then @@ -96,6 +98,25 @@ function install_whl(){ } +function set_cuda_env(){ + if [[ ${WITH_GPU} == "ON" ]]; then + sed -i "s##ENV LD_LIBRARY_PATH=/usr/local/cuda-${ref_CUDA_MAJOR}/targets/x86_64-linux/lib:\$LD_LIBRARY_PATH #g" Dockerfile.tmp + else + sed -i 's###g' Dockerfile.tmp + fi +} + + +function install_package_for_cpu(){ + if [[ ${WITH_GPU} != "ON" ]]; then + sed -i 's##RUN apt-get 
update \ + RUN apt install -y make gcc g++ #g' Dockerfile.tmp + else + sed -i 's###g' Dockerfile.tmp + fi +} + + function install_gcc(){ if [ "${gcc_version}" == "8.2.0" ];then sed -i 's##WORKDIR /usr/bin \ @@ -118,12 +139,18 @@ function install_gcc(){ function make_dockerfile(){ - sed "s//${docker_name}/g" tools/dockerfile/Dockerfile.ubuntu18 >Dockerfile.tmp + if [[ ${WITH_GPU} == "ON" ]]; then + sed "s##nvidia/cuda:${docker_name}#g" tools/dockerfile/Dockerfile.ubuntu18 >Dockerfile.tmp + else + sed "s##${docker_name}#g" tools/dockerfile/Dockerfile.ubuntu18 >Dockerfile.tmp + fi } function main(){ make_dockerfile + set_cuda_env + install_package_for_cpu install_gcc ref_whl install_whl diff --git a/tools/dockerfile/ubuntu18_release.sh b/tools/dockerfile/ubuntu18_release.sh index 286cb9c6919a1..2c12d4b74c073 100755 --- a/tools/dockerfile/ubuntu18_release.sh +++ b/tools/dockerfile/ubuntu18_release.sh @@ -42,6 +42,8 @@ function ref_whl(){ ref_version=.post110 elif [[ ${ref_CUDA_MAJOR} == "11.2" ]];then ref_version=.post112 + elif [[ ${ref_CUDA_MAJOR} == "11.4" ]];then + ref_version=.post114 elif [[ ${ref_CUDA_MAJOR} == "10" ]];then ref_version=.post100 elif [[ ${ref_CUDA_MAJOR} == "10.1" ]];then @@ -63,9 +65,9 @@ function ref_whl(){ elif [[ ${PADDLE_VERSION} == "develop" && ${WITH_GPU} != "ON" ]]; then ref_paddle37_whl=paddlepaddle${install_gpu}-${ref_dev}-cp37-cp37m-linux_x86_64.whl elif [[ ${PADDLE_VERSION} != "develop" && ${WITH_GPU} == "ON" ]]; then - ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}${ref_version}-cp37-cp37m-linux_x86_64.whl + ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION/-/}${ref_version}-cp37-cp37m-linux_x86_64.whl else - ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp37-cp37m-linux_x86_64.whl + ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION/-/}-cp37-cp37m-linux_x86_64.whl fi } @@ -76,6 +78,25 @@ function install_whl(){ } +function set_cuda_env(){ + if [[ ${WITH_GPU} == "ON" ]]; then + sed -i "s##ENV LD_LIBRARY_PATH=/usr/local/cuda-${ref_CUDA_MAJOR}/targets/x86_64-linux/lib:\$LD_LIBRARY_PATH #g" Dockerfile.tmp + else + sed -i 's###g' Dockerfile.tmp + fi +} + + +function install_package_for_cpu(){ + if [[ ${WITH_GPU} != "ON" ]]; then + sed -i 's##RUN apt-get update \ + RUN apt install -y make gcc g++ #g' Dockerfile.tmp + else + sed -i 's###g' Dockerfile.tmp + fi +} + + function install_gcc(){ if [ "${gcc_version}" == "8.2.0" ];then sed -i 's##WORKDIR /usr/bin \ @@ -96,12 +117,18 @@ function install_gcc(){ function make_dockerfile(){ - sed "s//${docker_name}/g" tools/dockerfile/Dockerfile.release18 >Dockerfile.tmp + if [[ ${WITH_GPU} == "ON" ]]; then + sed "s##nvidia/cuda:${docker_name}#g" tools/dockerfile/Dockerfile.release18 >Dockerfile.tmp + else + sed "s##${docker_name}#g" tools/dockerfile/Dockerfile.release18 >Dockerfile.tmp + fi } function main(){ make_dockerfile + set_cuda_env + install_package_for_cpu install_gcc ref_whl install_whl diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index 799f80f139c9c..6b90a656f0107 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -35,7 +35,7 @@ class PRChecker(object): """ PR Checker. 
""" def __init__(self): - self.github = Github(timeout=60) + self.github = Github(os.getenv('GITHUB_API_TOKEN'), timeout=60) self.repo = self.github.get_repo('PaddlePaddle/Paddle') self.py_prog_oneline = re.compile('\d+\|\s*#.*') self.py_prog_multiline_a = re.compile('\d+\|\s*r?""".*?"""', re.DOTALL) diff --git a/tools/get_ut_mem_map.py b/tools/get_ut_mem_map.py index daf80597d3ad0..745d7f9a90c24 100644 --- a/tools/get_ut_mem_map.py +++ b/tools/get_ut_mem_map.py @@ -34,8 +34,8 @@ def get_ut_mem(rootPath): if '[Memory Usage (Byte)] gpu' in line: mem_reserved = round( float( - line.split('[max memory reserved] gpu')[1].split( - ':')[1].split('\\n')[0].strip()), 2) + line.split(' : Reserved = ')[1].split( + ', Allocated = ')[0]), 2) if mem_reserved > mem_reserved1: mem_reserved1 = mem_reserved if 'MAX_GPU_MEMORY_USE=' in line: diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index aaa667595f94c..5070ea2ef06a3 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -170,6 +170,7 @@ 'test_elementwise_div_op', 'test_elementwise_floordiv_op', 'test_elementwise_gradient_op', + 'test_elementwise_heaviside_op', 'test_elementwise_max_op', 'test_elementwise_min_op', 'test_elementwise_mod_op', @@ -658,6 +659,7 @@ 'test_mkldnn_matmul_transpose_reshape_fuse_pass', 'test_mkldnn_scale_matmul_fuse_pass', 'test_mkldnn_inplace_fuse_pass', + 'test_mkldnn_conv_affine_channel_fuse_pass', 'test_batch_fc_op', 'test_c_comm_init_all_op', 'test_conv2d_fusion_op', diff --git a/tools/test_ci_op_benchmark.sh b/tools/test_ci_op_benchmark.sh index 0b2fff045ff3c..bf70d8bc3a495 100644 --- a/tools/test_ci_op_benchmark.sh +++ b/tools/test_ci_op_benchmark.sh @@ -319,7 +319,7 @@ function gpu_op_benchmark { # The PR will pass quickly when get approval from specific person. 
# Xreki 12538138, luotao1 6836917, ZzSean 32410583 set +x -approval_line=$(curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000) +approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000) if [ -n "${approval_line}" ]; then APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32410583 12538138 6836917) LOG "[INFO] current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" diff --git a/tools/test_runner.py b/tools/test_runner.py index 7ceed18634a87..02d926914f904 100644 --- a/tools/test_runner.py +++ b/tools/test_runner.py @@ -32,6 +32,7 @@ def main(): if core.is_compiled_with_cuda() or core.is_compiled_with_rocm(): if (os.getenv('FLAGS_enable_gpu_memory_usage_log') == None): os.environ['FLAGS_enable_gpu_memory_usage_log'] = 'true' + os.environ['FLAGS_enable_gpu_memory_usage_log_mb'] = 'false' some_test_failed = False for module_name in sys.argv[1:]: diff --git a/tools/windows/check_change_of_unittest.sh b/tools/windows/check_change_of_unittest.sh index 136e21e60415f..576f0e5d238ab 100644 --- a/tools/windows/check_change_of_unittest.sh +++ b/tools/windows/check_change_of_unittest.sh @@ -15,15 +15,16 @@ set -e set +x export PADDLE_ROOT="$(cd "$PWD/../" && pwd )" +GITHUB_API_TOKEN=$GITHUB_API_TOKEN GIT_PR_ID=$AGILE_PULL_ID BRANCH=$BRANCH -if [ "${GIT_PR_ID}" == "" ];then +if [ "${GITHUB_API_TOKEN}" == "" ] || [ "${GIT_PR_ID}" == "" ];then exit 0 fi unittest_spec_diff=$(cat $PADDLE_ROOT/deleted_ut | sed 's/^/ - /g') if [ "$unittest_spec_diff" != "" ]; then - approval_line=`curl https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` + approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 22165420 52485244 32428676 45041955` echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "FALSE" ]; then